diff --git a/library/src/blas3/Tensile/Logic/asm_full/aldebaran/aldebaran_Cijk_Ailk_Bljk_I8II_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/aldebaran/aldebaran_Cijk_Ailk_Bljk_I8II_BH.yaml index 286a43509..6867b044b 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/aldebaran/aldebaran_Cijk_Ailk_Bljk_I8II_BH.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/aldebaran/aldebaran_Cijk_Ailk_Bljk_I8II_BH.yaml @@ -17724,6 +17724,1086 @@ _WorkspaceSizePerElemC: 0 _staggerStrideShift: 1 allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 64 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 + LdsInitCVgprs: false + LdsNumElements: 24640 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4352 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 64 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT64x64x64_MI32x32x8x1_SN_EPS0_LBSPP256_LPA0_LPB4_LRVW4_PGR2_SS1_SU32_SUM1_SUS128_SRVW0_SSO0_SVW1_WSGRA1_WSGRB0 + SourceSwap: true + StaggerU: 32 + StaggerUMapping: 1 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [1, 32] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 64 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 64 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 + LdsInitCVgprs: false + LdsNumElements: 24640 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4352 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 64 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT64x64x64_MI32x32x8x1_SN_EPS0_LBSPP256_LPA0_LPB4_LRVW4_PGR2_SS1_SU32_SUM2_SUS128_SRVW0_SSO0_SVW1_WSGRA1_WSGRB1 + SourceSwap: true + StaggerU: 32 + StaggerUMapping: 2 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [1, 32] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 64 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 64 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 64 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT64x64x64_MI32x32x8x1_SN_EPS1_LBSPP128_LPA0_LPB0_LRVW8_PGR1_SS1_SU0_SUM0_SUS0_SRVW0_SSO0_SVW1_WSGRA0_WSGRB1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [1, 32] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 64 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 64 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 64 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT64x64x64_MI32x32x8x1_SN_EPS1_LBSPP128_LPA0_LPB0_LRVW8_PGR1_SS1_SU0_SUM0_SUS0_SRVW0_SSO0_SVW1_WSGRA1_WSGRB0 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [1, 32] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 64 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false - [2, 3, 0, 1] - - - [35, 35, 1, 512] - [0, 0.027] @@ -18035,4 +19115,14 @@ - [65, 80.631] - - [90112, 90112, 1, 2048] - [66, 79.07] + - - [3025, 256, 1, 64] + - [67, 3.051] + - - [3024, 256, 1, 64] + - [68, 3.001] + - - [3024, 256, 1, 128] + - [69, 5.554] + - - [3072, 256, 1, 64] + - [70, 3.416] + - - [3072, 256, 1, 128] + - [69, 6.13] - null diff --git a/library/src/blas3/Tensile/Logic/asm_full/aldebaran/aldebaran_Cijk_Alik_Bljk_I8II_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/aldebaran/aldebaran_Cijk_Alik_Bljk_I8II_BH.yaml index 5140a79b1..b50d85616 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/aldebaran/aldebaran_Cijk_Alik_Bljk_I8II_BH.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/aldebaran/aldebaran_Cijk_Alik_Bljk_I8II_BH.yaml @@ -17286,6 +17286,1356 @@ _WorkspaceSizePerElemC: 0 _staggerStrideShift: 2 allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 64 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 64 + LdsBlockSizePerPadA: 64 + LdsBlockSizePerPadB: 64 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 64 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 3 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT64x64x64_MI32x32x8x1_SN_1LDSB0_AMAS0_GRVW8_LBSPP64_LRVW16_NTC3_NTD3_SS1_SVW1_TT1_32_VW1_WSGRA0_WSGRB1_WG64_4_1_WGM4 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [1, 32] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 64 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 64 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 3 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT64x64x64_MI32x32x8x1_SN_1LDSB0_AMAS3_GRVW8_LBSPP0_LRVW16_NTC0_NTD3_SS0_SVW4_TT1_32_VW2_WSGRA0_WSGRB0_WG64_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [1, 32] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 64 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 64 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 1 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT64x64x64_MI32x32x8x1_SN_1LDSB0_AMAS0_GRVW8_LBSPP0_LRVW16_NTC3_NTD1_SS1_SVW1_TT1_32_VW1_WSGRA0_WSGRB1_WG64_4_1_WGM4 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [1, 32] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 64 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 64 + LdsBlockSizePerPadA: 64 + LdsBlockSizePerPadB: 64 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 64 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 3 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT64x64x64_MI32x32x8x1_SN_1LDSB0_AMAS0_GRVW8_LBSPP64_LRVW16_NTC0_NTD3_SS1_SVW1_TT1_32_VW1_WSGRA0_WSGRB0_WG64_4_1_WGM4 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [1, 32] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 128 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 128 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 3 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT64x64x128_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW8_LBSPP0_LRVW16_NTC3_NTD3_SS1_SVW1_TT2_32_VW1_WSGRA1_WSGRB1_WG32_8_1_WGM1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [2, 32] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 128 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false - [2, 3, 0, 1] - - - [35, 35, 1, 512] - [0, 0.033] @@ -17597,4 +18947,14 @@ - [59, 85.696] - - [90112, 90112, 1, 2048] - [64, 84.493] + - - [3025, 256, 1, 64] + - [65, 3.426] + - - [3024, 256, 1, 64] + - [66, 3.363] + - - [3024, 256, 1, 128] + - [67, 6.491] + - - [3072, 256, 1, 64] + - [68, 3.78] + - - [3072, 256, 1, 128] + - [69, 7.364] - null diff --git a/library/src/blas3/Tensile/Logic/asm_full/aldebaran_104cu/aldebaran_Cijk_Ailk_Bljk_I8II_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/aldebaran_104cu/aldebaran_Cijk_Ailk_Bljk_I8II_BH.yaml index 46140a6ff..84be87047 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/aldebaran_104cu/aldebaran_Cijk_Ailk_Bljk_I8II_BH.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/aldebaran_104cu/aldebaran_Cijk_Ailk_Bljk_I8II_BH.yaml @@ -16137,6 +16137,1086 @@ _WorkspaceSizePerElemC: 0 _staggerStrideShift: 1 allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 64 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 + LdsInitCVgprs: false + LdsNumElements: 24640 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4352 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 64 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT64x64x64_MI32x32x8x1_SN_EPS0_LBSPP256_LPA0_LPB4_LRVW4_PGR2_SS1_SU32_SUM1_SUS128_SRVW0_SSO0_SVW1_WSGRA1_WSGRB0 + SourceSwap: true + StaggerU: 32 + StaggerUMapping: 1 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [1, 32] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 64 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 64 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 + LdsInitCVgprs: false + LdsNumElements: 24640 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4352 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 64 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT64x64x64_MI32x32x8x1_SN_EPS0_LBSPP256_LPA0_LPB4_LRVW4_PGR2_SS1_SU32_SUM2_SUS128_SRVW0_SSO0_SVW1_WSGRA1_WSGRB1 + SourceSwap: true + StaggerU: 32 + StaggerUMapping: 2 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [1, 32] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 64 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 64 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 64 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT64x64x64_MI32x32x8x1_SN_EPS1_LBSPP128_LPA0_LPB0_LRVW8_PGR1_SS1_SU0_SUM0_SUS0_SRVW0_SSO0_SVW1_WSGRA0_WSGRB1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [1, 32] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 64 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 64 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 64 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT64x64x64_MI32x32x8x1_SN_EPS1_LBSPP128_LPA0_LPB0_LRVW8_PGR1_SS1_SU0_SUM0_SUS0_SRVW0_SSO0_SVW1_WSGRA1_WSGRB0 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [1, 32] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 64 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false - [2, 3, 0, 1] - - - [35, 35, 1, 512] - [0, 0.027] @@ -16448,4 +17528,14 @@ - [58, 80.631] - - [90112, 90112, 1, 2048] - [59, 79.07] + - - [3025, 256, 1, 64] + - [60, 3.051] + - - [3024, 256, 1, 64] + - [61, 3.001] + - - [3024, 256, 1, 128] + - [62, 5.554] + - - [3072, 256, 1, 64] + - [63, 3.416] + - - [3072, 256, 1, 128] + - [62, 6.13] - null diff --git a/library/src/blas3/Tensile/Logic/asm_full/aldebaran_104cu/aldebaran_Cijk_Alik_Bljk_I8II_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/aldebaran_104cu/aldebaran_Cijk_Alik_Bljk_I8II_BH.yaml index 98bd070e1..c2b045c5f 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/aldebaran_104cu/aldebaran_Cijk_Alik_Bljk_I8II_BH.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/aldebaran_104cu/aldebaran_Cijk_Alik_Bljk_I8II_BH.yaml @@ -16954,6 +16954,1356 @@ _WorkspaceSizePerElemC: 0 _staggerStrideShift: 2 allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 64 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 64 + LdsBlockSizePerPadA: 64 + LdsBlockSizePerPadB: 64 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 64 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 3 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT64x64x64_MI32x32x8x1_SN_1LDSB0_AMAS0_GRVW8_LBSPP64_LRVW16_NTC3_NTD3_SS1_SVW1_TT1_32_VW1_WSGRA0_WSGRB1_WG64_4_1_WGM4 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [1, 32] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 64 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 64 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 3 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT64x64x64_MI32x32x8x1_SN_1LDSB0_AMAS3_GRVW8_LBSPP0_LRVW16_NTC0_NTD3_SS0_SVW4_TT1_32_VW2_WSGRA0_WSGRB0_WG64_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [1, 32] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 64 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 64 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 1 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT64x64x64_MI32x32x8x1_SN_1LDSB0_AMAS0_GRVW8_LBSPP0_LRVW16_NTC3_NTD1_SS1_SVW1_TT1_32_VW1_WSGRA0_WSGRB1_WG64_4_1_WGM4 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [1, 32] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 64 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 64 + LdsBlockSizePerPadA: 64 + LdsBlockSizePerPadB: 64 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 64 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 3 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT64x64x64_MI32x32x8x1_SN_1LDSB0_AMAS0_GRVW8_LBSPP64_LRVW16_NTC0_NTD3_SS1_SVW1_TT1_32_VW1_WSGRA0_WSGRB0_WG64_4_1_WGM4 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [1, 32] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 128 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 10] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 128 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 3 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: false + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 9 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT64x64x128_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW8_LBSPP0_LRVW16_NTC3_NTD3_SS1_SVW1_TT2_32_VW1_WSGRA1_WSGRB1_WG32_8_1_WGM1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [2, 32] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 128 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false - [2, 3, 0, 1] - - - [35, 35, 1, 512] - [0, 0.033] @@ -17265,4 +18615,14 @@ - [57, 85.696] - - [90112, 90112, 1, 2048] - [62, 84.493] + - - [3025, 256, 1, 64] + - [63, 3.426] + - - [3024, 256, 1, 64] + - [64, 3.363] + - - [3024, 256, 1, 128] + - [65, 6.491] + - - [3072, 256, 1, 64] + - [66, 3.78] + - - [3072, 256, 1, 128] + - [67, 7.364] - null