ok, so
https://gitlab.haskell.org/ghc/ghc/blob/4898df1cc25132dc9e2599d4fa4e1bbc9423cda5/compiler/GHC/StgToCmm/Prim.hs#L2480-2496

-- ----------------------------------------------------------------------------
-- Setting byte arrays

-- | Takes a 'MutableByteArray#', an offset into the array, a length,
-- and a byte, and sets each of the selected bytes in the array to the
-- character.
doSetByteArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                 -> FCode ()
doSetByteArrayOp ba off len c = do
    dflags <- getDynFlags

    let byteArrayAlignment = wordAlignment dflags -- known since BA is allocated on heap
        offsetAlignment = cmmExprAlignment off
        align = min byteArrayAlignment offsetAlignment

    p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba (arrWordsHdrSize dflags)) off
    emitMemsetCall p c len align

and emitMemsetCall is defined as

-- | Emit a call to @memset@.  The second argument must fit inside an
-- unsigned char.
emitMemsetCall :: CmmExpr -> CmmExpr -> CmmExpr -> Alignment -> FCode ()
emitMemsetCall dst c n align = do
    emitPrimCall
        [ {- no results -} ]
        (MO_Memset (alignmentBytes align))
        [ dst, c, n ]

then for known stuff we get

https://gitlab.haskell.org/ghc/ghc/blob/4898df1cc25132dc9e2599d4fa4e1bbc9423cda5/compiler/nativeGen/X86/CodeGen.hs#L2222-2293

genCCall' dflags _ (PrimTarget (MO_Memset align)) _
         [dst,
          CmmLit (CmmInt c _),
          CmmLit (CmmInt n _)]
         _
    | fromInteger insns <= maxInlineMemsetInsns dflags = do
        code_dst <- getAnyReg dst
        dst_r <- getNewRegNat format
        if format == II64 && n >= 8 then do
          code_imm8byte <- getAnyReg (CmmLit (CmmInt c8 W64))
          imm8byte_r <- getNewRegNat II64
          return $ code_dst dst_r `appOL`
                   code_imm8byte imm8byte_r `appOL`
                   go8 dst_r imm8byte_r (fromInteger n)
        else
          return $ code_dst dst_r `appOL`
                   go4 dst_r (fromInteger n)
  where
    maxAlignment = wordAlignment dflags -- only machine word wide MOVs are supported
    effectiveAlignment = min (alignmentOf align) maxAlignment
    format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment
    c2 = c `shiftL` 8 .|. c
    c4 = c2 `shiftL` 16 .|. c2
    c8 = c4 `shiftL` 32 .|. c4

    -- The number of instructions we will generate (approx). We need 1
    -- instructions per move.
    insns = (n + sizeBytes - 1) `div` sizeBytes

    -- The size of each move, in bytes.
    sizeBytes :: Integer
    sizeBytes = fromIntegral (formatInBytes format)

    -- Depending on size returns the widest MOV instruction and its
    -- width.
    gen4 :: AddrMode -> Integer -> (InstrBlock, Integer)
    gen4 addr size
        | size >= 4 =
            (unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr addr)), 4)
        | size >= 2 =
            (unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr addr)), 2)
        | size >= 1 =
            (unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr addr)), 1)
        | otherwise = (nilOL, 0)

    -- Generates a 64-bit wide MOV instruction from REG to MEM.
    gen8 :: AddrMode -> Reg -> InstrBlock
    gen8 addr reg8byte =
      unitOL (MOV format (OpReg reg8byte) (OpAddr addr))

    -- Unrolls memset when the widest MOV is <= 4 bytes.
    go4 :: Reg -> Integer -> InstrBlock
    go4 dst left =
      if left <= 0 then nilOL
      else curMov `appOL` go4 dst (left - curWidth)
      where
        possibleWidth = minimum [left, sizeBytes]
        dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))
        (curMov, curWidth) = gen4 dst_addr possibleWidth

    -- Unrolls memset when the widest MOV is 8 bytes (thus another Reg
    -- argument). Falls back to go4 when all 8 byte moves are
    -- exhausted.
    go8 :: Reg -> Reg -> Integer -> InstrBlock
    go8 dst reg8byte left =
      if possibleWidth >= 8 then
        let curMov = gen8 dst_addr reg8byte
        in  curMov `appOL` go8 dst reg8byte (left - 8)
      else go4 dst left
      where
        possibleWidth = minimum [left, sizeBytes]
        dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))

otherwise it falls back to the c function call on a given platform.

memset is terrible and not performant likely (for nonzeroing path) #265

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions