
Looking for bit-twiddling insights to optimize an algorithm that spreads the bits of an 8-bit integer to the LSB of each byte of a 64-bit integer. Example:

0b10110011 -> 0x0100010100000101

The best I've come up with so far is:

fn spread(x: u8) -> u64 {
    let x = x as u64;
    // Broadcast the byte into all eight bytes, then keep bit i of byte i.
    let y = (x * 0x0101010101010101) & 0x8040201008040201;
    // OR together all eight down-shifts so bit i of byte i reaches the LSB.
    (y | (y >> 1) | (y >> 2) | (y >> 3) | (y >> 4) | (y >> 5) | (y >> 6) | (y >> 7))
        & 0x0101010101010101
}
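
A quick check against the example mapping above:

    assert_eq!(spread(0b10110011), 0x0100010100000101);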

The spread function above compiles to branchless, but still quite long, code:

    movzx   eax, dil
    movabs  rcx, 72340172838076673
    imul    rax, rcx
    movabs  rdx, -9205322385119247871
    and rdx, rax
    mov rsi, rdx
    mov rdi, rdx
    mov r8, rdx
    mov r9, rdx
    mov r10, rdx
    mov rax, rdx
    shr rax, 7
    or  rax, rdx
    shr rdx
    shr rsi, 2
    or  rsi, rdx
    shr rdi, 3
    or  rdi, rsi
    shr r8, 4
    or  r8, rdi
    shr r9, 5
    or  r9, r8
    shr r10, 6
    or  r10, r9
    or  rax, r10
    and rax, rcx
    ret

Clearly, the many shifts account for most of the instructions. Clever ideas to reduce the computation needed?


asked Feb 1 at 7:50 by twig-froth, edited Feb 1 at 15:40

2 Answers

SWAR?

The multiplier 0x02040810204081 has a bit at every 7th position, so bit i of the source byte lands at position 8*i, where the final mask picks it up. Splitting the byte into its even and odd bits first keeps the partial products of each multiply from overlapping and carrying:

(((x&0x55) * 0x02040810204081LL) | ((x&0xAA) * 0x02040810204081LL)) & 0x0101010101010101LL
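
Translated to the question's Rust, this would look roughly like the following (a minimal sketch; the name spread_swar is mine):

    fn spread_swar(x: u8) -> u64 {
        let x = x as u64;
        // A bit at every 7th position: 0, 7, 14, ..., 49. Bit i of each
        // half therefore lands at position 8*i, the LSB of byte i.
        const M: u64 = 0x0002040810204081;
        // Spreading the even and odd bits separately keeps the partial
        // products of each multiply from colliding and carrying.
        (((x & 0x55) * M) | ((x & 0xAA) * M)) & 0x0101010101010101
    }

That is two multiplies, three ANDs, and an OR, far shorter than the shift cascade.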

There is a BMI2 instruction, PDEP (available since Intel Haswell and AMD Excavator processors), which is intended exactly for your task.

Here is Delphi asm to check it. If you can use intrinsics: _pdep_u64.

function SpreadByte(src, mask: UInt64): UInt64;
asm
   // deposit the low bits of src at the bit positions set in mask
   pdep rax, src, mask
end;

procedure TForm2.Button22Click(Sender: TObject);
var
  src, dst, mask: UInt64;
begin
   src := %10110011;  //0b10110011
   mask := $0101010101010101; //0x0101010101010101
   dst := SpreadByte(src, mask);
   Memo1.Lines.Add(IntToHex(dst));
end;

Result

0100010100000101
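
In Rust, the same instruction is exposed as the core::arch intrinsic _pdep_u64 (a sketch; the wrapper name and feature gating are mine, not from the answer):

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "bmi2")]
    unsafe fn spread_pdep(x: u8) -> u64 {
        // Deposit the 8 low bits of x at the positions set in the mask,
        // i.e. at the LSB of each byte.
        core::arch::x86_64::_pdep_u64(x as u64, 0x0101010101010101)
    }

Guard the call with is_x86_feature_detected!("bmi2") at runtime, or compile with -C target-feature=+bmi2.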
