
Looking for bit-twiddling insights to optimize an algorithm that spreads the bits of an 8-bit integer to the LSB of each byte of a 64-bit integer. Example:

0b10110011 -> 0x0100010100000101

The best I've come up with so far is:

fn spread(x: u8) -> u64 {
    let x = x as u64;
    // Broadcast the byte into all eight bytes, then keep bit i of byte i.
    let y = (x * 0x0101010101010101) & 0x8040201008040201;
    // OR together all eight down-shifts so bit i of byte i reaches the LSB.
    (y | (y >> 1) | (y >> 2) | (y >> 3) | (y >> 4) | (y >> 5) | (y >> 6) | (y >> 7))
        & 0x0101010101010101
}
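
A quick check against the example mapping above:

    assert_eq!(spread(0b10110011), 0x0100010100000101);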

The spread function above compiles to branchless, but still quite long, code:

    movzx   eax, dil
    movabs  rcx, 72340172838076673
    imul    rax, rcx
    movabs  rdx, -9205322385119247871
    and rdx, rax
    mov rsi, rdx
    mov rdi, rdx
    mov r8, rdx
    mov r9, rdx
    mov r10, rdx
    mov rax, rdx
    shr rax, 7
    or  rax, rdx
    shr rdx
    shr rsi, 2
    or  rsi, rdx
    shr rdi, 3
    or  rdi, rsi
    shr r8, 4
    or  r8, rdi
    shr r9, 5
    or  r9, r8
    shr r10, 6
    or  r10, r9
    or  rax, r10
    and rax, rcx
    ret

Clearly, the many shifts account for most of the instructions. Clever ideas to reduce the computation needed?


asked Feb 1 at 7:50 by twig-froth, edited Feb 1 at 15:40

2 Answers

SWAR?

The multiplier 0x02040810204081 has a bit at every 7th position, so bit i of the source byte lands at position 8*i, where the final mask picks it up. Splitting the byte into its even and odd bits first keeps the partial products of each multiply from overlapping and carrying:

(((x&0x55) * 0x02040810204081LL) | ((x&0xAA) * 0x02040810204081LL)) & 0x0101010101010101LL
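
Translated to the question's Rust, this would look roughly like the following (a minimal sketch; the name spread_swar is mine):

    fn spread_swar(x: u8) -> u64 {
        let x = x as u64;
        // A bit at every 7th position: 0, 7, 14, ..., 49. Bit i of each
        // half therefore lands at position 8*i, the LSB of byte i.
        const M: u64 = 0x0002040810204081;
        // Spreading the even and odd bits separately keeps the partial
        // products of each multiply from colliding and carrying.
        (((x & 0x55) * M) | ((x & 0xAA) * M)) & 0x0101010101010101
    }

That is two multiplies, three ANDs, and an OR, far shorter than the shift cascade.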

There is a BMI2 instruction, PDEP (available since Intel Haswell and AMD Excavator processors), which is intended exactly for your task.

Here is Delphi asm to check it. If you can use intrinsics: _pdep_u64.

function SpreadByte(src, mask: UInt64): UInt64;
asm
   // deposit the low bits of src at the bit positions set in mask
   pdep rax, src, mask
end;

procedure TForm2.Button22Click(Sender: TObject);
var
  src, dst, mask: UInt64;
begin
   src := %10110011;  //0b10110011
   mask := $0101010101010101; //0x0101010101010101
   dst := SpreadByte(src, mask);
   Memo1.Lines.Add(IntToHex(dst));
end;

Result

0100010100000101
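
In Rust, the same instruction is exposed as the core::arch intrinsic _pdep_u64 (a sketch; the wrapper name and feature gating are mine, not from the answer):

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "bmi2")]
    unsafe fn spread_pdep(x: u8) -> u64 {
        // Deposit the 8 low bits of x at the positions set in the mask,
        // i.e. at the LSB of each byte.
        core::arch::x86_64::_pdep_u64(x as u64, 0x0101010101010101)
    }

Guard the call with is_x86_feature_detected!("bmi2") at runtime, or compile with -C target-feature=+bmi2.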
