Fast algorithm to spread bits of u8 to the LSBs of each byte of a u64
Looking for bit twiddling insights to optimize an algorithm to spread the bits of an 8-bit integer to the LSB of each byte of a 64-bit integer. Example:
0b10110011 -> 0x0100010100000101
The best I've come up with so far is:
fn spread(x: u8) -> u64 {
    let x = x as u64;
    // Replicate x into every byte, then keep only bit i within byte i.
    let y = (x * 0x0101010101010101) & 0x8040201008040201;
    // Shift each surviving bit down to the LSB of its byte.
    (y | (y >> 1) | (y >> 2) | (y >> 3) | (y >> 4) | (y >> 5) | (y >> 6) | (y >> 7))
        & 0x0101010101010101
}
This compiles to branchless, but still quite long, code:
movzx eax, dil
movabs rcx, 72340172838076673
imul rax, rcx
movabs rdx, -9205322385119247871
and rdx, rax
mov rsi, rdx
mov rdi, rdx
mov r8, rdx
mov r9, rdx
mov r10, rdx
mov rax, rdx
shr rax, 7
or rax, rdx
shr rdx
shr rsi, 2
or rsi, rdx
shr rdi, 3
or rdi, rsi
shr r8, 4
or r8, rdi
shr r9, 5
or r9, r8
shr r10, 6
or r10, r9
or rax, r10
and rax, rcx
ret
Clearly, the many shifts account for most of the instructions. Clever ideas to reduce the computation needed?
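For reference, the expected behaviour from the example above as a quick check (a minimal sketch, assuming a standard #[test] harness):

#[test]
fn spread_example() {
    // 0b10110011 -> 0x0100010100000101, one input bit per output byte.
    assert_eq!(spread(0b1011_0011), 0x0100_0101_0000_0101);
}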
2 Answers
SWAR?
(((x&0x55) * 0x02040810204081LL) | ((x&0xAA) * 0x02040810204081LL)) & 0x0101010101010101LL
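A hedged Rust translation of this SWAR idea (spread_swar is an illustrative name, not from the answer): the constant 0x0002040810204081 has bits at positions 0, 7, 14, ..., 49, so multiplying an 8-bit value by it lays down copies of that value shifted by every multiple of 7, and bit i of the input lands (among other places) on bit position 8*i. Splitting the input into even and odd bits keeps bits 0 and 7 in separate products, so no column ever receives two contributions and the multiplications are carry-free.

fn spread_swar(x: u8) -> u64 {
    // Copies of the input shifted by 0, 7, 14, ..., 49; bit i ends up at 8*i.
    const M: u64 = 0x0002_0408_1020_4081;
    const LSB: u64 = 0x0101_0101_0101_0101;
    // Even and odd bits are multiplied separately so bits 0 and 7 never share
    // a product; otherwise their shifted copies would collide and carry.
    let even = (x & 0x55) as u64;
    let odd = (x & 0xAA) as u64;
    ((even * M) | (odd * M)) & LSB
}

Two multiplications plus a few masks should compile to noticeably fewer instructions than the shift cascade in the question.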
PDEP is a BMI2 instruction (available from Intel Haswell and AMD Excavator processors onward) that is intended exactly for your task.
Delphi asm to check it; if you can use intrinsics, the equivalent is _pdep_u64:
function SpreadByte(src, mask: UInt64): UInt64;
asm
  pdep rax, src, mask
end;

procedure TForm2.Button22Click(Sender: TObject);
var
  src, dst, mask: UInt64;
begin
  src := %10110011;            // 0b10110011
  mask := $0101010101010101;   // 0x0101010101010101
  dst := SpreadByte(src, mask);
  Memo1.Lines.Add(IntToHex(dst));
end;
Result
0100010100000101
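In Rust, the question's language, the same instruction is exposed as the _pdep_u64 intrinsic in std::arch::x86_64. A minimal sketch (function name and feature gating are illustrative additions):

#[cfg(target_arch = "x86_64")]
fn spread_pdep(x: u8) -> u64 {
    // Assumption: this path is only reached on CPUs with BMI2. Gate the call
    // with is_x86_feature_detected!("bmi2") (or build with
    // -C target-feature=+bmi2) and fall back to a pure bit-twiddling version
    // otherwise.
    unsafe { std::arch::x86_64::_pdep_u64(x as u64, 0x0101_0101_0101_0101) }
}

With the mask 0x0101010101010101, PDEP deposits the low 8 bits of x directly into the LSB of each byte, which is exactly the requested transformation. Note that PDEP is microcoded and slow on AMD Zen 1 and Zen 2, so the SWAR version may still win on those CPUs.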