ふと、swapみたいな単純なコードでも標準ライブラリの実装を呼び出した方がいいよ、という例として
import core.int128 : Cent; import std.algorithm.mutation : swap; struct Huge { Cent[128] data; alias this = data; } void f(ref Huge x, ref Huge y) { swap(x, y); }
みたいなコードを使おうと思ったんだけど、ぜんぜんそんなことはなかった。
$ ldc2 -O -c swap.d
$ objdump -dr --demangle=dlang -Mintel swap.o
(...)
0000000000000000 <swap.f(ref swap.Huge, ref swap.Huge)>:
0: 41 57 push r15
2: 41 56 push r14
4: 53 push rbx
5: 48 81 ec 00 08 00 00 sub rsp,0x800
c: 48 89 f3 mov rbx,rsi
f: 49 89 fe mov r14,rdi
12: 49 89 e7 mov r15,rsp
15: ba 00 08 00 00 mov edx,0x800
1a: 4c 89 ff mov rdi,r15
1d: 4c 89 f6 mov rsi,r14
20: e8 00 00 00 00 call 25 <swap.f(ref swap.Huge, ref swap.Huge)+0x25>
21: R_X86_64_PLT32 memcpy-0x4
25: ba 00 08 00 00 mov edx,0x800
2a: 4c 89 f7 mov rdi,r14
2d: 48 89 de mov rsi,rbx
30: e8 00 00 00 00 call 35 <swap.f(ref swap.Huge, ref swap.Huge)+0x35>
31: R_X86_64_PLT32 memcpy-0x4
35: ba 00 08 00 00 mov edx,0x800
3a: 48 89 df mov rdi,rbx
3d: 4c 89 fe mov rsi,r15
40: e8 00 00 00 00 call 45 <swap.f(ref swap.Huge, ref swap.Huge)+0x45>
41: R_X86_64_PLT32 memcpy-0x4
45: 48 81 c4 00 08 00 00 add rsp,0x800
4c: 5b pop rbx
4d: 41 5e pop r14
4f: 41 5f pop r15
51: c3 ret
(...)
0000000000000000 <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)>:
0: 41 57 push r15
2: 41 56 push r14
4: 53 push rbx
5: 48 81 ec 00 08 00 00 sub rsp,0x800
c: 48 89 f3 mov rbx,rsi
f: 49 89 fe mov r14,rdi
12: 49 89 e7 mov r15,rsp
15: ba 00 08 00 00 mov edx,0x800
1a: 4c 89 ff mov rdi,r15
1d: 4c 89 f6 mov rsi,r14
20: e8 00 00 00 00 call 25 <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)+0x25>
21: R_X86_64_PLT32 memcpy-0x4
25: ba 00 08 00 00 mov edx,0x800
2a: 4c 89 f7 mov rdi,r14
2d: 48 89 de mov rsi,rbx
30: e8 00 00 00 00 call 35 <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)+0x35>
31: R_X86_64_PLT32 memcpy-0x4
35: ba 00 08 00 00 mov edx,0x800
3a: 48 89 df mov rdi,rbx
3d: 4c 89 fe mov rsi,r15
40: e8 00 00 00 00 call 45 <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)+0x45>
41: R_X86_64_PLT32 memcpy-0x4
45: 48 81 c4 00 08 00 00 add rsp,0x800
4c: 5b pop rbx
4d: 41 5e pop r14
4f: 41 5f pop r15
51: c3 ret
実装みてもそんな感じしないけど
void swap(T)(ref T lhs, ref T rhs) if (is(typeof(lhs.proxySwap(rhs)))) { lhs.proxySwap(rhs); } /// ditto void swap(T)(ref T lhs, ref T rhs) @trusted pure nothrow @nogc if (isBlitAssignable!T && !is(typeof(lhs.proxySwap(rhs)))) { import std.traits : hasAliasing, hasElaborateAssign, isAssignable, isStaticArray; static if (hasAliasing!T) if (!__ctfe) { import std.exception : doesPointTo; assert(!doesPointTo(lhs, lhs), "Swap: lhs internal pointer."); assert(!doesPointTo(rhs, rhs), "Swap: rhs internal pointer."); assert(!doesPointTo(lhs, rhs), "Swap: lhs points to rhs."); assert(!doesPointTo(rhs, lhs), "Swap: rhs points to lhs."); } static if (hasElaborateAssign!T || !isAssignable!T) { if (&lhs != &rhs) { static if (__traits(compiles, lhs.tupleof = rhs.tupleof)) { if (__ctfe) { // can't reinterpret cast foreach (i, ref e; lhs.tupleof) swap(e, rhs.tupleof[i]); return; } } // For structs with non-trivial assignment, move memory directly ubyte[T.sizeof] t = void; auto a = (cast(ubyte*) &lhs)[0 .. T.sizeof]; auto b = (cast(ubyte*) &rhs)[0 .. T.sizeof]; t[] = a[]; a[] = b[]; b[] = t[]; } } else { //Avoid assigning overlapping arrays. Dynamic arrays are fine, because //it's their ptr and length properties which get assigned rather //than their elements when assigning them, but static arrays are value //types and therefore all of their elements get copied as part of //assigning them, which would be assigning overlapping arrays if lhs //and rhs were the same array. static if (isStaticArray!T) { if (lhs.ptr == rhs.ptr) return; } // For non-elaborate-assign types, suffice to do the classic swap static if (__traits(hasCopyConstructor, T)) { // don't invoke any elaborate constructors either T tmp = void; tmp = lhs; } else auto tmp = lhs; lhs = rhs; rhs = tmp; } }
なんかうまく最適化してくれそうだからここにきてほしいんだよな。
// For structs with non-trivial assignment, move memory directly ubyte[T.sizeof] t = void; auto a = (cast(ubyte*) &lhs)[0 .. T.sizeof]; auto b = (cast(ubyte*) &rhs)[0 .. T.sizeof]; t[] = a[]; a[] = b[]; b[] = t[];
hasElaborateAssign!T || !isAssignable!Tを満たしてないのか。
import core.int128 : Cent; import std.algorithm.mutation : swap; import std.traits : isAssignable, hasElaborateAssign; struct Huge { Cent[128] data; alias this = data; } void f(ref Huge x, ref Huge y) { pragma(msg, hasElaborateAssign!Huge); pragma(msg, !isAssignable!Huge); swap(x, y); }
だめだった。
$ ldc2 -O -c swap.d false false
disable this(this)を足そう。
import core.int128 : Cent; import std.algorithm.mutation : swap; import std.traits : isAssignable, hasElaborateAssign; struct Huge { @disable this(this); Cent[128] data; alias this = data; } void f(ref Huge x, ref Huge y) { pragma(msg, hasElaborateAssign!Huge); pragma(msg, !isAssignable!Huge); swap(x, y); }
$ ldc2 -O -c swap.d true true
これでいけるかな。
$ objdump -dr --demangle=dlang -Mintel swap.o
(...)
0000000000000000 <swap.f(ref swap.Huge, ref swap.Huge)>:
0: 48 39 f7 cmp rdi,rsi
3: 74 5c je 61 <swap.f(ref swap.Huge, ref swap.Huge)+0x61>
5: 41 57 push r15
7: 41 56 push r14
9: 53 push rbx
a: 48 81 ec 00 08 00 00 sub rsp,0x800
11: 48 89 f3 mov rbx,rsi
14: 49 89 fe mov r14,rdi
17: 49 89 e7 mov r15,rsp
1a: ba 00 08 00 00 mov edx,0x800
1f: 4c 89 ff mov rdi,r15
22: 4c 89 f6 mov rsi,r14
25: e8 00 00 00 00 call 2a <swap.f(ref swap.Huge, ref swap.Huge)+0x2a>
26: R_X86_64_PLT32 memcpy-0x4
2a: be 00 08 00 00 mov esi,0x800
2f: b9 00 08 00 00 mov ecx,0x800
34: 41 b8 01 00 00 00 mov r8d,0x1
3a: 4c 89 f7 mov rdi,r14
3d: 48 89 da mov rdx,rbx
40: e8 00 00 00 00 call 45 <swap.f(ref swap.Huge, ref swap.Huge)+0x45>
41: R_X86_64_PLT32 _d_array_slice_copy-0x4
45: ba 00 08 00 00 mov edx,0x800
4a: 48 89 df mov rdi,rbx
4d: 4c 89 fe mov rsi,r15
50: e8 00 00 00 00 call 55 <swap.f(ref swap.Huge, ref swap.Huge)+0x55>
51: R_X86_64_PLT32 memcpy-0x4
55: 48 81 c4 00 08 00 00 add rsp,0x800
5c: 5b pop rbx
5d: 41 5e pop r14
5f: 41 5f pop r15
61: c3 ret
(...)
0000000000000000 <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)>:
0: 48 39 f7 cmp rdi,rsi
3: 74 5c je 61 <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)+0x61>
5: 41 57 push r15
7: 41 56 push r14
9: 53 push rbx
a: 48 81 ec 00 08 00 00 sub rsp,0x800
11: 48 89 f3 mov rbx,rsi
14: 49 89 fe mov r14,rdi
17: 49 89 e7 mov r15,rsp
1a: ba 00 08 00 00 mov edx,0x800
1f: 4c 89 ff mov rdi,r15
22: 4c 89 f6 mov rsi,r14
25: e8 00 00 00 00 call 2a <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)+0x2a>
26: R_X86_64_PLT32 memcpy-0x4
2a: be 00 08 00 00 mov esi,0x800
2f: b9 00 08 00 00 mov ecx,0x800
34: 41 b8 01 00 00 00 mov r8d,0x1
3a: 4c 89 f7 mov rdi,r14
3d: 48 89 da mov rdx,rbx
40: e8 00 00 00 00 call 45 <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)+0x45>
41: R_X86_64_PLT32 _d_array_slice_copy-0x4
45: ba 00 08 00 00 mov edx,0x800
4a: 48 89 df mov rdi,rbx
4d: 4c 89 fe mov rsi,r15
50: e8 00 00 00 00 call 55 <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)+0x55>
51: R_X86_64_PLT32 memcpy-0x4
55: 48 81 c4 00 08 00 00 add rsp,0x800
5c: 5b pop rbx
5d: 41 5e pop r14
5f: 41 5f pop r15
61: c3 ret
(...)
0000000000000000 <std.algorithm.mutation.swap!(core.int128.Cent[128]).swap(ref core.int128.Cent[128], ref core.int128.Cent[128])>:
0: 48 39 f7 cmp rdi,rsi
3: 74 51 je 56 <std.algorithm.mutation.swap!(core.int128.Cent[128]).swap(ref core.int128.Cent[128], ref core.int128.Cent[128])+0x56>
5: 41 57 push r15
7: 41 56 push r14
9: 53 push rbx
a: 48 81 ec 00 08 00 00 sub rsp,0x800
11: 48 89 f3 mov rbx,rsi
14: 49 89 fe mov r14,rdi
17: 49 89 e7 mov r15,rsp
1a: ba 00 08 00 00 mov edx,0x800
1f: 4c 89 ff mov rdi,r15
22: 4c 89 f6 mov rsi,r14
25: e8 00 00 00 00 call 2a <std.algorithm.mutation.swap!(core.int128.Cent[128]).swap(ref core.int128.Cent[128], ref core.int128.Cent[128])+0x2a>
26: R_X86_64_PLT32 memcpy-0x4
2a: ba 00 08 00 00 mov edx,0x800
2f: 4c 89 f7 mov rdi,r14
32: 48 89 de mov rsi,rbx
35: e8 00 00 00 00 call 3a <std.algorithm.mutation.swap!(core.int128.Cent[128]).swap(ref core.int128.Cent[128], ref core.int128.Cent[128])+0x3a>
36: R_X86_64_PLT32 memcpy-0x4
3a: ba 00 08 00 00 mov edx,0x800
3f: 48 89 df mov rdi,rbx
42: 4c 89 fe mov rsi,r15
45: e8 00 00 00 00 call 4a <std.algorithm.mutation.swap!(core.int128.Cent[128]).swap(ref core.int128.Cent[128], ref core.int128.Cent[128])+0x4a>
46: R_X86_64_PLT32 memcpy-0x4
4a: 48 81 c4 00 08 00 00 add rsp,0x800
51: 5b pop rbx
52: 41 5e pop r14
54: 41 5f pop r15
56: c3 ret
ぜんぜんだめ。
(というか_d_array_slice_copyはmemcpyにloweringされてほしいのだがエイリアス情報がないのでだめか。@restrictをつけるとできそうだが。)
sliceのコピーがそもそもだめなのか。
export void myswap(ref Huge lhs, ref Huge rhs) { ubyte[Huge.sizeof] t = void; auto a = (cast(ubyte*) &lhs)[0 .. Huge.sizeof]; auto b = (cast(ubyte*) &rhs)[0 .. Huge.sizeof]; t[] = a[]; a[] = b[]; b[] = t[]; }
そうらしい。
$ ldc2 -O -c swap.d
$ objdump -dr --demangle=dlang -Mintel swap.o
(...)
0000000000000000 <swap.myswap(ref swap.Huge, ref swap.Huge)>:
0: 41 57 push r15
2: 41 56 push r14
4: 53 push rbx
5: 48 81 ec 00 08 00 00 sub rsp,0x800
c: 48 89 f3 mov rbx,rsi
f: 49 89 fe mov r14,rdi
12: 49 89 e7 mov r15,rsp
15: ba 00 08 00 00 mov edx,0x800
1a: 4c 89 ff mov rdi,r15
1d: 4c 89 f6 mov rsi,r14
20: e8 00 00 00 00 call 25 <swap.myswap(ref swap.Huge, ref swap.Huge)+0x25>
21: R_X86_64_PLT32 memcpy-0x4
25: be 00 08 00 00 mov esi,0x800
2a: b9 00 08 00 00 mov ecx,0x800
2f: 41 b8 01 00 00 00 mov r8d,0x1
35: 4c 89 f7 mov rdi,r14
38: 48 89 da mov rdx,rbx
3b: e8 00 00 00 00 call 40 <swap.myswap(ref swap.Huge, ref swap.Huge)+0x40>
3c: R_X86_64_PLT32 _d_array_slice_copy-0x4
40: ba 00 08 00 00 mov edx,0x800
45: 48 89 df mov rdi,rbx
48: 4c 89 fe mov rsi,r15
4b: e8 00 00 00 00 call 50 <swap.myswap(ref swap.Huge, ref swap.Huge)+0x50>
4c: R_X86_64_PLT32 memcpy-0x4
50: 48 81 c4 00 08 00 00 add rsp,0x800
57: 5b pop rbx
58: 41 5e pop r14
5a: 41 5f pop r15
5c: c3 ret
と思ってみてみたらswapRangesってのがあるのか。
import core.int128 : Cent; import std.algorithm.mutation : swapRanges; struct Huge { Cent[128] data; alias this = data; } void f(ref Huge x, ref Huge y) { swapRanges(x[], y[]); }
あ、これでいいですね。
0000000000000000 <swap.f(ref swap.Huge, ref swap.Huge)>: swap.f(ref swap.Huge, ref swap.Huge): 0: 31 c0 xor eax,eax 2: 66 66 66 66 66 2e 0f data16 data16 data16 data16 cs nop WORD PTR [rax+rax*1+0x0] 9: 1f 84 00 00 00 00 00 10: 0f 10 04 07 movups xmm0,XMMWORD PTR [rdi+rax*1] 14: 0f 29 44 24 e8 movaps XMMWORD PTR [rsp-0x18],xmm0 19: 0f 10 04 06 movups xmm0,XMMWORD PTR [rsi+rax*1] 1d: 0f 11 04 07 movups XMMWORD PTR [rdi+rax*1],xmm0 21: 0f 28 44 24 e8 movaps xmm0,XMMWORD PTR [rsp-0x18] 26: 0f 11 04 06 movups XMMWORD PTR [rsi+rax*1],xmm0 2a: 0f 10 44 07 10 movups xmm0,XMMWORD PTR [rdi+rax*1+0x10] 2f: 0f 29 44 24 e8 movaps XMMWORD PTR [rsp-0x18],xmm0 34: 0f 10 44 06 10 movups xmm0,XMMWORD PTR [rsi+rax*1+0x10] 39: 0f 11 44 07 10 movups XMMWORD PTR [rdi+rax*1+0x10],xmm0 3e: 0f 28 44 24 e8 movaps xmm0,XMMWORD PTR [rsp-0x18] 43: 0f 11 44 06 10 movups XMMWORD PTR [rsi+rax*1+0x10],xmm0 48: 48 83 c0 20 add rax,0x20 4c: 48 3d 00 08 00 00 cmp rax,0x800 52: 75 bc jne 10 <swap.f(ref swap.Huge, ref swap.Huge)+0x10> 54: c3 ret Disassembly of section .text._D3std9algorithm8mutation__T10swapRangesTAS4core6int1284CentTQuZQBkFNaNbNiNfQBjQBmZSQDe8typecons__T5TupleTQCnTQCrZQp: 0000000000000000 <std.algorithm.mutation.swapRanges!(core.int128.Cent[], core.int128.Cent[]).swapRanges(core.int128.Cent[], core.int128.Cent[])>: std.algorithm.mutation.swapRanges!(core.int128.Cent[], core.int128.Cent[]).swapRanges(core.int128.Cent[], core.int128.Cent[]): 0: 48 89 f8 mov rax,rdi 3: 48 85 f6 test rsi,rsi 6: 40 0f 94 c7 sete dil a: 48 85 c9 test rcx,rcx d: 41 0f 94 c1 sete r9b 11: 41 08 f9 or r9b,dil 14: 75 4d jne 63 <std.algorithm.mutation.swapRanges!(core.int128.Cent[], core.int128.Cent[]).swapRanges(core.int128.Cent[], core.int128.Cent[])+0x63> 16: 31 ff xor edi,edi 18: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0] 1f: 00 20: 0f 10 02 movups xmm0,XMMWORD PTR [rdx] 23: 0f 29 44 24 e8 movaps XMMWORD PTR [rsp-0x18],xmm0 28: 41 0f 10 00 movups xmm0,XMMWORD PTR [r8] 2c: 0f 11 02 movups XMMWORD PTR [rdx],xmm0 2f: 0f 28 44 24 e8 movaps xmm0,XMMWORD PTR [rsp-0x18] 34: 41 0f 11 00 movups XMMWORD PTR [r8],xmm0 38: 48 83 c2 10 add rdx,0x10 3c: 49 83 c0 10 add r8,0x10 40: 4c 8d 14 3e lea r10,[rsi+rdi*1] 44: 4c 8d 4f ff lea r9,[rdi-0x1] 48: 49 83 fa 01 cmp r10,0x1 4c: 74 0f je 5d <std.algorithm.mutation.swapRanges!(core.int128.Cent[], core.int128.Cent[]).swapRanges(core.int128.Cent[], core.int128.Cent[])+0x5d> 4e: 4c 8d 14 0f lea r10,[rdi+rcx*1] 52: 49 ff ca dec r10 55: 4c 89 cf mov rdi,r9 58: 4d 85 d2 test r10,r10 5b: 75 c3 jne 20 <std.algorithm.mutation.swapRanges!(core.int128.Cent[], core.int128.Cent[]).swapRanges(core.int128.Cent[], core.int128.Cent[])+0x20> 5d: 4c 01 c9 add rcx,r9 60: 4c 01 ce add rsi,r9 63: 48 89 30 mov QWORD PTR [rax],rsi 66: 48 89 50 08 mov QWORD PTR [rax+0x8],rdx 6a: 48 89 48 10 mov QWORD PTR [rax+0x10],rcx 6e: 4c 89 40 18 mov QWORD PTR [rax+0x18],r8 72: c3 ret
どうもLLVMは1バイトずつswapする処理をうまく最適化できるらしい。 swapRangesもこのような実装になっている。
Tuple!(InputRange1, InputRange2) swapRanges(InputRange1, InputRange2)(InputRange1 r1, InputRange2 r2) if (hasSwappableElements!InputRange1 && hasSwappableElements!InputRange2 && is(ElementType!InputRange1 == ElementType!InputRange2)) { for (; !r1.empty && !r2.empty; r1.popFront(), r2.popFront()) { swap(r1.front, r2.front); } return tuple(r1, r2); }
いままでのswapのはなしはなんだったの?なんだったんですかね。。
Conclusion
- 複数のelementがあるデータのswapがしたいときはswapRangesを使おう