swap/swapRangesのはなし

ふと、swapみたいな単純なコードでも標準ライブラリの実装を呼び出した方がいいよ、という例として

import core.int128 : Cent;
import std.algorithm.mutation : swap;

struct Huge
{
    Cent[128] data;
    alias this = data;
}

void f(ref Huge x, ref Huge y)
{
    swap(x, y);
}

みたいなコードを使おうと思ったんだけど、ぜんぜんそんなことはなかった。

$ ldc2 -O -c swap.d
$ objdump -dr --demangle=dlang -Mintel swap.o
(...)
0000000000000000 <swap.f(ref swap.Huge, ref swap.Huge)>:
   0:   41 57                   push   r15
   2:   41 56                   push   r14
   4:   53                      push   rbx
   5:   48 81 ec 00 08 00 00    sub    rsp,0x800
   c:   48 89 f3                mov    rbx,rsi
   f:   49 89 fe                mov    r14,rdi
  12:   49 89 e7                mov    r15,rsp
  15:   ba 00 08 00 00          mov    edx,0x800
  1a:   4c 89 ff                mov    rdi,r15
  1d:   4c 89 f6                mov    rsi,r14
  20:   e8 00 00 00 00          call   25 <swap.f(ref swap.Huge, ref swap.Huge)+0x25>
            21: R_X86_64_PLT32  memcpy-0x4
  25:   ba 00 08 00 00          mov    edx,0x800
  2a:   4c 89 f7                mov    rdi,r14
  2d:   48 89 de                mov    rsi,rbx
  30:   e8 00 00 00 00          call   35 <swap.f(ref swap.Huge, ref swap.Huge)+0x35>
            31: R_X86_64_PLT32  memcpy-0x4
  35:   ba 00 08 00 00          mov    edx,0x800
  3a:   48 89 df                mov    rdi,rbx
  3d:   4c 89 fe                mov    rsi,r15
  40:   e8 00 00 00 00          call   45 <swap.f(ref swap.Huge, ref swap.Huge)+0x45>
            41: R_X86_64_PLT32  memcpy-0x4
  45:   48 81 c4 00 08 00 00    add    rsp,0x800
  4c:   5b                      pop    rbx
  4d:   41 5e                   pop    r14
  4f:   41 5f                   pop    r15
  51:   c3                      ret    
(...)
0000000000000000 <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)>:
   0:   41 57                   push   r15
   2:   41 56                   push   r14
   4:   53                      push   rbx
   5:   48 81 ec 00 08 00 00    sub    rsp,0x800
   c:   48 89 f3                mov    rbx,rsi
   f:   49 89 fe                mov    r14,rdi
  12:   49 89 e7                mov    r15,rsp
  15:   ba 00 08 00 00          mov    edx,0x800
  1a:   4c 89 ff                mov    rdi,r15
  1d:   4c 89 f6                mov    rsi,r14
  20:   e8 00 00 00 00          call   25 <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)+0x25>
            21: R_X86_64_PLT32  memcpy-0x4
  25:   ba 00 08 00 00          mov    edx,0x800
  2a:   4c 89 f7                mov    rdi,r14
  2d:   48 89 de                mov    rsi,rbx
  30:   e8 00 00 00 00          call   35 <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)+0x35>
            31: R_X86_64_PLT32  memcpy-0x4
  35:   ba 00 08 00 00          mov    edx,0x800
  3a:   48 89 df                mov    rdi,rbx
  3d:   4c 89 fe                mov    rsi,r15
  40:   e8 00 00 00 00          call   45 <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)+0x45>
            41: R_X86_64_PLT32  memcpy-0x4
  45:   48 81 c4 00 08 00 00    add    rsp,0x800
  4c:   5b                      pop    rbx
  4d:   41 5e                   pop    r14
  4f:   41 5f                   pop    r15
  51:   c3                      ret

実装みてもそんな感じしないけど

void swap(T)(ref T lhs, ref T rhs)
if (is(typeof(lhs.proxySwap(rhs))))
{
    lhs.proxySwap(rhs);
}

/// ditto
void swap(T)(ref T lhs, ref T rhs) @trusted pure nothrow @nogc
if (isBlitAssignable!T && !is(typeof(lhs.proxySwap(rhs))))
{
    import std.traits : hasAliasing, hasElaborateAssign, isAssignable,
                        isStaticArray;
    static if (hasAliasing!T) if (!__ctfe)
    {
        import std.exception : doesPointTo;
        assert(!doesPointTo(lhs, lhs), "Swap: lhs internal pointer.");
        assert(!doesPointTo(rhs, rhs), "Swap: rhs internal pointer.");
        assert(!doesPointTo(lhs, rhs), "Swap: lhs points to rhs.");
        assert(!doesPointTo(rhs, lhs), "Swap: rhs points to lhs.");
    }

    static if (hasElaborateAssign!T || !isAssignable!T)
    {
        if (&lhs != &rhs)
        {
            static if (__traits(compiles, lhs.tupleof = rhs.tupleof))
            {
                if (__ctfe)
                {
                    // can't reinterpret cast
                    foreach (i, ref e; lhs.tupleof)
                        swap(e, rhs.tupleof[i]);
                    return;
                }
            }

            // For structs with non-trivial assignment, move memory directly
            ubyte[T.sizeof] t = void;
            auto a = (cast(ubyte*) &lhs)[0 .. T.sizeof];
            auto b = (cast(ubyte*) &rhs)[0 .. T.sizeof];
            t[] = a[];
            a[] = b[];
            b[] = t[];
        }
    }
    else
    {
        //Avoid assigning overlapping arrays. Dynamic arrays are fine, because
        //it's their ptr and length properties which get assigned rather
        //than their elements when assigning them, but static arrays are value
        //types and therefore all of their elements get copied as part of
        //assigning them, which would be assigning overlapping arrays if lhs
        //and rhs were the same array.
        static if (isStaticArray!T)
        {
            if (lhs.ptr == rhs.ptr)
                return;
        }

        // For non-elaborate-assign types, suffice to do the classic swap
        static if (__traits(hasCopyConstructor, T))
        {
            // don't invoke any elaborate constructors either
            T tmp = void;
            tmp = lhs;
        }
        else
            auto tmp = lhs;
        lhs = rhs;
        rhs = tmp;
    }
}

なんかうまく最適化してくれそうだからここにきてほしいんだよな。

            // For structs with non-trivial assignment, move memory directly
            ubyte[T.sizeof] t = void;
            auto a = (cast(ubyte*) &lhs)[0 .. T.sizeof];
            auto b = (cast(ubyte*) &rhs)[0 .. T.sizeof];
            t[] = a[];
            a[] = b[];
            b[] = t[];

hasElaborateAssign!T || !isAssignable!Tを満たしてないのか。

import core.int128 : Cent;
import std.algorithm.mutation : swap;
import std.traits : isAssignable, hasElaborateAssign;

struct Huge
{
    Cent[128] data;
    alias this = data;
}

void f(ref Huge x, ref Huge y)
{
    pragma(msg, hasElaborateAssign!Huge);
    pragma(msg, !isAssignable!Huge);
    swap(x, y);
}

だめだった。

$ ldc2 -O -c swap.d
false
false

disable this(this)を足そう。

import core.int128 : Cent;
import std.algorithm.mutation : swap;
import std.traits : isAssignable, hasElaborateAssign;

struct Huge
{
    @disable this(this);
    Cent[128] data;
    alias this = data;
}

void f(ref Huge x, ref Huge y)
{
    pragma(msg, hasElaborateAssign!Huge);
    pragma(msg, !isAssignable!Huge);
    swap(x, y);
}

$ ldc2 -O -c swap.d
true
true

これでいけるかな。

$ objdump -dr --demangle=dlang -Mintel swap.o
(...)
0000000000000000 <swap.f(ref swap.Huge, ref swap.Huge)>:
   0:   48 39 f7                cmp    rdi,rsi
   3:   74 5c                   je     61 <swap.f(ref swap.Huge, ref swap.Huge)+0x61>
   5:   41 57                   push   r15
   7:   41 56                   push   r14
   9:   53                      push   rbx
   a:   48 81 ec 00 08 00 00    sub    rsp,0x800
  11:   48 89 f3                mov    rbx,rsi
  14:   49 89 fe                mov    r14,rdi
  17:   49 89 e7                mov    r15,rsp
  1a:   ba 00 08 00 00          mov    edx,0x800
  1f:   4c 89 ff                mov    rdi,r15
  22:   4c 89 f6                mov    rsi,r14
  25:   e8 00 00 00 00          call   2a <swap.f(ref swap.Huge, ref swap.Huge)+0x2a>
            26: R_X86_64_PLT32  memcpy-0x4
  2a:   be 00 08 00 00          mov    esi,0x800
  2f:   b9 00 08 00 00          mov    ecx,0x800
  34:   41 b8 01 00 00 00       mov    r8d,0x1
  3a:   4c 89 f7                mov    rdi,r14
  3d:   48 89 da                mov    rdx,rbx
  40:   e8 00 00 00 00          call   45 <swap.f(ref swap.Huge, ref swap.Huge)+0x45>
            41: R_X86_64_PLT32  _d_array_slice_copy-0x4
  45:   ba 00 08 00 00          mov    edx,0x800
  4a:   48 89 df                mov    rdi,rbx
  4d:   4c 89 fe                mov    rsi,r15
  50:   e8 00 00 00 00          call   55 <swap.f(ref swap.Huge, ref swap.Huge)+0x55>
            51: R_X86_64_PLT32  memcpy-0x4
  55:   48 81 c4 00 08 00 00    add    rsp,0x800
  5c:   5b                      pop    rbx
  5d:   41 5e                   pop    r14
  5f:   41 5f                   pop    r15
  61:   c3                      ret    
(...)
0000000000000000 <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)>:
   0:   48 39 f7                cmp    rdi,rsi
   3:   74 5c                   je     61 <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)+0x61>
   5:   41 57                   push   r15
   7:   41 56                   push   r14
   9:   53                      push   rbx
   a:   48 81 ec 00 08 00 00    sub    rsp,0x800
  11:   48 89 f3                mov    rbx,rsi
  14:   49 89 fe                mov    r14,rdi
  17:   49 89 e7                mov    r15,rsp
  1a:   ba 00 08 00 00          mov    edx,0x800
  1f:   4c 89 ff                mov    rdi,r15
  22:   4c 89 f6                mov    rsi,r14
  25:   e8 00 00 00 00          call   2a <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)+0x2a>
            26: R_X86_64_PLT32  memcpy-0x4
  2a:   be 00 08 00 00          mov    esi,0x800
  2f:   b9 00 08 00 00          mov    ecx,0x800
  34:   41 b8 01 00 00 00       mov    r8d,0x1
  3a:   4c 89 f7                mov    rdi,r14
  3d:   48 89 da                mov    rdx,rbx
  40:   e8 00 00 00 00          call   45 <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)+0x45>
            41: R_X86_64_PLT32  _d_array_slice_copy-0x4
  45:   ba 00 08 00 00          mov    edx,0x800
  4a:   48 89 df                mov    rdi,rbx
  4d:   4c 89 fe                mov    rsi,r15
  50:   e8 00 00 00 00          call   55 <std.algorithm.mutation.swap!(swap.Huge).swap(ref swap.Huge, ref swap.Huge)+0x55>
            51: R_X86_64_PLT32  memcpy-0x4
  55:   48 81 c4 00 08 00 00    add    rsp,0x800
  5c:   5b                      pop    rbx
  5d:   41 5e                   pop    r14
  5f:   41 5f                   pop    r15
  61:   c3                      ret    
(...)
0000000000000000 <std.algorithm.mutation.swap!(core.int128.Cent[128]).swap(ref core.int128.Cent[128], ref core.int128.Cent[128])>:
   0:   48 39 f7                cmp    rdi,rsi
   3:   74 51                   je     56 <std.algorithm.mutation.swap!(core.int128.Cent[128]).swap(ref core.int128.Cent[128], ref core.int128.Cent[128])+0x56>
   5:   41 57                   push   r15
   7:   41 56                   push   r14
   9:   53                      push   rbx
   a:   48 81 ec 00 08 00 00    sub    rsp,0x800
  11:   48 89 f3                mov    rbx,rsi
  14:   49 89 fe                mov    r14,rdi
  17:   49 89 e7                mov    r15,rsp
  1a:   ba 00 08 00 00          mov    edx,0x800
  1f:   4c 89 ff                mov    rdi,r15
  22:   4c 89 f6                mov    rsi,r14
  25:   e8 00 00 00 00          call   2a <std.algorithm.mutation.swap!(core.int128.Cent[128]).swap(ref core.int128.Cent[128], ref core.int128.Cent[128])+0x2a>
            26: R_X86_64_PLT32  memcpy-0x4
  2a:   ba 00 08 00 00          mov    edx,0x800
  2f:   4c 89 f7                mov    rdi,r14
  32:   48 89 de                mov    rsi,rbx
  35:   e8 00 00 00 00          call   3a <std.algorithm.mutation.swap!(core.int128.Cent[128]).swap(ref core.int128.Cent[128], ref core.int128.Cent[128])+0x3a>
            36: R_X86_64_PLT32  memcpy-0x4
  3a:   ba 00 08 00 00          mov    edx,0x800
  3f:   48 89 df                mov    rdi,rbx
  42:   4c 89 fe                mov    rsi,r15
  45:   e8 00 00 00 00          call   4a <std.algorithm.mutation.swap!(core.int128.Cent[128]).swap(ref core.int128.Cent[128], ref core.int128.Cent[128])+0x4a>
            46: R_X86_64_PLT32  memcpy-0x4
  4a:   48 81 c4 00 08 00 00    add    rsp,0x800
  51:   5b                      pop    rbx
  52:   41 5e                   pop    r14
  54:   41 5f                   pop    r15
  56:   c3                      ret

ぜんぜんだめ。 (というか_d_array_slice_copyはmemcpyにloweringされてほしいのだがエイリアス情報がないのでだめか。@restrictをつけるとできそうだが。)

sliceのコピーがそもそもだめなのか。

export void myswap(ref Huge lhs, ref Huge rhs)
{
    ubyte[Huge.sizeof] t = void;
    auto a = (cast(ubyte*) &lhs)[0 .. Huge.sizeof];
    auto b = (cast(ubyte*) &rhs)[0 .. Huge.sizeof];
    t[] = a[];
    a[] = b[];
    b[] = t[];  
}

そうらしい。

$ ldc2 -O -c swap.d
$ objdump -dr --demangle=dlang -Mintel swap.o
(...)
0000000000000000 <swap.myswap(ref swap.Huge, ref swap.Huge)>:
   0:   41 57                   push   r15
   2:   41 56                   push   r14
   4:   53                      push   rbx
   5:   48 81 ec 00 08 00 00    sub    rsp,0x800
   c:   48 89 f3                mov    rbx,rsi
   f:   49 89 fe                mov    r14,rdi
  12:   49 89 e7                mov    r15,rsp
  15:   ba 00 08 00 00          mov    edx,0x800
  1a:   4c 89 ff                mov    rdi,r15
  1d:   4c 89 f6                mov    rsi,r14
  20:   e8 00 00 00 00          call   25 <swap.myswap(ref swap.Huge, ref swap.Huge)+0x25>
            21: R_X86_64_PLT32  memcpy-0x4
  25:   be 00 08 00 00          mov    esi,0x800
  2a:   b9 00 08 00 00          mov    ecx,0x800
  2f:   41 b8 01 00 00 00       mov    r8d,0x1
  35:   4c 89 f7                mov    rdi,r14
  38:   48 89 da                mov    rdx,rbx
  3b:   e8 00 00 00 00          call   40 <swap.myswap(ref swap.Huge, ref swap.Huge)+0x40>
            3c: R_X86_64_PLT32  _d_array_slice_copy-0x4
  40:   ba 00 08 00 00          mov    edx,0x800
  45:   48 89 df                mov    rdi,rbx
  48:   4c 89 fe                mov    rsi,r15
  4b:   e8 00 00 00 00          call   50 <swap.myswap(ref swap.Huge, ref swap.Huge)+0x50>
            4c: R_X86_64_PLT32  memcpy-0x4
  50:   48 81 c4 00 08 00 00    add    rsp,0x800
  57:   5b                      pop    rbx
  58:   41 5e                   pop    r14
  5a:   41 5f                   pop    r15
  5c:   c3                      ret

と思ってみてみたらswapRangesってのがあるのか。

import core.int128 : Cent;
import std.algorithm.mutation : swapRanges;

struct Huge
{
    Cent[128] data;
    alias this = data;
}

void f(ref Huge x, ref Huge y)
{
    swapRanges(x[], y[]);
}

あ、これでいいですね。

0000000000000000 <swap.f(ref swap.Huge, ref swap.Huge)>:
swap.f(ref swap.Huge, ref swap.Huge):
   0:   31 c0                   xor    eax,eax
   2:   66 66 66 66 66 2e 0f    data16 data16 data16 data16 cs nop WORD PTR [rax+rax*1+0x0]
   9:   1f 84 00 00 00 00 00 
  10:   0f 10 04 07             movups xmm0,XMMWORD PTR [rdi+rax*1]
  14:   0f 29 44 24 e8          movaps XMMWORD PTR [rsp-0x18],xmm0
  19:   0f 10 04 06             movups xmm0,XMMWORD PTR [rsi+rax*1]
  1d:   0f 11 04 07             movups XMMWORD PTR [rdi+rax*1],xmm0
  21:   0f 28 44 24 e8          movaps xmm0,XMMWORD PTR [rsp-0x18]
  26:   0f 11 04 06             movups XMMWORD PTR [rsi+rax*1],xmm0
  2a:   0f 10 44 07 10          movups xmm0,XMMWORD PTR [rdi+rax*1+0x10]
  2f:   0f 29 44 24 e8          movaps XMMWORD PTR [rsp-0x18],xmm0
  34:   0f 10 44 06 10          movups xmm0,XMMWORD PTR [rsi+rax*1+0x10]
  39:   0f 11 44 07 10          movups XMMWORD PTR [rdi+rax*1+0x10],xmm0
  3e:   0f 28 44 24 e8          movaps xmm0,XMMWORD PTR [rsp-0x18]
  43:   0f 11 44 06 10          movups XMMWORD PTR [rsi+rax*1+0x10],xmm0
  48:   48 83 c0 20             add    rax,0x20
  4c:   48 3d 00 08 00 00       cmp    rax,0x800
  52:   75 bc                   jne    10 <swap.f(ref swap.Huge, ref swap.Huge)+0x10>
  54:   c3                      ret    

Disassembly of section .text._D3std9algorithm8mutation__T10swapRangesTAS4core6int1284CentTQuZQBkFNaNbNiNfQBjQBmZSQDe8typecons__T5TupleTQCnTQCrZQp:

0000000000000000 <std.algorithm.mutation.swapRanges!(core.int128.Cent[], core.int128.Cent[]).swapRanges(core.int128.Cent[], core.int128.Cent[])>:
std.algorithm.mutation.swapRanges!(core.int128.Cent[], core.int128.Cent[]).swapRanges(core.int128.Cent[], core.int128.Cent[]):
   0:   48 89 f8                mov    rax,rdi
   3:   48 85 f6                test   rsi,rsi
   6:   40 0f 94 c7             sete   dil
   a:   48 85 c9                test   rcx,rcx
   d:   41 0f 94 c1             sete   r9b
  11:   41 08 f9                or     r9b,dil
  14:   75 4d                   jne    63 <std.algorithm.mutation.swapRanges!(core.int128.Cent[], core.int128.Cent[]).swapRanges(core.int128.Cent[], core.int128.Cent[])+0x63>
  16:   31 ff                   xor    edi,edi
  18:   0f 1f 84 00 00 00 00    nop    DWORD PTR [rax+rax*1+0x0]
  1f:   00 
  20:   0f 10 02                movups xmm0,XMMWORD PTR [rdx]
  23:   0f 29 44 24 e8          movaps XMMWORD PTR [rsp-0x18],xmm0
  28:   41 0f 10 00             movups xmm0,XMMWORD PTR [r8]
  2c:   0f 11 02                movups XMMWORD PTR [rdx],xmm0
  2f:   0f 28 44 24 e8          movaps xmm0,XMMWORD PTR [rsp-0x18]
  34:   41 0f 11 00             movups XMMWORD PTR [r8],xmm0
  38:   48 83 c2 10             add    rdx,0x10
  3c:   49 83 c0 10             add    r8,0x10
  40:   4c 8d 14 3e             lea    r10,[rsi+rdi*1]
  44:   4c 8d 4f ff             lea    r9,[rdi-0x1]
  48:   49 83 fa 01             cmp    r10,0x1
  4c:   74 0f                   je     5d <std.algorithm.mutation.swapRanges!(core.int128.Cent[], core.int128.Cent[]).swapRanges(core.int128.Cent[], core.int128.Cent[])+0x5d>
  4e:   4c 8d 14 0f             lea    r10,[rdi+rcx*1]
  52:   49 ff ca                dec    r10
  55:   4c 89 cf                mov    rdi,r9
  58:   4d 85 d2                test   r10,r10
  5b:   75 c3                   jne    20 <std.algorithm.mutation.swapRanges!(core.int128.Cent[], core.int128.Cent[]).swapRanges(core.int128.Cent[], core.int128.Cent[])+0x20>
  5d:   4c 01 c9                add    rcx,r9
  60:   4c 01 ce                add    rsi,r9
  63:   48 89 30                mov    QWORD PTR [rax],rsi
  66:   48 89 50 08             mov    QWORD PTR [rax+0x8],rdx
  6a:   48 89 48 10             mov    QWORD PTR [rax+0x10],rcx
  6e:   4c 89 40 18             mov    QWORD PTR [rax+0x18],r8
  72:   c3                      ret

どうもLLVMは1バイトずつswapする処理をうまく最適化できるらしい。 swapRangesもこのような実装になっている。

Tuple!(InputRange1, InputRange2)
swapRanges(InputRange1, InputRange2)(InputRange1 r1, InputRange2 r2)
if (hasSwappableElements!InputRange1 && hasSwappableElements!InputRange2
    && is(ElementType!InputRange1 == ElementType!InputRange2))
{
    for (; !r1.empty && !r2.empty; r1.popFront(), r2.popFront())
    {
        swap(r1.front, r2.front);
    }
    return tuple(r1, r2);
}

いままでのswapのはなしはなんだったの？なんだったんですかね。。

Conclusion

複数のelementがあるデータのswapがしたいときはswapRangesを使おう

kubo39's blog

ただの雑記です。

swap/swapRangesのはなし

Conclusion