kubo39's blog

ただの雑記です。

re-visit kNN and operf/LDC

以前書いたものはシングルスレッド版のRust/OCamlよりも遅かった。これをprofilingしてみる。

  • スペック
$ lscpu                                                [kubo39:knn][git:master]
Architecture:          x86_64
CPU 操作モード:   32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                4
On-line CPU(s) list:   0-3
コアあたりのスレッド数:2
ソケットあたりのコア数:2
Socket(s):             1
NUMA ノード数:     1
ベンダー ID:       GenuineIntel
CPU ファミリー:   6
モデル:             69
Model name:            Intel(R) Core(TM) i5-4200U CPU @ 1.60GHz
...
  • dmdのバージョン
$ dmd --version                                        [kubo39:knn][git:master]
DMD64 D Compiler v2.073.1

コードを2.073.1でも動くように修正。

import std.algorithm;
import std.array;
import std.conv;
import std.range;
import std.stdio;
import std.string;
import std.typecons;
import std.parallelism;

struct LabelPixel
{
    int label;
    int[] pixels;
}

auto slurpFile(string filename)
{
    int count;

    auto lp = File(filename)
        .byLine
        .dropOne
        .tee!(a => count++)
        .map!chomp
        .map!(a => a.to!string.split(","))
        .map!(a => LabelPixel(a[0].to!int, a[1..$].to!(int[])) )
        .array;
    return tuple(lp, count);
}

int distanceSqrt(const ref int[] x, const ref int[] y) pure
{
    return reduce!((a, b) => a + (b[0] - b[1]) * (b[0] - b[1]))(0, x.zip(y));
}

int classify(const ref LabelPixel[] training, const ref int[] pixels) pure
{
    int smallest = int.max;
    int result = void;

    foreach (t; training)
    {
        int tmp = distanceSqrt(t.pixels, pixels);
        if (tmp < smallest)
        {
            smallest = tmp;
            result = t.label;
        }
    }
    return result;
}

void main()
{
    const trainingSet = "trainingsample.csv".slurpFile;
    const validationSample = "validationsample.csv".slurpFile;

    int count(const LabelPixel data) pure
    {
        int num;
        if (classify(trainingSet[0], data.pixels) == data.label)
            num++;
        return num;
    }

    immutable num = taskPool.reduce!"a + b"(
        std.algorithm.map!(count)(validationSample[0]).array);

    writefln("Percentage correct: %f percent",
             num.to!double / validationSample[1].to!double * 100.0);
}

timeコマンドで測ってみる。

  • 最適化なし
$ time ./knn                                           [kubo39:knn][git:master]
Percentage correct: 94.400000 percent
./knn  110.16s user 0.03s system 99% cpu 1:50.33 total

うーん、遅い。

  • 最適化あり
$ time ./knn                                           [kubo39:knn][git:master]
Percentage correct: 94.400000 percent
./knn  90.50s user 0.06s system 99% cpu 1:30.59 total

少し早くなったけどぜんぜん遅い。

Rustを見直し。

Rustのシングルスレッド版を再度測ってみる。

$ rustc --version                                      [kubo39:knn][git:master]
rustc 1.17.0-nightly (306035c21 2017-02-18)

少々手直しが必要。

use std::io::{BufRead, BufReader};
use std::fs::File;
use std::path::Path;
use std::str::FromStr;

struct LabelPixel {
    label: i32,
    pixels: Vec<i32>
}

fn slurp_file(file: &Path) -> Vec<LabelPixel> {
    BufReader::new(File::open(file).unwrap())
        .lines()
        .skip(1)
        .map(|line| {
            let line = line.unwrap();
            let mut iter = line.trim()
                .split(',')
                .map(|x| i32::from_str(x).unwrap());

            LabelPixel {
                label: iter.next().unwrap(),
                pixels: iter.collect()
            }
        })
        .collect()
}

fn distance_sqr(x: &[i32], y: &[i32]) -> i32 {
    // run through the two vectors, summing up the squares of the differences
    x.iter()
        .zip(y.iter())
        .fold(0, |s, (&a, &b)| s + (a - b) * (a - b))
}

fn classify(training: &[LabelPixel], pixels: &[i32]) -> i32 {
    training
        .iter()
        // find element of `training` with the smallest distance_sqr to `pixel`
        .min_by_key(|p| distance_sqr(p.pixels.as_slice(), pixels)).unwrap()
        .label
}

fn main() {
    let training_set = slurp_file(&Path::new("trainingsample.csv"));
    let validation_sample = slurp_file(&Path::new("validationsample.csv"));

    let num_correct = validation_sample.iter()
        .filter(|x| {
            classify(training_set.as_slice(), x.pixels.as_slice()) == x.label
        })
        .count();

    println!("Percentage correct: {}%",
             num_correct as f64 / validation_sample.len() as f64 * 100.0);
}

releaseビルド。めちゃくちゃ早い。。

$ cargo build --release                              [kubo39:knnrs][git:master]
   Compiling knnrs v0.1.0 (file:///home/kubo39/dev/dlang/knn/knnrs)
    Finished release [optimized] target(s) in 2.6 secs
$ time ./target/release/knnrs                        [kubo39:knnrs][git:master]
Percentage correct: 94.39999999999999%
./target/release/knnrs  1.21s user 0.01s system 99% cpu 1.217 total

D言語に戻る。

計測しやすいようにシングルスレッド版からはじめる。

import std.algorithm;
import std.array;
import std.conv;
import std.range;
import std.stdio;
import std.string;
import std.typecons;

struct LabelPixel
{
    int label;
    int[] pixels;
}

auto slurpFile(string filename)
{
    int count;

    auto lp = File(filename)
        .byLine
        .dropOne
        .tee!(a => count++)
        .map!chomp
        .map!(a => a.to!string.split(","))
        .map!(a => LabelPixel(a[0].to!int, a[1..$].to!(int[])) )
        .array;
    return tuple(lp, count);
}

int distanceSqrt(const ref int[] x, const ref int[] y) pure
{
    return reduce!((a, b) => a + (b[0] - b[1]) * (b[0] - b[1]))(0, x.zip(y));
}

int classify(const ref LabelPixel[] training, const ref int[] pixels) pure
{
    int smallest = int.max;
    int result = void;

    foreach (t; training)
    {
        int tmp = distanceSqrt(t.pixels, pixels);
        if (tmp < smallest)
        {
            smallest = tmp;
            result = t.label;
        }
    }
    return result;
}

void main()
{
    const trainingSet = "trainingsample.csv".slurpFile;
    const validationSample = "validationsample.csv".slurpFile;

    int count(const LabelPixel data) pure
    {
        int num;
        if (classify(trainingSet[0], data.pixels) == data.label)
            num++;
        return num;
    }

    immutable num = reduce!"a + b"(
        std.algorithm.map!(count)(validationSample[0]).array);

    writefln("Percentage correct: %f percent",
             num.to!double / validationSample[1].to!double * 100.0);
}

シングルスレッド版のほうが早い。といってもあまり変わらないが。

$ dmd -O knn.d                                         [kubo39:knn][git:master]
$ time ./knn                                           [kubo39:knn][git:master]
Percentage correct: 94.400000 percent
./knn  88.90s user 0.02s system 99% cpu 1:28.94 total

dmdだとだめそう。ldc2は理論値性能が出る?

http://leonardo-m.livejournal.com/111598.html

$ ldc2 -version | head -2
LDC - the LLVM D compiler (1.0.0):
  based on DMD v2.071.2 and LLVM 3.8.1

最適化つきで計測。DMDよりぜんぜん速いけどRustより遅い。

$ ldc2 -O knn.d
$ time ./knn
Percentage correct: 94.400000 percent
./knn  10.76s user 0.02s system 99% cpu 10.794 total

operf

operfはdwarf情報を使うので-gつけてビルド。

$ ldc2 -O -g knn.d
$ sudo operf ./knn
operf: Profiler started
Percentage correct: 94.400000 percent

Profiling done.

opannotate --source をみると、distanceSqrt関数が14%を占めていることがわかる。

$ opannotate --source
[ ... ]
/* 
 * Total samples for file : "/home/kubo39/dev/dlang/knn/knn.d"
 * 
 *  45735 15.2286
 */


               :import std.algorithm;
               :import std.array;
               :import std.conv;
               :import std.range;
               :import std.stdio;
               :import std.string;
               :import std.typecons;
               :
               :struct LabelPixel
               :{
               :    int label;
               :    int[] pixels;
               :}
               :
               :auto slurpFile(string filename)
               :{
               :    int count;
               :
               :    auto lp = File(filename)
               :        .byLine
               :        .dropOne
               :        .tee!(a => count++)
               :        .map!chomp
               :        .map!(a => a.to!string.split(","))
               :        .map!(a => LabelPixel(a[0].to!int, a[1..$].to!(int[])) )
               :        .array;
               :    return tuple(lp, count);
               :}
               :
               :int distanceSqrt(const ref int[] x, const ref int[] y) pure
               :{
 45028 14.9931 :    return reduce!((a, b) => a + (b[0] - b[1]) * (b[0] - b[1]))(0, x.zip(y)); /* _D3knn12distanceSqrtFNaKxAiKxAiZi total: 165424 55.0818 */
               :}
               :
               :int classify(const ref LabelPixel[] training, const ref int[] pixels) pure
               :{
               :    int smallest = int.max;
               :    int result = void;
               :
   523  0.1741 :    foreach (t; training)
               :    {
    33  0.0110 :        int tmp = distanceSqrt(t.pixels, pixels);
    50  0.0166 :        if (tmp < smallest)
               :        {
               :            smallest = tmp;
               :            result = t.label;
               :        }
               :    }
               :    return result;
               :}
               :
               :void main()
               :{
               :    const trainingSet = "trainingsample.csv".slurpFile; /* _Dmain total:    711  0.2367 */
               :    const validationSample = "validationsample.csv".slurpFile;
               :
               :    int count(const LabelPixel data) pure
               :    {
               :        int num;
   101  0.0336 :        if (classify(trainingSet[0], data.pixels) == data.label)
               :            num++;
               :        return num;
               :    }
               :
               :    immutable num = reduce!"a + b"(
               :        std.algorithm.map!(count)(validationSample[0]).array);
               :
               :    writefln("Percentage correct: %f percent",
               :             num.to!double / validationSample[1].to!double * 100.0);
               :}

distanceSqrtの実装を変えてみる。

...
int distanceSqrt(const ref int[] x, const ref int[] y) pure
{
    int total;
    foreach (i, a; x)
        total += (a - y[i]) ^^ 2;
    return total;
}
...

4倍以上早くなった。

$ ldc2 -O knn.d
$ time ./knn
Percentage correct: 94.400000 percent
./knn  2.45s user 0.01s system 99% cpu 2.463 total

その他シングルスレッド版では無駄な処理を省いてみる。

import std.algorithm;
import std.array;
import std.conv;
import std.range;
import std.stdio;
import std.string;
import std.typecons;

struct LabelPixel
{
    int label;
    int[] pixels;
}

auto slurpFile(string filename)
{
    int count;

    return File(filename)
        .byLine
        .dropOne
        .map!chomp
        .map!(a => a.to!string.split(","))
        .map!(a => LabelPixel(a[0].to!int, a[1..$].to!(int[])) )
        .array;
}

int distanceSqrt(const ref int[] x, const ref int[] y) pure
{
    int total;
    foreach (i, a; x)
        total += (a - y[i]) ^^ 2;
    return total;
}

int classify(const ref LabelPixel[] training, const ref int[] pixels) pure
{
    int smallest = int.max;
    int result = void;

    foreach (t; training)
    {
        int tmp = distanceSqrt(t.pixels, pixels);
        if (tmp < smallest)
        {
            smallest = tmp;
            result = t.label;
        }
    }
    return result;
}

void main()
{
    const trainingSet = "trainingsample.csv".slurpFile;
    const validationSample = "validationsample.csv".slurpFile;

    immutable num = validationSample
        .filter!(a => classify(trainingSet, a.pixels) == a.label)
        .count;

    writefln("Percentage correct: %f percent",
             num.to!double / validationSample.length.to!double * 100.0);
}

うーむ、誤差程度。というか少し遅くなってる。

$ ldc2 -O knn.d
$ time ./knn
Percentage correct: 94.400000 percent
./knn  2.52s user 0.00s system 99% cpu 2.528 total

もう一度operfをかけてみる。72%が二乗の処理らしい。

...
               :int distanceSqrt(const ref int[] x, const ref int[] y) pure
               :{
               :    int total;
  8513 13.5856 :    foreach (i, a; x)
 45571 72.7251 :        total += (a - y[i]) ^^ 2;
               :    return total;
               :}
...

operf –assemblyもしてみたがあまりわからなかった。

sub -> imul -> add -> inc の場所で60%以上喰ってるらしいが。。

000000000001a580 <_D3std9algorithm9iteration62__T12FilterResultS213knn4mainFZ9__lambda1TAxS3knn10LabelPixelZ12FilterResult8popFrontMFNaZv>: /* _D3std9algorithm9iteration62__T12FilterResultS213knn4mainFZ9__lambda1TAxS3knn10LabelPixelZ12FilterResult8popFrontMFNaZv total:  54369 86.7655 */
...
  4071  6.4968 :   1a600:   cmp    %rsi,%rbx
    21  0.0335 :   1a603:   jae    1a649 <_D3std9algorithm9iteration62__T12FilterResultS213knn4mainFZ9__lambda1TAxS3knn10LabelPixelZ12FilterResult8popFrontMFNaZv+0xc9>
  3182  5.0780 :   1a605:   mov    (%rcx,%rbx,4),%r8d
 14565 23.2438 :   1a609:   sub    (%rdx,%rbx,4),%r8d
  6849 10.9301 :   1a60d:   imul   %r8d,%r8d
 10065 16.0624 :   1a611:   add    %r8d,%ebp
  9923 15.8358 :   1a614:   inc    %rbx
  5268  8.4070 :   1a617:   cmp    %rax,%rbx
    35  0.0559 :   1a61a:   jb     1a600 <_D3std9algorithm9iteration62__T12FilterResultS213knn4mainFZ9__lambda1TAxS3knn10LabelPixelZ12FilterResult8popFrontMFNaZv+0x80>
...

そもそもなんでRustこんなはやいのか。

objdumpでみてみる。なるほど、SIMDでループをベクトル化してるのか。releaseビルドぱない。

    // run through the two vectors, summing up the squares of the differences
    x.iter()
        .zip(y.iter())
        .fold(0, |s, (&a, &b)| s + (a - b) * (a - b))
    8b80:       f3 0f 6f 0c b7          movdqu (%rdi,%rsi,4),%xmm1
    8b85:       f3 0f 6f 14 b2          movdqu (%rdx,%rsi,4),%xmm2
    8b8a:       66 0f fa ca             psubd  %xmm2,%xmm1
    8b8e:       66 0f 70 d1 f5          pshufd $0xf5,%xmm1,%xmm2
    8b93:       66 0f f4 c9             pmuludq %xmm1,%xmm1
    8b97:       66 0f 70 c9 e8          pshufd $0xe8,%xmm1,%xmm1
    8b9c:       66 0f f4 d2             pmuludq %xmm2,%xmm2
    8ba0:       66 0f 70 d2 e8          pshufd $0xe8,%xmm2,%xmm2
    8ba5:       66 0f 62 ca             punpckldq %xmm2,%xmm1
    8ba9:       66 0f fe c8             paddd  %xmm0,%xmm1
    8bad:       f3 0f 6f 44 b7 10       movdqu 0x10(%rdi,%rsi,4),%xmm0
    8bb3:       f3 0f 6f 54 b2 10       movdqu 0x10(%rdx,%rsi,4),%xmm2
    8bb9:       66 0f fa c2             psubd  %xmm2,%xmm0
    8bbd:       66 0f 70 d0 f5          pshufd $0xf5,%xmm0,%xmm2
    8bc2:       66 0f f4 c0             pmuludq %xmm0,%xmm0
    8bc6:       66 0f 70 c0 e8          pshufd $0xe8,%xmm0,%xmm0
    8bcb:       66 0f f4 d2             pmuludq %xmm2,%xmm2
    8bcf:       66 0f 70 d2 e8          pshufd $0xe8,%xmm2,%xmm2