以前書いたものはシングルスレッド版のRust/OCamlよりも遅かった。これをprofilingしてみる。
$ lscpu [kubo39:knn][git:master]
Architecture: x86_64
CPU 操作モード: 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 4
On-line CPU(s) list: 0-3
コアあたりのスレッド数:2
ソケットあたりのコア数:2
Socket(s): 1
NUMA ノード数: 1
ベンダー ID: GenuineIntel
CPU ファミリー: 6
モデル: 69
Model name: Intel(R) Core(TM) i5-4200U CPU @ 1.60GHz
...
$ dmd --version [kubo39:knn][git:master]
DMD64 D Compiler v2.073.1
コードを2.073.1でも動くように修正。
import std.algorithm;
import std.array;
import std.conv;
import std.range;
import std.stdio;
import std.string;
import std.typecons;
import std.parallelism;
// One CSV row: the digit label followed by the image's raw pixel values.
struct LabelPixel
{
int label;
int[] pixels;
}
// Parses a "label,pixel,pixel,..." CSV file into LabelPixel records,
// skipping the header line. Data rows are counted as a side effect (via
// tee) while streaming; returns tuple(records, row count).
auto slurpFile(string filename)
{
int count;
auto lp = File(filename)
.byLine
.dropOne // skip the CSV header row
.tee!(a => count++) // count data rows while streaming
.map!chomp
.map!(a => a.to!string.split(","))
.map!(a => LabelPixel(a[0].to!int, a[1..$].to!(int[])) )
.array;
return tuple(lp, count);
}
// Squared Euclidean distance between two pixel vectors (zip stops at the
// end of the shorter array).
// NOTE(review): despite the name, no square root is taken -- this returns
// the *squared* distance, which is sufficient for nearest-neighbour
// ordering but the name is misleading.
int distanceSqrt(const ref int[] x, const ref int[] y) pure
{
return reduce!((a, b) => a + (b[0] - b[1]) * (b[0] - b[1]))(0, x.zip(y));
}
// 1-nearest-neighbour classification: returns the label of the training
// sample whose pixels have the smallest squared distance to `pixels`.
// NOTE(review): `result` starts uninitialized (= void); this assumes
// `training` is never empty -- an empty set would return garbage.
int classify(const ref LabelPixel[] training, const ref int[] pixels) pure
{
int smallest = int.max;
int result = void;
foreach (t; training)
{
int tmp = distanceSqrt(t.pixels, pixels);
if (tmp < smallest)
{
smallest = tmp;
result = t.label;
}
}
return result;
}
// Entry point (parallel variant): loads the training and validation CSVs,
// classifies every validation sample against the training set, and prints
// the hit rate. The reduction runs on std.parallelism's taskPool, but note
// the map is still the sequential std.algorithm.map, eagerly materialized
// by .array before the parallel reduce.
void main()
{
const trainingSet = "trainingsample.csv".slurpFile;
const validationSample = "validationsample.csv".slurpFile;
// 1 if this sample is classified correctly, else 0.
int count(const LabelPixel data) pure
{
int num;
if (classify(trainingSet[0], data.pixels) == data.label)
num++;
return num;
}
immutable num = taskPool.reduce!"a + b"(
std.algorithm.map!(count)(validationSample[0]).array);
writefln("Percentage correct: %f percent",
num.to!double / validationSample[1].to!double * 100.0);
}
timeコマンドで測ってみる。
$ time ./knn [kubo39:knn][git:master]
Percentage correct: 94.400000 percent
./knn 110.16s user 0.03s system 99% cpu 1:50.33 total
うーん、遅い。
$ time ./knn [kubo39:knn][git:master]
Percentage correct: 94.400000 percent
./knn 90.50s user 0.06s system 99% cpu 1:30.59 total
少し早くなったけどぜんぜん遅い。
Rustを見直し。
Rustのシングルスレッド版を再度測ってみる。
$ rustc --version [kubo39:knn][git:master]
rustc 1.17.0-nightly (306035c21 2017-02-18)
少々手直しが必要。
use std::io::{BufRead, BufReader};
use std::fs::File;
use std::path::Path;
use std::str::FromStr;
// One CSV row: the digit label followed by the image's pixel values.
struct LabelPixel {
label: i32,
pixels: Vec<i32>
}
// Parses a "label,pixel,pixel,..." CSV file into LabelPixel records,
// skipping the header line. Panics (unwrap) on I/O or parse errors.
fn slurp_file(file: &Path) -> Vec<LabelPixel> {
BufReader::new(File::open(file).unwrap())
.lines()
.skip(1) // skip the CSV header row
.map(|line| {
let line = line.unwrap();
let mut iter = line.trim()
.split(',')
.map(|x| i32::from_str(x).unwrap());
LabelPixel {
label: iter.next().unwrap(), // first field is the label
pixels: iter.collect() // remaining fields are pixel values
}
})
.collect()
}
/// Squared Euclidean distance between two pixel slices.
/// `zip` stops at the end of the shorter slice; no square root is taken,
/// since the squared distance preserves nearest-neighbour ordering.
fn distance_sqr(x: &[i32], y: &[i32]) -> i32 {
    let mut total = 0;
    for (&a, &b) in x.iter().zip(y.iter()) {
        let diff = a - b;
        total += diff * diff;
    }
    total
}
// 1-nearest-neighbour classification: label of the training sample with
// the smallest squared distance to `pixels`.
// Panics (unwrap) if `training` is empty.
fn classify(training: &[LabelPixel], pixels: &[i32]) -> i32 {
training
.iter()
.min_by_key(|p| distance_sqr(p.pixels.as_slice(), pixels)).unwrap()
.label
}
// Entry point: loads the two CSVs, counts correctly classified validation
// samples, and prints the percentage correct.
fn main() {
let training_set = slurp_file(&Path::new("trainingsample.csv"));
let validation_sample = slurp_file(&Path::new("validationsample.csv"));
let num_correct = validation_sample.iter()
.filter(|x| {
classify(training_set.as_slice(), x.pixels.as_slice()) == x.label
})
.count();
println!("Percentage correct: {}%",
num_correct as f64 / validation_sample.len() as f64 * 100.0);
}
releaseビルド。めちゃくちゃ早い。
$ cargo build --release [kubo39:knnrs][git:master]
Compiling knnrs v0.1.0 (file:///home/kubo39/dev/dlang/knn/knnrs)
Finished release [optimized] target(s) in 2.6 secs
$ time ./target/release/knnrs [kubo39:knnrs][git:master]
Percentage correct: 94.39999999999999%
./target/release/knnrs 1.21s user 0.01s system 99% cpu 1.217 total
計測しやすいようにシングルスレッド版からはじめる。
import std.algorithm;
import std.array;
import std.conv;
import std.range;
import std.stdio;
import std.string;
import std.typecons;
// One CSV row: the digit label followed by the image's raw pixel values.
struct LabelPixel
{
int label;
int[] pixels;
}
// Parses a "label,pixel,pixel,..." CSV file into LabelPixel records,
// skipping the header line. Data rows are counted as a side effect (via
// tee) while streaming; returns tuple(records, row count).
auto slurpFile(string filename)
{
int count;
auto lp = File(filename)
.byLine
.dropOne // skip the CSV header row
.tee!(a => count++) // count data rows while streaming
.map!chomp
.map!(a => a.to!string.split(","))
.map!(a => LabelPixel(a[0].to!int, a[1..$].to!(int[])) )
.array;
return tuple(lp, count);
}
// Squared Euclidean distance between two pixel vectors (zip stops at the
// end of the shorter array).
// NOTE(review): despite the name, no square root is taken -- this returns
// the *squared* distance, sufficient for nearest-neighbour ordering.
int distanceSqrt(const ref int[] x, const ref int[] y) pure
{
return reduce!((a, b) => a + (b[0] - b[1]) * (b[0] - b[1]))(0, x.zip(y));
}
// 1-nearest-neighbour classification: returns the label of the training
// sample whose pixels have the smallest squared distance to `pixels`.
// NOTE(review): `result` starts uninitialized (= void); this assumes
// `training` is never empty -- an empty set would return garbage.
int classify(const ref LabelPixel[] training, const ref int[] pixels) pure
{
int smallest = int.max;
int result = void;
foreach (t; training)
{
int tmp = distanceSqrt(t.pixels, pixels);
if (tmp < smallest)
{
smallest = tmp;
result = t.label;
}
}
return result;
}
// Entry point (single-threaded variant): identical to the parallel version
// except the final summation uses the sequential std.algorithm.reduce
// instead of taskPool.reduce.
void main()
{
const trainingSet = "trainingsample.csv".slurpFile;
const validationSample = "validationsample.csv".slurpFile;
// 1 if this sample is classified correctly, else 0.
int count(const LabelPixel data) pure
{
int num;
if (classify(trainingSet[0], data.pixels) == data.label)
num++;
return num;
}
immutable num = reduce!"a + b"(
std.algorithm.map!(count)(validationSample[0]).array);
writefln("Percentage correct: %f percent",
num.to!double / validationSample[1].to!double * 100.0);
}
シングルスレッド版のほうが早い。といってもあまり変わらないが。
$ dmd -O knn.d [kubo39:knn][git:master]
$ time ./knn [kubo39:knn][git:master]
Percentage correct: 94.400000 percent
./knn 88.90s user 0.02s system 99% cpu 1:28.94 total
dmdだとだめそう。ldc2は理論値性能が出る?
http://leonardo-m.livejournal.com/111598.html
$ ldc2 -version | head -2
LDC - the LLVM D compiler (1.0.0):
based on DMD v2.071.2 and LLVM 3.8.1
最適化つきで計測。DMDよりぜんぜん速いけどRustより遅い。
$ ldc2 -O knn.d
$ time ./knn
Percentage correct: 94.400000 percent
./knn 10.76s user 0.02s system 99% cpu 10.794 total
operf
operfはdwarf情報を使うので-gつけてビルド。
$ ldc2 -O -g knn.d
$ sudo operf ./knn
operf: Profiler started
Percentage correct: 94.400000 percent
Profiling done.
opannotate --source
をみると、distanceSqrt内のreduceの行だけで全サンプルの約15%(関数全体では約55%)を占めていることがわかる。
$ opannotate --source
[ ... ]
/*
* Total samples for file : "/home/kubo39/dev/dlang/knn/knn.d"
*
* 45735 15.2286
*/
:import std.algorithm;
:import std.array;
:import std.conv;
:import std.range;
:import std.stdio;
:import std.string;
:import std.typecons;
:
:struct LabelPixel
:{
: int label;
: int[] pixels;
:}
:
:auto slurpFile(string filename)
:{
: int count;
:
: auto lp = File(filename)
: .byLine
: .dropOne
: .tee!(a => count++)
: .map!chomp
: .map!(a => a.to!string.split(","))
: .map!(a => LabelPixel(a[0].to!int, a[1..$].to!(int[])) )
: .array;
: return tuple(lp, count);
:}
:
:int distanceSqrt(const ref int[] x, const ref int[] y) pure
:{
45028 14.9931 : return reduce!((a, b) => a + (b[0] - b[1]) * (b[0] - b[1]))(0, x.zip(y)); /* _D3knn12distanceSqrtFNaKxAiKxAiZi total: 165424 55.0818 */
:}
:
:int classify(const ref LabelPixel[] training, const ref int[] pixels) pure
:{
: int smallest = int.max;
: int result = void;
:
523 0.1741 : foreach (t; training)
: {
33 0.0110 : int tmp = distanceSqrt(t.pixels, pixels);
50 0.0166 : if (tmp < smallest)
: {
: smallest = tmp;
: result = t.label;
: }
: }
: return result;
:}
:
:void main()
:{
: const trainingSet = "trainingsample.csv".slurpFile; /* _Dmain total: 711 0.2367 */
: const validationSample = "validationsample.csv".slurpFile;
:
: int count(const LabelPixel data) pure
: {
: int num;
101 0.0336 : if (classify(trainingSet[0], data.pixels) == data.label)
: num++;
: return num;
: }
:
: immutable num = reduce!"a + b"(
: std.algorithm.map!(count)(validationSample[0]).array);
:
: writefln("Percentage correct: %f percent",
: num.to!double / validationSample[1].to!double * 100.0);
:}
distanceSqrtの実装を変えてみる。
...
// Rewritten squared-distance: a plain index loop instead of reduce over
// zip, which lets the compiler generate much tighter code.
// NOTE(review): indexes y[i] directly, so this assumes y.length >= x.length
// (a shorter y would raise a RangeError, unlike the zip-based version,
// which silently stopped at the shorter array).
int distanceSqrt(const ref int[] x, const ref int[] y) pure
{
int total;
foreach (i, a; x)
total += (a - y[i]) ^^ 2;
return total;
}
...
4倍以上早くなった。
$ ldc2 -O knn.d
$ time ./knn
Percentage correct: 94.400000 percent
./knn 2.45s user 0.01s system 99% cpu 2.463 total
その他シングルスレッド版では無駄な処理を省いてみる。
import std.algorithm;
import std.array;
import std.conv;
import std.range;
import std.stdio;
import std.string;
import std.typecons;
// One CSV row: the digit label followed by the image's raw pixel values.
struct LabelPixel
{
int label;
int[] pixels;
}
// Parses a "label,pixel,pixel,..." CSV file into LabelPixel records,
// skipping the header line. Unlike the earlier variant this one does not
// track a separate row count (callers use .length on the result), so the
// leftover unused `int count;` local is gone.
auto slurpFile(string filename)
{
return File(filename)
.byLine
.dropOne // skip the CSV header row
.map!chomp
.map!(a => a.to!string.split(","))
.map!(a => LabelPixel(a[0].to!int, a[1..$].to!(int[])) )
.array;
}
// Squared Euclidean distance as a plain index loop (faster codegen than
// the reduce-over-zip version).
// NOTE(review): indexes y[i] directly, so this assumes y.length >= x.length
// (a shorter y would raise a RangeError, unlike the zip-based version).
int distanceSqrt(const ref int[] x, const ref int[] y) pure
{
int total;
foreach (i, a; x)
total += (a - y[i]) ^^ 2;
return total;
}
// 1-nearest-neighbour classification: returns the label of the training
// sample whose pixels have the smallest squared distance to `pixels`.
// NOTE(review): `result` starts uninitialized (= void); this assumes
// `training` is never empty -- an empty set would return garbage.
int classify(const ref LabelPixel[] training, const ref int[] pixels) pure
{
int smallest = int.max;
int result = void;
foreach (t; training)
{
int tmp = distanceSqrt(t.pixels, pixels);
if (tmp < smallest)
{
smallest = tmp;
result = t.label;
}
}
return result;
}
// Entry point (streamlined single-threaded variant): counts correct
// classifications with filter/count, and uses .length for the sample size
// instead of the separately tracked row count of the earlier versions.
void main()
{
const trainingSet = "trainingsample.csv".slurpFile;
const validationSample = "validationsample.csv".slurpFile;
immutable num = validationSample
.filter!(a => classify(trainingSet, a.pixels) == a.label)
.count;
writefln("Percentage correct: %f percent",
num.to!double / validationSample.length.to!double * 100.0);
}
うーむ、誤差程度。というか少し遅くなってる。
$ ldc2 -O knn.d
$ time ./knn
Percentage correct: 94.400000 percent
./knn 2.52s user 0.00s system 99% cpu 2.528 total
もう一度operfをかけてみる。72%が二乗の処理らしい。
...
:int distanceSqrt(const ref int[] x, const ref int[] y) pure
:{
: int total;
8513 13.5856 : foreach (i, a; x)
45571 72.7251 : total += (a - y[i]) ^^ 2;
: return total;
:}
...
operf --assemblyもしてみたがあまりわからなかった。
sub -> imul -> add -> inc の場所で60%以上喰ってるらしいが。。
000000000001a580 <_D3std9algorithm9iteration62__T12FilterResultS213knn4mainFZ9__lambda1TAxS3knn10LabelPixelZ12FilterResult8popFrontMFNaZv>: /* _D3std9algorithm9iteration62__T12FilterResultS213knn4mainFZ9__lambda1TAxS3knn10LabelPixelZ12FilterResult8popFrontMFNaZv total: 54369 86.7655 */
...
4071 6.4968 : 1a600: cmp %rsi,%rbx
21 0.0335 : 1a603: jae 1a649 <_D3std9algorithm9iteration62__T12FilterResultS213knn4mainFZ9__lambda1TAxS3knn10LabelPixelZ12FilterResult8popFrontMFNaZv+0xc9>
3182 5.0780 : 1a605: mov (%rcx,%rbx,4),%r8d
14565 23.2438 : 1a609: sub (%rdx,%rbx,4),%r8d
6849 10.9301 : 1a60d: imul %r8d,%r8d
10065 16.0624 : 1a611: add %r8d,%ebp
9923 15.8358 : 1a614: inc %rbx
5268 8.4070 : 1a617: cmp %rax,%rbx
35 0.0559 : 1a61a: jb 1a600 <_D3std9algorithm9iteration62__T12FilterResultS213knn4mainFZ9__lambda1TAxS3knn10LabelPixelZ12FilterResult8popFrontMFNaZv+0x80>
...
そもそもなんでRustこんなはやいのか。
objdumpでみてみる。なるほど、SIMDでループをベクトル化してるのか。releaseビルドぱない。
// run through the two vectors, summing up the squares of the differences
x.iter()
.zip(y.iter())
.fold(0, |s, (&a, &b)| s + (a - b) * (a - b))
8b80: f3 0f 6f 0c b7 movdqu (%rdi,%rsi,4),%xmm1
8b85: f3 0f 6f 14 b2 movdqu (%rdx,%rsi,4),%xmm2
8b8a: 66 0f fa ca psubd %xmm2,%xmm1
8b8e: 66 0f 70 d1 f5 pshufd $0xf5,%xmm1,%xmm2
8b93: 66 0f f4 c9 pmuludq %xmm1,%xmm1
8b97: 66 0f 70 c9 e8 pshufd $0xe8,%xmm1,%xmm1
8b9c: 66 0f f4 d2 pmuludq %xmm2,%xmm2
8ba0: 66 0f 70 d2 e8 pshufd $0xe8,%xmm2,%xmm2
8ba5: 66 0f 62 ca punpckldq %xmm2,%xmm1
8ba9: 66 0f fe c8 paddd %xmm0,%xmm1
8bad: f3 0f 6f 44 b7 10 movdqu 0x10(%rdi,%rsi,4),%xmm0
8bb3: f3 0f 6f 54 b2 10 movdqu 0x10(%rdx,%rsi,4),%xmm2
8bb9: 66 0f fa c2 psubd %xmm2,%xmm0
8bbd: 66 0f 70 d0 f5 pshufd $0xf5,%xmm0,%xmm2
8bc2: 66 0f f4 c0 pmuludq %xmm0,%xmm0
8bc6: 66 0f 70 c0 e8 pshufd $0xe8,%xmm0,%xmm0
8bcb: 66 0f f4 d2 pmuludq %xmm2,%xmm2
8bcf: 66 0f 70 d2 e8 pshufd $0xe8,%xmm2,%xmm2