v2 / examples / cpu_features / SSE_and_MMX_Extensions / sse3.v
36 lines · 34 sloc · 1.06 KB · 05377f3c0378ce0285bc8584106bf78865c551a6
Raw
// SSE Instruction Set
// SSE3: introduced with later Pentium 4 processors
// ADDSUBPD, ADDSUBPS, HADDPD, HADDPS, HSUBPD, HSUBPS, MOVDDUP, MOVSHDUP, MOVSLDUP
// The HADDPS instruction performs a horizontal addition on two vectors of floats:
// adjacent pairs of elements within each vector are summed.
6
// horizontal_add_sse3 sums adjacent pairs of floats with the SSE3 HADDPS instruction.
// It reads 4 consecutive f32 values starting at `a`, 4 starting at `b`, and writes
// 4 f32 values starting at `result`:
//   result = [a0 + a1, a2 + a3, b0 + b1, b2 + b3]
// Unaligned moves (MOVUPS) are used, so the pointers do not have to be 16-byte aligned.
// The `memory` clobber tells the C compiler that the block reads and writes memory
// through its pointer operands, so it must not reorder or cache accesses around it.
@[if amd64 && !tinyc && !msvc]
fn horizontal_add_sse3(a &f32, b &f32, result &f32) {
	unsafe {
		asm volatile amd64 {
			movups xmm0, [a] // Load 4 floats starting at a into xmm0
			movups xmm1, [b] // Load 4 floats starting at b into xmm1
			haddps xmm0, xmm1 // xmm0 = [a0+a1, a2+a3, b0+b1, b2+b3]
			movups [result], xmm0 // Store the 4 sums starting at result
			; ; r (a)
			  r (b)
			  r (result)
			; xmm0
			  xmm1
			  memory
		}
	}
}
23
// main runs the HADDPS demo on two 4-element f32 arrays, prints the result, and
// verifies it on targets where the SSE3 routine is compiled in.
fn main() {
	a := [f32(1.0), 2.0, 3.0, 4.0]
	b := [f32(5.0), 6.0, 7.0, 8.0]
	result := []f32{len: 4}
	horizontal_add_sse3(&a[0], &b[0], &result[0])
	println(result)
	// horizontal_add_sse3 is declared with `@[if amd64 && !tinyc && !msvc]`; when that
	// condition is false the routine is not built and `result` keeps its zero values,
	// so the check below is only meaningful under the same condition.
	$if amd64 && !tinyc && !msvc {
		// The result should be [3.0, 7.0, 11.0, 15.0] due to horizontal addition.
		// 1.0 + 2.0 = 3.0
		// 3.0 + 4.0 = 7.0
		// 5.0 + 6.0 = 11.0
		// 7.0 + 8.0 = 15.0
		assert result == [f32(3.0), 7.0, 11.0, 15.0]
	} $else {
		eprintln('SSE3 inline assembly is not available for this target/compiler; result was not computed.')
	}
}
37