// SSE Instruction Set
// SSE3: introduced with later Pentium 4 processors.
// ADDSUBPD, ADDSUBPS, HADDPD, HADDPS, HSUBPD, HSUBPS, MOVDDUP, MOVSHDUP, MOVSLDUP
// This example uses the SSE3 HADDPS instruction, which adds adjacent pairs of
// single-precision floats taken from two 4-element vectors ("horizontal" addition).
| 6 | |
// horizontal_add_sse3 adds adjacent pairs of floats from two 4-element vectors,
// using the SSE3 HADDPS instruction.
//
// `a` and `b` must each point to 4 consecutive readable f32 values, and `result`
// must point to 4 consecutive writable f32 values. After the call:
//   result[0] = a[0] + a[1]
//   result[1] = a[2] + a[3]
//   result[2] = b[0] + b[1]
//   result[3] = b[2] + b[3]
//
// The pointers do not have to be 16-byte aligned: unaligned moves (MOVUPS) are
// used, since MOVAPS raises a fault on a misaligned address.
//
// NOTE: the `@[if ...]` attribute below makes this function conditional; when the
// condition is false (non-amd64 target, or the tinyc/msvc compilers) calls to it
// are omitted and `result` is left untouched.
@[if amd64 && !tinyc && !msvc]
fn horizontal_add_sse3(a &f32, b &f32, result &f32) {
	unsafe {
		// The `memory` clobber tells the C compiler that this block reads and writes
		// memory through the pointer operands, so it must not cache or reorder
		// accesses to `a`, `b` or `result` around it.
		asm volatile amd64 {
			movups xmm0, [a] // Load 4 floats from a into xmm0 (no alignment requirement)
			movups xmm1, [b] // Load 4 floats from b into xmm1 (no alignment requirement)
			haddps xmm0, xmm1 // xmm0 = [a0+a1, a2+a3, b0+b1, b2+b3]
			movups [result], xmm0 // Store the 4 sums to result
			; ; r (a)
			  r (b)
			  r (result)
			; xmm0
			  xmm1
			  memory
		}
	}
}
| 23 | |
// main runs horizontal_add_sse3 on two sample vectors, prints the result and,
// where the SSE3 routine is actually compiled in, checks it against the expected sums.
fn main() {
	a := [f32(1.0), 2.0, 3.0, 4.0]
	b := [f32(5.0), 6.0, 7.0, 8.0]
	result := []f32{len: 4}
	horizontal_add_sse3(&a[0], &b[0], &result[0])
	println(result)
	// horizontal_add_sse3 is declared with `@[if amd64 && !tinyc && !msvc]`, so its
	// call is omitted when that condition is false and `result` stays all zeros.
	// Only verify the sums under the same condition.
	$if amd64 && !tinyc && !msvc {
		// HADDPS sums adjacent pairs, first from a, then from b:
		// 1.0 + 2.0 = 3.0
		// 3.0 + 4.0 = 7.0
		// 5.0 + 6.0 = 11.0
		// 7.0 + 8.0 = 15.0
		assert result == [f32(3.0), 7.0, 11.0, 15.0]
	} $else {
		println('horizontal_add_sse3 is not available for this target/compiler; result was left unchanged')
	}
}
| 37 | |