Gitly


1 // SSE Instruction Set
2 // SSE3: Added with later Pentium 4
3 // ADDSUBPD, ADDSUBPS, HADDPD, HADDPS, HSUBPD, HSUBPS, MOVDDUP, MOVSHDUP, MOVSLDUP
4 // The HADDPS instruction performs horizontal addition of two vectors of floats using SSE3
5 // instructions.
6 
// horizontal_add_sse3 applies the SSE3 HADDPS instruction to two vectors of 4 floats.
// It reads 4 consecutive f32 values starting at `a` and 4 starting at `b`, then writes
// [a0+a1, a2+a3, b0+b1, b2+b3] to the 4 consecutive f32 slots starting at `result`.
// Each pointer must reference at least 4 valid f32 values. No particular alignment is
// required, because unaligned moves (movups) are used for every memory access.
// The function is only compiled on amd64 for non-tinyc, non-msvc builds (see attribute).
@[if amd64 && !tinyc && !msvc]
fn horizontal_add_sse3(a &f32, b &f32, result &f32) {
    unsafe {
        asm volatile amd64 {
            movups xmm0, [a] // xmm0 = a[0..4]; movups does not fault on unaligned addresses
            movups xmm1, [b] // xmm1 = b[0..4]
            haddps xmm0, xmm1 // xmm0 = [a0+a1, a2+a3, b0+b1, b2+b3]
            movups [result], xmm0 // result[0..4] = xmm0
            ; ; r (a)
              r (b)
              r (result)
            ; xmm0
              xmm1
              memory // the block reads and writes memory through the pointer operands
        }
    }
}
23 
// main runs horizontal_add_sse3 on two fixed 4-element f32 vectors, prints the output
// vector, and asserts that it equals the expected pairwise sums.
fn main() {
    lhs := [f32(1.0), 2.0, 3.0, 4.0]
    rhs := [f32(5.0), 6.0, 7.0, 8.0]
    sums := []f32{len: 4}
    horizontal_add_sse3(&lhs[0], &rhs[0], &sums[0])
    println(sums)
    // HADDPS adds adjacent pairs: both pairs of lhs first, then both pairs of rhs.
    //   lhs: 1.0 + 2.0 = 3.0,   3.0 + 4.0 = 7.0
    //   rhs: 5.0 + 6.0 = 11.0,  7.0 + 8.0 = 15.0
    expected := [f32(3.0), 7.0, 11.0, 15.0]
    assert sums == expected
}
37

1	// SSE Instruction Set
2	// SSE3: Added with later Pentium 4
3	// ADDSUBPD, ADDSUBPS, HADDPD, HADDPS, HSUBPD, HSUBPS, MOVDDUP, MOVSHDUP, MOVSLDUP
4	// The HADDPS instruction performs horizontal addition of two vectors of floats using SSE3
5	// instructions.
6
// horizontal_add_sse3 applies the SSE3 HADDPS instruction to two vectors of 4 floats.
// It reads 4 consecutive f32 values starting at `a` and 4 starting at `b`, then writes
// [a0+a1, a2+a3, b0+b1, b2+b3] to the 4 consecutive f32 slots starting at `result`.
// Each pointer must reference at least 4 valid f32 values. No particular alignment is
// required, because unaligned moves (movups) are used for every memory access.
// The function is only compiled on amd64 for non-tinyc, non-msvc builds (see attribute).
@[if amd64 && !tinyc && !msvc]
fn horizontal_add_sse3(a &f32, b &f32, result &f32) {
	unsafe {
		asm volatile amd64 {
			movups xmm0, [a] // xmm0 = a[0..4]; movups does not fault on unaligned addresses
			movups xmm1, [b] // xmm1 = b[0..4]
			haddps xmm0, xmm1 // xmm0 = [a0+a1, a2+a3, b0+b1, b2+b3]
			movups [result], xmm0 // result[0..4] = xmm0
			; ; r (a)
			  r (b)
			  r (result)
			; xmm0
			  xmm1
			  memory // the block reads and writes memory through the pointer operands
		}
	}
}
23
// main runs horizontal_add_sse3 on two fixed 4-element f32 vectors, prints the output
// vector, and asserts that it equals the expected pairwise sums.
fn main() {
	lhs := [f32(1.0), 2.0, 3.0, 4.0]
	rhs := [f32(5.0), 6.0, 7.0, 8.0]
	sums := []f32{len: 4}
	horizontal_add_sse3(&lhs[0], &rhs[0], &sums[0])
	println(sums)
	// HADDPS adds adjacent pairs: both pairs of lhs first, then both pairs of rhs.
	//   lhs: 1.0 + 2.0 = 3.0,   3.0 + 4.0 = 7.0
	//   rhs: 5.0 + 6.0 = 11.0,  7.0 + 8.0 = 15.0
	expected := [f32(3.0), 7.0, 11.0, 15.0]
	assert sums == expected
}
37