r/asm

Hi there, I'm learning the Intel x64 ISA by doing some Project Euler problems. The first problem is to compute the sum of all the positive integers less than 1000 that are divisible by 3 or 5. I know that there is a closed-form expression for this problem that can be computed without loops or tests. My goal isn't to improve my solution to the problem, but to optimize the solution that I have, using what I learn about x64 optimizations. The code in file p1.s is below.

	bits 64			; Enable 64-bit instructions.
	default rel		; Declare that the program can be dynamically relocated.
	global main		; The entry point main must be exported.
	extern printf		; We must import the symbols of libc that we need.
	section .data

	CLOCK_MONOTONIC_RAW equ 4
	CLOCK_REALTIME equ 0
fmt:
	db "%d", 9, "%lu", 10, 0

	section .text

main:
	push rbp
	mov rbp, rsp
	sub rsp, 32 		; Allocate space for two timeval_t structures

	mov rax, 228                ; Call the clock_gettime() syscall
	mov rdi, CLOCK_MONOTONIC_RAW     ; Argument 1: Clock ID (0)
	lea rsi, [rbp-16]
	syscall

	xor rsi, rsi		; The sum starts at zero. ESI is also the second parameter of printf().
	mov ecx, 999		; The countdown starts at 999.

.L1:
	xor edx, edx		; Set the dividend EDX:EAX to the current count.
	mov eax, ecx
	mov ebx, 3		; Is the count divisible by 3?
	div ebx
	cmp edx, 0
	je .L2			; Add it if so.

	xor edx, edx		; Set the dividend EDX:EAX to the current count.
	mov eax, ecx
	mov ebx, 5		; Is the count divisible by 5?
	div ebx
	cmp edx, 0
	jne .L3			; Add it if so.

.L2:
	add esi, ecx

.L3:
	loop .L1		; Decrement the count and loop until the count is zero.

	push rsi
	mov rax, 228                ; Call the clock_gettime() syscall
	mov rdi, CLOCK_MONOTONIC_RAW     ; Argument 1: Clock ID (0)
	lea rsi, [rbp-32]                ; Argument 2: Pointer to the timespec struct on stack
	syscall
	pop rsi

	mov rdx, qword [rbp-24]
	sub rdx, qword [rbp-8]

	lea rdi, [fmt]		; Printf's first parameter is the format string. ESI holds the second parameter.
	xor rax, rax		; In the x64 ABI, since printf() is a variadic function, we must zero out EAX before calling.
	call printf wrt ..plt	; We must also call with-regards-to the PLT, which accounts for the fact that printf is dynamically loaded.

	add rsp, 32
	pop rbp
	
	xor rax, rax
	ret

I compiled this way:

nasm -f elf64 -g -o p1.o p1.s
cc -o p1 p1.o -ansi -pedantic -Wall -g

I then ran the program and cachegrind and saw this:

==132149== Cachegrind, a high-precision tracing profiler
==132149== Copyright (C) 2002-2024, and GNU GPL'd, by Nicholas Nethercote et al.
==132149== Using Valgrind-3.25.1 and LibVEX; rerun with -h for copyright info
==132149== Command: ./p1
==132149== 
--132149-- warning: L3 cache found, using its data for the LL simulation.
233168	418070
==132149== 
==132149== I refs:        133,262
==132149== I1  misses:      1,275
==132149== LLi misses:      1,253
==132149== I1  miss rate:    0.96%
==132149== LLi miss rate:    0.94%
==132149== 
==132149== D refs:         40,123  (28,356 rd   + 11,767 wr)
==132149== D1  misses:      1,591  ( 1,220 rd   +    371 wr)
==132149== LLd misses:      1,353  ( 1,011 rd   +    342 wr)
==132149== D1  miss rate:     4.0% (   4.3%     +    3.2%  )
==132149== LLd miss rate:     3.4% (   3.6%     +    2.9%  )
==132149== 
==132149== LL refs:         2,866  ( 2,495 rd   +    371 wr)
==132149== LL misses:       2,606  ( 2,264 rd   +    342 wr)
==132149== LL miss rate:      1.5% (   1.4%     +    2.9%  )

For such a small program, I was surprised that there are any cache misses. I tried applying align 16 to align the starts of loops, but it yielded no decrease in cache misses; it only increased the number of instructions.

Can you recommend any ways to optimize the code here?

Help me optimize a simple x64 program

Finally - "Boing!" in 64 bytes

Coroutines, part I: Co-operators and Promises

Analyzing Bytes: Pre-Disassembly Static Binary Analysis