From eaf5645f37ead6faba8eb7882b60f29945621374 Mon Sep 17 00:00:00 2001
From: aaaaaa aaaaaaa <a@bc.de>
Date: Tue, 9 Jan 2018 19:05:49 +0100
Subject: [PATCH] testcase loop & tail rec

---
 README.md | 157 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 110 insertions(+), 47 deletions(-)

diff --git a/README.md b/README.md
index 515c8e1..f8cfd5c 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@ engines (on windows) are. I'll try to write various functions, that are hard to
 patch and then see how each hooking engine does.
 
 I'll test:
+
 * [EasyHook](https://easyhook.github.io/)
 * [PolyHook](https://github.com/stevemk14ebr/PolyHook)
 * [MinHook](https://www.codeproject.com/Articles/44326/MinHook-The-Minimalistic-x-x-API-Hooking-Libra)
@@ -25,6 +26,7 @@ needs for each hook. This is just about the challenges the function to be
 hooked itself poses.
 
 Namely:
+
 * Are jumps relocated?
 * What about RIP adressing?
 * If there's a loop at the beginning / if it's a tail recurisve function, does
@@ -79,42 +81,44 @@ Test case: Small
 ================
 This is just a very small function; it is smaller than the hook code will be -
 so how does the library react?
-```ASM
-_small:
-	xor eax, eax
-	ret
-```
+
+
+	_small:
+		xor eax, eax
+		ret
+
 
 Test case: Branch
 =================
 Instead of the FASM code I'll show the disassembled version, so you can see the
 instruction lengths & offsets.
-```ASM
-0026 | 48 83 E0 01 | and rax,1
-002A | 74 17       | je test_cases.0043 ----+
-002C | 48 31 C0    | xor rax,rax            |
-002F | 90          | nop                    |
-0030 | 90          | nop                    |
-0031 | 90          | nop                    |
-0032 | 90          | nop                    |
-0033 | 90          | nop                    |
-0034 | 90          | nop                    |
-0035 | 90          | nop                    |
-0036 | 90          | nop                    |
-0037 | 90          | nop                    |
-0038 | 90          | nop                    |
-0039 | 90          | nop                    |
-003A | 90          | nop                    |
-003B | 90          | nop                    |
-003C | 90          | nop                    |
-003D | 90          | nop                    |
-003E | 90          | nop                    |
-003F | 90          | nop                    |
-0040 | 90          | nop                    |
-0041 | 90          | nop                    |
-0042 | 90          | nop                    |
-0043 | C3          | ret  <-----------------+
-```
+
+
+	0026 | 48 83 E0 01 | and rax,1
+	002A | 74 17       | je test_cases.0043 ----+
+	002C | 48 31 C0    | xor rax,rax            |
+	002F | 90          | nop                    |
+	0030 | 90          | nop                    |
+	0031 | 90          | nop                    |
+	0032 | 90          | nop                    |
+	0033 | 90          | nop                    |
+	0034 | 90          | nop                    |
+	0035 | 90          | nop                    |
+	0036 | 90          | nop                    |
+	0037 | 90          | nop                    |
+	0038 | 90          | nop                    |
+	0039 | 90          | nop                    |
+	003A | 90          | nop                    |
+	003B | 90          | nop                    |
+	003C | 90          | nop                    |
+	003D | 90          | nop                    |
+	003E | 90          | nop                    |
+	003F | 90          | nop                    |
+	0040 | 90          | nop                    |
+	0041 | 90          | nop                    |
+	0042 | 90          | nop                    |
+	0043 | C3          | ret  <-----------------+
+
 
 This function has a branch in the first 5 bytes. Hooking it detour-style isn't
 possible without fixing that branch in the trampoline. The NOP sled is just so
@@ -132,27 +136,29 @@ relocation table.
 
 A quick and dirty[1] test for this is re-implementing the well known C rand
 function.
-```ASM
-public _rip_relative
-_rip_relative:
-	mov rax, qword[seed]
-	mov ecx, 214013
-	mul ecx
-	add eax, 2531011
-	mov [seed], eax
-
-	shr eax, 16
-	and eax, 0x7FFF
-	ret
-
-seed dd 1
-```
+
+
+	public _rip_relative
+	_rip_relative:
+		mov rax, qword[seed]
+		mov ecx, 214013
+		mul ecx
+		add eax, 2531011
+		mov [seed], eax
+
+		shr eax, 16
+		and eax, 0x7FFF
+		ret
+
+	seed dd 1
+
 
 The very first instruction uses rip relative addressing, thus it needs to be
 fixed in the trampoline.
 
 Test case: AVX & RDRAND
 =======================
+
 The AMD64 instruction set is extended with every CPU generation. Becayse the
 hooking engines need to know the instruction lengths and their side effects to
 properly apply their hooks, they need to keep up.
@@ -161,8 +167,62 @@ The actual code in the test case is boring and doesn't matter. I'm sure there
 are disagreements on whether I've picked good candidates of "exotic" or new
 instructions, but those were the first that came to mind.
 
+Test case: loop and TailRec
+===========================
+
+My hypothesis before starting this evaluation was that those two cases would
+make most hooking engines fail. Back in the good ol' days of x86, detour hooking
+didn't require any special thought because the prologue was exactly as big as
+the hook itself -- 5 bytes for `PUSH EBP; MOV EBP, ESP` and 5 bytes for `JMP +-
+2GB`[2]. That isn't so easy for AMD64: a) the hook sometimes needs to be *way*
+bigger and b) due to changes in the calling convention and the general
+architecture of AMD64 there just isn't a common prologue, used for almost all
+functions, anymore.
+
+Those by themselves aren't a problem, since the hooking engines can fix all the
+instructions they would overwrite. However, I hypothesized that only a few would
+check whether the function contained a loop that jumps back into the
+instructions that have been overwritten. Consider this:
+
+	public _loop
+	_loop:
+		mov rax, rcx
+	@loop_loop:
+		mul rcx
+		nop
+		nop
+		nop
+		loop @loop_loop ; lol
+		ret
+
+There are only 3 bytes that can be safely overwritten. Right after that is the
+destination of the jump backwards. This is a very simple (and kinda pointless)
+function, so detecting that the loop might lead to problems shouldn't be
+difficult. Basically the same applies to the next example:
+
+	public _tail_recursion
+	_tail_recursion:
+		test ecx, ecx
+		je @is_0
+		mov eax, ecx
+		dec ecx
+	@loop:
+		test ecx, ecx
+		jz @tr_end
+
+		mul ecx
+		dec ecx
+
+		jnz @loop
+		jmp @tr_end
+	@is_0:
+		mov eax, 1
+	@tr_end:
+		ret
+
 (Preliminary) Results
 =====================
+
 +----------+-----+------+------------+---+------+----+-------+
 |      Name|Small|Branch|RIP Relative|AVX|RDRAND|Loop|TailRec|
 +----------+-----+------+------------+---+------+----+-------+
@@ -178,4 +238,7 @@ I load the seed DWORD as a QWORD -- which only works because the upper half is
 then thrown away by the multiplication. It's shitty code is what I'm saying.
 
 In retrospect I should have used a jump table like a switch-case could be
-compiled into. That would be read only data. Oh well.
\ No newline at end of file
+compiled into. That would be read only data. Oh well.
+
+[2] And Microsoft decided at some point to make it even easier for their code
+with the advent of hotpatching.
\ No newline at end of file