bpf: sk_msg program helper bpf_sk_msg_pull_data

Currently, if a bpf sk msg program is run the program can only parse data that the (start,end) pointers already consumed. For sendmsg hooks this is likely the first scatterlist element. For sendpage this will be the range (0,0) because the data is shared with userspace and by default we want to avoid allowing userspace to modify data while (or after) BPF verdict is being decided. To support pulling in additional bytes for parsing use a new helper bpf_sk_msg_pull(start, end, flags) which works similar to cls tc logic. This helper will attempt to point the data start pointer at 'start' bytes offest into msg and data end pointer at 'end' bytes offset into message. After basic sanity checks to ensure 'start' <= 'end' and 'end' <= msg_length there are a few cases we need to handle. First the sendmsg hook has already copied the data from userspace and has exclusive access to it. Therefor, it is not necessesary to copy the data. However, it may be required. After finding the scatterlist element with 'start' offset byte in it there are two cases. One the range (start,end) is entirely contained in the sg element and is already linear. All that is needed is to update the data pointers, no allocate/copy is needed. The other case is (start, end) crosses sg element boundaries. In this case we allocate a block of size 'end - start' and copy the data to linearize it. Next sendpage hook has not copied any data in initial state so that data pointers are (0,0). In this case we handle it similar to the above sendmsg case except the allocation/copy must always happen. Then when sending the data we have possibly three memory regions that need to be sent, (0, start - 1), (start, end), and (end + 1, msg_length). This is required to ensure any writes by the BPF program are correctly transmitted. Lastly this operation will invalidate any previous data checks so BPF programs will have to revalidate pointers after making this BPF call. Signed-off-by: John Fastabend <[email protected]> Acked-by: David S. Miller <[email protected]> Signed-off-by: Daniel Borkmann <[email protected]>
torvalds · Mar 19, 2018 · 015632b · 015632b
1 parent 91843d5
commit 015632b
Show file tree

Hide file tree

Showing 2 changed files with 136 additions and 2 deletions.
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
@@ -793,7 +793,8 @@ union bpf_attr {
 	FN(sock_ops_cb_flags_set),	\
 	FN(msg_redirect_map),		\
 	FN(msg_apply_bytes),		\
-	FN(msg_cork_bytes),
+	FN(msg_cork_bytes),		\
+	FN(msg_pull_data),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call

diff --git a/net/core/filter.c b/net/core/filter.c
@@ -1956,6 +1956,136 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_msg_pull_data,
+	   struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
+{
+	unsigned int len = 0, offset = 0, copy = 0;
+	struct scatterlist *sg = msg->sg_data;
+	int first_sg, last_sg, i, shift;
+	unsigned char *p, *to, *from;
+	int bytes = end - start;
+	struct page *page;
+
+	if (unlikely(flags || end <= start))
+		return -EINVAL;
+
+	/* First find the starting scatterlist element */
+	i = msg->sg_start;
+	do {
+		len = sg[i].length;
+		offset += len;
+		if (start < offset + len)
+			break;
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (i != msg->sg_end);
+
+	if (unlikely(start >= offset + len))
+		return -EINVAL;
+
+	if (!msg->sg_copy[i] && bytes <= len)
+		goto out;
+
+	first_sg = i;
+
+	/* At this point we need to linearize multiple scatterlist
+	 * elements or a single shared page. Either way we need to
+	 * copy into a linear buffer exclusively owned by BPF. Then
+	 * place the buffer in the scatterlist and fixup the original
+	 * entries by removing the entries now in the linear buffer
+	 * and shifting the remaining entries. For now we do not try
+	 * to copy partial entries to avoid complexity of running out
+	 * of sg_entry slots. The downside is reading a single byte
+	 * will copy the entire sg entry.
+	 */
+	do {
+		copy += sg[i].length;
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+		if (bytes < copy)
+			break;
+	} while (i != msg->sg_end);
+	last_sg = i;
+
+	if (unlikely(copy < end - start))
+		return -EINVAL;
+
+	page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy));
+	if (unlikely(!page))
+		return -ENOMEM;
+	p = page_address(page);
+	offset = 0;
+
+	i = first_sg;
+	do {
+		from = sg_virt(&sg[i]);
+		len = sg[i].length;
+		to = p + offset;
+
+		memcpy(to, from, len);
+		offset += len;
+		sg[i].length = 0;
+		put_page(sg_page(&sg[i]));
+
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (i != last_sg);
+
+	sg[first_sg].length = copy;
+	sg_set_page(&sg[first_sg], page, copy, 0);
+
+	/* To repair sg ring we need to shift entries. If we only
+	 * had a single entry though we can just replace it and
+	 * be done. Otherwise walk the ring and shift the entries.
+	 */
+	shift = last_sg - first_sg - 1;
+	if (!shift)
+		goto out;
+
+	i = first_sg + 1;
+	do {
+		int move_from;
+
+		if (i + shift >= MAX_SKB_FRAGS)
+			move_from = i + shift - MAX_SKB_FRAGS;
+		else
+			move_from = i + shift;
+
+		if (move_from == msg->sg_end)
+			break;
+
+		sg[i] = sg[move_from];
+		sg[move_from].length = 0;
+		sg[move_from].page_link = 0;
+		sg[move_from].offset = 0;
+
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (1);
+	msg->sg_end -= shift;
+	if (msg->sg_end < 0)
+		msg->sg_end += MAX_SKB_FRAGS;
+out:
+	msg->data = sg_virt(&sg[i]) + start - offset;
+	msg->data_end = msg->data + bytes;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_pull_data_proto = {
+	.func		= bpf_msg_pull_data,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -2897,7 +3027,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_l3_csum_replace ||
 	    func == bpf_l4_csum_replace ||
 	    func == bpf_xdp_adjust_head ||
-	    func == bpf_xdp_adjust_meta)
+	    func == bpf_xdp_adjust_meta ||
+	    func == bpf_msg_pull_data)
 		return true;
 
 	return false;
@@ -3666,6 +3797,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
 		return &bpf_msg_apply_bytes_proto;
 	case BPF_FUNC_msg_cork_bytes:
 		return &bpf_msg_cork_bytes_proto;
+	case BPF_FUNC_msg_pull_data:
+		return &bpf_msg_pull_data_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}