eBPF CVE 分析

Hongyi Lu (jwnhy) May 16, 2022

Preliminary

eBPF 寄存器追踪

在 eBPF 中，为了防止指针越界，所有寄存器都具有下面的变量用于追踪寄存器的值。

/* include/linux/bpf_verifier.h */
struct bpf_reg_state {
  /* ... */
  s64 smin_value; /* minimum possible (s64)value */
  /* ... */
  u32 u32_max_value; /* maximum possible (u32)value */
  /* ... */
}

接着在 adjust_ptr_min_max_vals() 中，这些值会被更新以确保没有越界的访问。其中很多类型的寄存器都是被禁止进行数值计算的，例如下面这些。

/* 随内核版本不同而不同，下面为 v5.10 */
case PTR_TO_PACKET_END:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
  verbose(env, "R%d pointer arithmetic on %s prohibited\n",
    dst, reg_type_str[ptr_reg->type]);
  return -EACCES;

CVE-2022-23222 任意指针读写

在上面的例子中，可以看到被禁止进行算术运算的类型中并不包含 PTR_TO_MEM_OR_NULL，即 verifier 允许对这种可能为 NULL 的指针做算术运算（为什么？）这也引入了 CVE-2022-23222。

执行步骤

将 0xffff...ffff 传入 bpf_ringbuf_reserve()，该函数会在失败时返回 NULL，使 r0 = NULL。

将 r0 复制到 r1，将 r1 加一后，对 r0 进行 NULL 检查，这时，verifier 会认为 r0 与 r1 均为 0。在进行 NULL 检查时使用的指令为 BPF_JEQ，其会调用 reg_set_min_max() 对寄存器进行标记。

 case BPF_JEQ:
 case BPF_JNE:
  {
   struct bpf_reg_state *reg =
     opcode == BPF_JEQ ? true_reg : false_reg;
   /* JEQ/JNE comparison doesn't change the register equivalence.
    * r1 = r2;
    * if (r1 == 42) goto label;
    * ...
    * label: // here both r1 and r2 are known to be 42.
    *
    * Hence when marking register as known preserve it's ID.
    */
   if (is_jmp32)
     __mark_reg32_known(reg, val32);
   else
     __mark_reg_known(reg, val);
   break;
  }

此时我们拥有一个 verifier 认为是 0 而实际值为 1 的寄存器，利用它通过 ptr = ptr + reg * offset 即可实现任意指针读写。
CVE 中还提到对栈空间的保护，因此需要利用 bpf_skb_load_bytes_*() 绕开这类保护，开许多进程同时读写，获取 task_struct 后将 uid 覆盖为 0。

CVE-2021-3490 也是类似的问题。

CVE-2022-0500

没有太多信息，只知道与 BPF_BTF_LOAD 有关。

CVE-2022-0433 内核崩溃

类型为 bloom_map 的对象不支持 get_next_key 操作，内核中省略了对应的回调实现（map_get_next_key 为空指针），对该 map 执行 get_next_key 时会调用空函数指针，导致空指针崩溃。修复补丁如下，补上了一个直接返回 -EOPNOTSUPP 的实现。

diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index 277a05e9c984..fa34dc871995 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -82,6 +82,11 @@ static int bloom_map_delete_elem(struct bpf_map *map, void *value)
   return -EOPNOTSUPP;
 }
 
+static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+  return -EOPNOTSUPP;
+}
+
 static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
 {
   u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits;
@@ -192,6 +197,7 @@ const struct bpf_map_ops bloom_filter_map_ops = {
   .map_meta_equal = bpf_map_meta_equal,
   .map_alloc = bloom_map_alloc,
   .map_free = bloom_map_free,
+  .map_get_next_key = bloom_map_get_next_key,
   .map_push_elem = bloom_map_push_elem,
   .map_peek_elem = bloom_map_peek_elem,
   .map_pop_elem = bloom_map_pop_elem,
-- 
2.27.0

CVE-2021-45940 CVE-2021-45941

两个发生在 __bpf_object__open() 中的堆溢出错误，应当为用户态库 libbpf（bpftool 依赖该库）中的代码问题。

CVE-2021-45402 `check_alu_op()` 范围推测异常

当执行的 eBPF 指令为 32 位 BPF_MOV 时，会使用 zext_32_to_64()（其内部调用 __reg_assign_32_into_64()）将 32 位整数零扩展到 64 位。这时若该 32 位整数的有符号范围可能为负，则会将 64 位有符号范围设置为最坏情况（smin_value = 0，smax_value = U32_MAX），如下所示。

static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
{
  reg->umin_value = reg->u32_min_value;
  reg->umax_value = reg->u32_max_value;

  /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must
   * be positive otherwise set to worse case bounds and refine later
   * from tnum.
   */
  if (__reg32_bound_s64(reg->s32_min_value) &&
      __reg32_bound_s64(reg->s32_max_value)) {
    reg->smin_value = reg->s32_min_value;
    reg->smax_value = reg->s32_max_value;
  } else {
    reg->smin_value = 0;
    reg->smax_value = U32_MAX;
  }
}

接着会通过下面的一组函数调用进行 refinement 来推测该寄存器的具体范围。

__update_reg_bounds(dst_reg);
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);

但如同 CVE 作者所说，这个 refinement 并不是总会发生，而是取决于后续程序的运行。如果在 32 位 BPF_MOV 指令之后紧跟一条指针运算指令，就会生成一个包含指针内容的标量。这个标量是可以被正常写入到用户空间的，导致内核空间指针泄漏。

CVE-2021-41864 `prealloc_elems_and_freelist()` 整数溢出

elem_size 被声明为 u32，类型过窄：计算 elem_size * smap->map.max_entries 时乘法按 32 位进行，可能发生整型溢出，使实际分配的内存小于预期。修复方式是将其改为 u64，如下所示。

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 09a3fd97d329e..6e75bbee39f0b 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -63,7 +63,8 @@ static inline int stack_map_data_size(struct bpf_map *map)
 
 static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
 {
-  u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
+  u64 elem_size = sizeof(struct stack_map_bucket) +
+      (u64)smap->map.value_size;
   int err;
 
   smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,