seccomp (short for secure computing mode) is a computer security facility in the Linux kernel. It was merged into the Linux kernel mainline in kernel version 2.6.12, which was released on March 8, 2005. seccomp allows a process to make a one-way transition into a “secure” state where it cannot make any system calls except exit(), sigreturn(), read() and write() to already-open file descriptors. Should it attempt any other system calls, the kernel will terminate the process with SIGKILL or SIGSYS. In this sense, it does not virtualize the system’s resources but isolates the process from them entirely.
char * filename = "/bin/sh"; char * argv[] = {"/bin/sh",NULL}; char * envp[] = {NULL}; write(1,"i will give you a shell\n",24); syscall(59,filename,argv,envp);//execve return0; }
运行结果:
1 2 3 4
# veritas @ ubuntu in ~/test/seccomp $ ./simple_syscall_seccomp i will give you a shell [1] 14024 invalid system call (core dumped) ./simple_syscall_seccomp
/*
 * seccomp filter actions.
 *
 * Each directive is on its own line: a #define consumes the remainder of its
 * physical line, so several directives on a single line would define only the
 * first macro.
 */

/**
 * Kill the process
 */
#define SCMP_ACT_KILL		0x00000000U

/**
 * Throw a SIGSYS signal
 */
#define SCMP_ACT_TRAP		0x00030000U

/**
 * Return the specified error code
 * (only the low 16 bits of x are kept)
 */
#define SCMP_ACT_ERRNO(x)	(0x00050000U | ((x) & 0x0000ffffU))

/**
 * Notify a tracing process with the specified value
 * (only the low 16 bits of x are kept)
 */
#define SCMP_ACT_TRACE(x)	(0x7ff00000U | ((x) & 0x0000ffffU))

/**
 * Allow the syscall to be executed after the action has been logged
 */
#define SCMP_ACT_LOG		0x7ffc0000U

/**
 * Allow the syscall to be executed
 */
#define SCMP_ACT_ALLOW		0x7fff0000U
seccomp_rule_add是添加一条规则,函数原型如下
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/**
 * Add a new rule to the filter
 * @param ctx the filter context
 * @param action the filter action
 * @param syscall the syscall number
 * @param arg_cnt the number of argument filters in the argument filter chain
 * @param ... scmp_arg_cmp structs (use of SCMP_ARG_CMP() recommended)
 *
 * This function adds a series of new argument/value checks to the seccomp
 * filter for the given syscall; multiple argument/value checks can be
 * specified and they will be chained together (AND'd together) in the filter.
 * If the specified rule needs to be adjusted due to architecture specifics it
 * will be adjusted without notification.  Returns zero on success, negative
 * values on failure.
 *
 */
int seccomp_rule_add(scmp_filter_ctx ctx,
		     uint32_t action, int syscall, unsigned int arg_cnt, ...);
seccomp_load是应用过滤,如果不调用seccomp_load则上面所有的过滤都不会生效
1 2 3 4 5 6 7 8 9 10 11
/**
 * Loads the filter into the kernel
 * @param ctx the filter context
 *
 * This function loads the given seccomp filter context into the kernel.  If
 * the filter was loaded correctly, the kernel will be enforcing the filter
 * when this function returns.  Returns zero on success, negative values on
 * error.
 *
 */
int seccomp_load(const scmp_filter_ctx ctx);
/**
 * Specify an argument comparison struct for use in declaring rules
 *
 * Expands to a compound literal of struct scmp_arg_cmp initialised
 * positionally from the macro arguments.
 *
 * @param arg the argument number, starting at 0
 * @param op the comparison operator, e.g. SCMP_CMP_*
 * @param datum_a dependent on comparison
 * @param datum_b dependent on comparison, optional
 */
#define SCMP_CMP(...) ((struct scmp_arg_cmp){__VA_ARGS__})

/*
 * Per-argument shorthands: SCMP_A<n>(op, datum_a[, datum_b]) is SCMP_CMP()
 * with the argument number fixed to n, so callers pass only the operator
 * and its datum value(s).
 */

/**
 * Specify an argument comparison struct for argument 0
 */
#define SCMP_A0(...) SCMP_CMP(0, __VA_ARGS__)

/**
 * Specify an argument comparison struct for argument 1
 */
#define SCMP_A1(...) SCMP_CMP(1, __VA_ARGS__)

/**
 * Specify an argument comparison struct for argument 2
 */
#define SCMP_A2(...) SCMP_CMP(2, __VA_ARGS__)

/**
 * Specify an argument comparison struct for argument 3
 */
#define SCMP_A3(...) SCMP_CMP(3, __VA_ARGS__)

/**
 * Specify an argument comparison struct for argument 4
 */
#define SCMP_A4(...) SCMP_CMP(4, __VA_ARGS__)

/**
 * Specify an argument comparison struct for argument 5
 */
#define SCMP_A5(...) SCMP_CMP(5, __VA_ARGS__)
/**
 * Comparison operators
 *
 * _SCMP_CMP_MIN and _SCMP_CMP_MAX bracket the valid operator values and are
 * not operators themselves.
 */
enum scmp_compare {
	_SCMP_CMP_MIN = 0,
	SCMP_CMP_NE = 1,		/**< not equal */
	SCMP_CMP_LT = 2,		/**< less than */
	SCMP_CMP_LE = 3,		/**< less than or equal */
	SCMP_CMP_EQ = 4,		/**< equal */
	SCMP_CMP_GE = 5,		/**< greater than or equal */
	SCMP_CMP_GT = 6,		/**< greater than */
	SCMP_CMP_MASKED_EQ = 7,		/**< masked equality */
	_SCMP_CMP_MAX,
};
/**
 * Argument datum
 *
 * 64-bit unsigned value that a syscall argument is compared against.
 */
typedef uint64_t scmp_datum_t;
/** * Argument / Value comparison definition */ structscmp_arg_cmp { unsignedint arg; /**< argument number, starting at 0 */ enumscmp_compare op; /**< the comparison op, e.g. SCMP_CMP_* */ scmp_datum_t datum_a; scmp_datum_t datum_b; };
/*
 * Demo: allow every syscall by default, but kill the process when write() is
 * called with its argument at index 2 (counted from 0) equal to 0x10.
 *
 * Returns 1 if the filter cannot be created, extended or loaded, so a broken
 * setup is never mistaken for a working sandbox.  Returns 0 if both writes
 * complete, i.e. if the second write was not stopped by the filter.
 */
int main(void)
{
    scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_ALLOW);
    if (!ctx) {
        return 1;
    }

    /* One argument check: argument 2 (zero-based) of write() equals 0x10. */
    if (seccomp_rule_add(ctx, SCMP_ACT_KILL, SCMP_SYS(write), 1,
                         SCMP_A2(SCMP_CMP_EQ, 0x10)) < 0 ||
        seccomp_load(ctx) < 0) {
        seccomp_release(ctx);
        return 1;
    }

    /* The kernel enforces the loaded filter; free the userspace context. */
    seccomp_release(ctx);

    write(1, "i will give you a shell\n", 24); /* length 24: not blocked */
    write(1, "1234567812345678", 0x10);        /* length 0x10: blocked */
    return 0;
}
PR_SET_NO_NEW_PRIVS (since Linux 3.5) Set the calling process's no_new_privs bit to the value in arg2. With no_new_privs set to 1, execve(2) promises not to grant privileges to do anything that could not have been done without the execve(2) call (for example, rendering the set-user-ID and set-group-ID mode bits, and file capabilities non-functional). Once set, this bit cannot be unset. The setting of this bit is inherited by children created by fork(2) and clone(2), and preserved across execve(2). For more information, see the kernel source file Documentation/prctl/no_new_privs.txt.
例子:
1 2 3 4 5 6 7 8 9 10 11 12 13
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>
/*
 * Demo: set the no_new_privs bit on this process, then replace it with
 * /bin/sh through a raw execve system call.
 *
 * Returns 1 if prctl() fails or if the execve call returns; on success the
 * shell takes over this process and the code after the syscall never runs.
 */
int main(void)
{
    /* Fail loudly rather than hand out a shell without the restriction. */
    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
        return 1;
    }

    char *filename = "/bin/sh";
    char *argv[] = {"/bin/sh", NULL};
    char *envp[] = {NULL};

    write(1, "i will give you a shell\n", 24);

    /* SYS_execve replaces the literal 59, which is only correct on x86-64. */
    syscall(SYS_execve, filename, argv, envp);

    /* Reached only when execve failed. */
    return 1;
}
运行效果
1 2 3 4 5 6 7 8 9 10 11 12
# veritas @ ubuntu in ~/test/seccomp $ ./prctl_test i will give you a shell $ sudo sh sudo: effective uid is not 0, is sudo installed setuid root? $ whoami veritas $ id uid=1000(veritas) gid=1000(veritas) groups=1000(veritas),4(adm),24(cdrom),27(sudo),30(dip),46(plugdev),113(lpadmin),128(sambashare) $ sudo sudo: effective uid is not 0, is sudo installed setuid root? $
PR_SET_SECCOMP (since Linux 2.6.23) Set the secure computing (seccomp) mode for the calling thread, to limit the available system calls. The more recent seccomp(2) system call provides a superset of the functionality of PR_SET_SECCOMP.
The seccomp mode is selected via arg2. (The seccomp constants are defined in <linux/seccomp.h>.)
With arg2 set to SECCOMP_MODE_STRICT, the only system calls that the thread is permitted to make are read(2), write(2), _exit(2) (but not exit_group(2)), and sigreturn(2). Other system calls result in the delivery of a SIGKILL signal. Strict secure computing mode is useful for number-crunching applications that may need to execute untrusted byte code, perhaps obtained by reading from a pipe or socket. This operation is available only if the kernel is configured with CONFIG_SECCOMP enabled.
With arg2 set to SECCOMP_MODE_FILTER (since Linux 3.5), the system calls allowed are defined by a pointer to a Berkeley Packet Filter passed in arg3. This argument is a pointer to struct sock_fprog; it can be designed to filter arbitrary system calls and system call arguments. This mode is available only if the kernel is configured with CONFIG_SECCOMP_FILTER enabled.
If SECCOMP_MODE_FILTER filters permit fork(2), then the seccomp mode is inherited by children created by fork(2); if execve(2) is permitted, then the seccomp mode is preserved across execve(2). If the filters permit prctl() calls, then additional filters can be added; they are run in order until the first non-allow result is seen.
For further information, see the kernel source file Documentation/prctl/seccomp_filter.txt.
/*
 *	Try and keep these values and structures similar to BSD, especially
 *	the BPF code definitions which need to match so you can share filters
 */

/* One BPF instruction: an opcode, two jump offsets and an operand. */
struct sock_filter {	/* Filter block */
	__u16	code;	/* Actual filter code */
	__u8	jt;	/* Jump true */
	__u8	jf;	/* Jump false */
	__u32	k;	/* Generic multiuse field */
};

/* A BPF program: an instruction count plus a pointer to the instructions. */
struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
	unsigned short		len;	/* Number of filter blocks */
	struct sock_filter	*filter;
};
______ | |__| | WELCOME TO THE | () | UNTRUSTED COMPUTING SERVICE |______| V0.0.1a
LOAD PROGRAM line CODE JT JF K ================================= 0000: 0x20 0x00 0x00 0x00000004 A = arch 0001: 0x15 0x01 0x00 0xc000003e if (A == ARCH_X86_64) goto 0003 0002: 0x06 0x00 0x00 0x00000000 return KILL 0003: 0x20 0x00 0x00 0x00000000 A = sys_number 0004: 0x15 0x00 0x01 0x00000002 if (A != open) goto 0006 0005: 0x06 0x00 0x00 0x00050016 return ERRNO(22) 0006: 0x15 0x00 0x01 0x00000009 if (A != mmap) goto 0008 0007: 0x06 0x00 0x00 0x00050016 return ERRNO(22) 0008: 0x15 0x00 0x01 0x00000101 if (A != openat) goto 0010 0009: 0x06 0x00 0x00 0x00050016 return ERRNO(22) 0010: 0x15 0x00 0x01 0x00000130 if (A != open_by_handle_at) goto 0012 0011: 0x06 0x00 0x00 0x00050016 return ERRNO(22) 0012: 0x15 0x00 0x01 0x00000065 if (A != ptrace) goto 0014 0013: 0x06 0x00 0x00 0x00050016 return ERRNO(22) 0014: 0x06 0x00 0x00 0x7fff0000 return ALLOW
只禁用了open,mmap,openat, open_by_handle_at和ptrace
第一反应:那为啥不直接用execve???
1 2 3 4 5 6 7 8
$ python exp.py [+] Starting local process './babypf': pid 23036 [*] Switching to interactive mode [*] Process './babypf' stopped with exit code 0 (pid 23036) sh: error while loading shared libraries: libc.so.6: cannot open shared object file: Invalid argument THANK YOU [*] Got EOF while reading in interactive $
果然是我太年轻了
wp上说Since 3.4 the Linux kernel has had a feature called the X32 ABI; 64bit syscalls with 32bit pointers.
The BPF instructions operate on the BPF virtual machine, which has four main elements: The accumulator register A, the index register X, the packet memory, and the scratch memory M[].