1. 前言
限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。
2. 分析背景
本文分析基于
linux-4.14
内核代码。
运行环境为:
Ubuntu 16.04.4 LTS + QEMU Arm vexpress-a9
rootfs
基于
ubuntu-base-16.04-core-armhf.tar.gz
制作。
3. 测试demo
strace
是众所周知的系统调用追踪工具,我们建立一个类似于
strace
的测试程序来分析 系统调用追踪的工作原理。该测试程序可以用来追踪 ARM32 架构下的程序的系统调用。测试程序
syscall_trace
的代码如下:
/*
* syscall_trace.c
*/
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <stdio.h>
/*
* System call numbers differ between architectures; this is the table for
* the ARM32 architecture. The set of supported system calls also differs
* between kernel versions, so adjust the table to match the kernel in use.
*
* The table maps a system call number to its printable name and is
* terminated by an entry whose number is negative.
*/
static struct syscall {
int no; /* system call number; a negative value marks the end of the table */
char *name; /* system call name printed by the tracer */
} syscall_table[] = {
/* Architecture-independent system calls */
{0, "restart_syscall"},
{1, "exit"},
{2, "fork"},
{3, "read"},
{4, "write"},
{5, "open"},
{6, "close"},
{8, "creat"},
{9, "link"},
{10, "unlink"},
{11, "execve"},
{12, "chdir"},
{14, "mknod"},
{15, "chmod"},
{16, "lchown"},
{19, "lseek"},
{20, "getpid"},
{21, "mount"},
{23, "setuid"},
{24, "getuid"},
{26, "ptrace"},
{29, "pause"},
{33, "access"},
{34, "nice"},
{36, "sync"},
{37, "kill"},
{38, "rename"},
{39, "mkdir"},
{40, "rmdir"},
{41, "dup"},
{42, "pipe"},
{43, "times"},
{45, "brk"},
{46, "setgid"},
{47, "getgid"},
{49, "geteuid"},
{50, "getegid"},
{51, "acct"},
{52, "umount2"},
{54, "ioctl"},
{55, "fcntl"},
{57, "setpgid"},
{60, "umask"},
{61, "chroot"},
{62, "ustat"},
{63, "dup2"},
{64, "getppid"},
{65, "getpgrp"},
{66, "setsid"},
{67, "sigaction"},
{70, "setreuid"},
{71, "setregid"},
{72, "sigsuspend"},
{73, "sigpending"},
{74, "sethostname"},
{75, "setrlimit"},
{77, "getrusage"},
{78, "gettimeofday"},
{79, "settimeofday"},
{80, "getgroups"},
{81, "setgroups"},
{83, "symlink"},
{85, "readlink"},
{86, "uselib"},
{87, "swapon"},
{88, "reboot"},
{91, "munmap"},
{92, "truncate"},
{93, "ftruncate"},
{94, "fchmod"},
{95, "fchown"},
{96, "getpriority"},
{97, "setpriority"},
{99, "statfs"},
{100, "fstatfs"},
{103, "syslog"},
{104, "setitimer"},
{105, "getitimer"},
{106, "stat"},
{107, "lstat"},
{108, "fstat"},
{111, "vhangup"},
{114, "wait4"},
{115, "swapoff"},
{116, "sysinfo"},
{118, "fsync"},
{119, "sigreturn"},
{120, "clone"},
{121, "setdomainname"},
{122, "uname"},
{124, "adjtimex"},
{125, "mprotect"},
{126, "sigprocmask"},
{128, "init_module"},
{129, "delete_module"},
{131, "quotactl"},
{132, "getpgid"},
{133, "fchdir"},
{134, "bdflush"},
{135, "sysfs"},
{136, "personality"},
{138, "setfsuid"},
{139, "setfsgid"},
{140, "_llseek"},
{141, "getdents"},
{142, "_newselect"},
{143, "flock"},
{144, "msync"},
{145, "readv"},
{146, "writev"},
{147, "getsid"},
{148, "fdatasync"},
{149, "_sysctl"},
{150, "mlock"},
{151, "munlock"},
{152, "mlockall"},
{153, "munlockall"},
{154, "sched_setparam"},
{155, "sched_getparam"},
{156, "sched_setscheduler"},
{157, "sched_getscheduler"},
{158, "sched_yield"},
{159, "sched_get_priority_max"},
{160, "sched_get_priority_min"},
{161, "sched_rr_get_interval"},
{162, "nanosleep"},
{163, "mremap"},
{164, "setresuid"},
{165, "getresuid"},
{168, "poll"},
{169, "nfsservctl"},
{170, "setresgid"},
{171, "getresgid"},
{172, "prctl"},
{173, "rt_sigreturn"},
{174, "rt_sigaction"},
{175, "rt_sigprocmask"},
{176, "rt_sigpending"},
{177, "rt_sigtimedwait"},
{178, "rt_sigqueueinfo"},
{179, "rt_sigsuspend"},
{180, "pread64"},
{181, "pwrite64"},
{182, "chown"},
{183, "getcwd"},
{184, "capget"},
{185, "capset"},
{186, "sigaltstack"},
{187, "sendfile"},
{190, "vfork"},
{191, "ugetrlimit"},
{192, "mmap2"},
{193, "truncate64"},
{194, "ftruncate64"},
{195, "stat64"},
{196, "lstat64"},
{197, "fstat64"},
{198, "lchown32"},
{199, "getuid32"},
{200, "getgid32"},
{201, "geteuid32"},
{202, "getegid32"},
{203, "setreuid32"},
{204, "setregid32"},
{205, "getgroups32"},
{206, "setgroups32"},
{207, "fchown32"},
{208, "setresuid32"},
{209, "getresuid32"},
{210, "setresgid32"},
{211, "getresgid32"},
{212, "chown32"},
{213, "setuid32"},
{214, "setgid32"},
{215, "setfsuid32"},
{216, "setfsgid32"},
{217, "getdents64"},
{218, "pivot_root"},
{219, "mincore"},
{220, "madvise"},
{221, "fcntl64"},
{224, "gettid"},
{225, "readahead"},
{226, "setxattr"},
{227, "lsetxattr"},
{228, "fsetxattr"},
{229, "getxattr"},
{230, "lgetxattr"},
{231, "fgetxattr"},
{232, "listxattr"},
{233, "llistxattr"},
{234, "flistxattr"},
{235, "removexattr"},
{236, "lremovexattr"},
{237, "fremovexattr"},
{238, "tkill"},
{239, "sendfile64"},
{240, "futex"},
{241, "sched_setaffinity"},
{242, "sched_getaffinity"},
{243, "io_setup"},
{244, "io_destroy"},
{245, "io_getevents"},
{246, "io_submit"},
{247, "io_cancel"},
{248, "exit_group"},
{249, "lookup_dcookie"},
{250, "epoll_create"},
{251, "epoll_ctl"},
{252, "epoll_wait"},
{253, "remap_file_pages"},
{256, "set_tid_address"},
{257, "timer_create"},
{258, "timer_settime"},
{259, "timer_gettime"},
{260, "timer_getoverrun"},
{261, "timer_delete"},
{262, "clock_settime"},
{263, "clock_gettime"},
{264, "clock_getres"},
{265, "clock_nanosleep"},
{266, "statfs64"},
{267, "fstatfs64"},
{268, "tgkill"},
{269, "utimes"},
{270, "arm_fadvise64_64"},
{271, "pciconfig_iobase"},
{272, "pciconfig_read"},
{273, "pciconfig_write"},
{274, "mq_open"},
{275, "mq_unlink"},
{276, "mq_timedsend"},
{277, "mq_timedreceive"},
{278, "mq_notify"},
{279, "mq_getsetattr"},
{280, "waitid"},
{281, "socket"},
{282, "bind"},
{283, "connect"},
{284, "listen"},
{285, "accept"},
{286, "getsockname"},
{287, "getpeername"},
{288, "socketpair"},
{289, "send"},
{290, "sendto"},
{291, "recv"},
{292, "recvfrom"},
{293, "shutdown"},
{294, "setsockopt"},
{295, "getsockopt"},
{296, "sendmsg"},
{297, "recvmsg"},
{298, "semop"},
{299, "semget"},
{300, "semctl"},
{301, "msgsnd"},
{302, "msgrcv"},
{303, "msgget"},
{304, "msgctl"},
{305, "shmat"},
{306, "shmdt"},
{307, "shmget"},
{308, "shmctl"},
{309, "add_key"},
{310, "request_key"},
{311, "keyctl"},
{312, "semtimedop"},
{313, "vserver"},
{314, "ioprio_set"},
{315, "ioprio_get"},
{316, "inotify_init"},
{317, "inotify_add_watch"},
{318, "inotify_rm_watch"},
{319, "mbind"},
{320, "get_mempolicy"},
{321, "set_mempolicy"},
{322, "openat"},
{323, "mkdirat"},
{324, "mknodat"},
{325, "fchownat"},
{326, "futimesat"},
{327, "fstatat64"},
{328, "unlinkat"},
{329, "renameat"},
{330, "linkat"},
{331, "symlinkat"},
{332, "readlinkat"},
{333, "fchmodat"},
{334, "faccessat"},
{335, "pselect6"},
{336, "ppoll"},
{337, "unshare"},
{338, "set_robust_list"},
{339, "get_robust_list"},
{340, "splice"},
{341, "arm_sync_file_range"},
{342, "tee"},
{343, "vmsplice"},
{344, "move_pages"},
{345, "getcpu"},
{346, "epoll_pwait"},
{347, "kexec_load"},
{348, "utimensat"},
{349, "signalfd"},
{350, "timerfd_create"},
{351, "eventfd"},
{352, "fallocate"},
{353, "timerfd_settime"},
{354, "timerfd_gettime"},
{355, "signalfd4"},
{356, "eventfd2"},
{357, "epoll_create1"},
{358, "dup3"},
{359, "pipe2"},
{360, "inotify_init1"},
{361, "preadv"},
{362, "pwritev"},
{363, "rt_tgsigqueueinfo"},
{364, "perf_event_open"},
{365, "recvmmsg"},
{366, "accept4"},
{367, "fanotify_init"},
{368, "fanotify_mark"},
{369, "prlimit64"},
{370, "name_to_handle_at"},
{371, "open_by_handle_at"},
{372, "clock_adjtime"},
{373, "syncfs"},
{374, "sendmmsg"},
{375, "setns"},
{376, "process_vm_readv"},
{377, "process_vm_writev"},
{378, "kcmp"},
{379, "finit_module"},
{380, "sched_setattr"},
{381, "sched_getattr"},
{382, "renameat2"},
{383, "seccomp"},
{384, "getrandom"},
{385, "memfd_create"},
{386, "bpf"},
{387, "execveat"},
{388, "userfaultfd"},
{389, "membarrier"},
{390, "mlock2"},
{391, "copy_file_range"},
{392, "preadv2"},
{393, "pwritev2"},
{394, "pkey_mprotect"},
{395, "pkey_alloc"},
{396, "pkey_free"},
{397, "statx"},
/*
* Architecture-specific system calls.
* The numbers are defined right here, inside the initializer, as offsets
* from __ARM_NR_BASE; the macros stay visible for the rest of the file.
*/
#define __ARM_NR_BASE 0x0f0000
#define __ARM_NR_breakpoint (__ARM_NR_BASE+1)
#define __ARM_NR_cacheflush (__ARM_NR_BASE+2)
#define __ARM_NR_usr26 (__ARM_NR_BASE+3)
#define __ARM_NR_usr32 (__ARM_NR_BASE+4)
#define __ARM_NR_set_tls (__ARM_NR_BASE+5)
{__ARM_NR_breakpoint, "breakpoint"},
{__ARM_NR_cacheflush, "cacheflush"},
{__ARM_NR_usr26, "usr26"},
{__ARM_NR_usr32, "usr32"},
{__ARM_NR_set_tls, "set_tls"},
{-1, NULL} /* end-of-table sentinel */
};
/*
 * Translate a system call number into its name.
 *
 * syscall_table[] is scanned from the start until the terminating entry
 * (the first one with a negative number) is reached.
 *
 * @scno: system call number to look up.
 *
 * Returns the name stored in the matching table entry, or NULL when the
 * number is not listed.
 */
static char *find_syscall_name(int scno)
{
	struct syscall *entry = syscall_table;

	while (entry->no >= 0) {
		if (entry->no == scno)
			return entry->name;
		entry++;
	}

	return NULL;
}
/*
 * Trace the system calls of /bin/hello.
 *
 * The program forks; the child marks itself as traced and execs
 * /bin/hello, the parent resumes it with PTRACE_SYSCALL again and again and
 * prints "<name>() = <return value>" at every system-call exit.
 *
 * NOTE(review): PTRACE_O_TRACESYSGOOD is not set, so a signal-delivery stop
 * cannot be told apart from a system-call stop here; a tracee that receives
 * signals would throw the enter/exit pairing off. Acceptable for the demo.
 *
 * Returns EXIT_SUCCESS once tracing has ended, EXIT_FAILURE when the tracee
 * could not be started or stopped as expected.
 */
int main(int argc, char *argv[])
{
	pid_t child;
	int status;
	struct user_regs regs; /* architecture-specific register set */
	unsigned long int scret, scno;
	const char *name;

	(void)argc;
	(void)argv;

	child = fork();
	if (child < 0) {
		perror("fork");
		return EXIT_FAILURE;
	}

	if (child == 0) { /* child process */
		/* Mark the child as traced; the parent becomes its tracer. */
		if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) < 0) {
			perror("ptrace(PTRACE_TRACEME)");
			_exit(EXIT_FAILURE);
		}
		/*
		 * Before any code of /bin/hello runs, execl() makes the child send
		 * SIGTRAP to itself. While handling it the child notifies the
		 * parent with SIGCHLD and stops until the parent resumes it with
		 * ptrace(PTRACE_SYSCALL). This synchronisation guarantees the
		 * parent can inspect the child before it runs or exits.
		 */
		execl("/bin/hello", "/bin/hello", (char *)NULL);
		/* Only reached when execl() failed. */
		perror("execl");
		_exit(EXIT_FAILURE);
	}

	/* parent process: wait for the stop caused by the child's execl(). */
	if (wait(&status) < 0) {
		perror("wait");
		return EXIT_FAILURE;
	}
	if (!WIFSTOPPED(status)) {
		/* The child terminated before reaching its exec stop. */
		fprintf(stderr, "tracee terminated before it could be traced\n");
		return EXIT_FAILURE;
	}

	/*
	 * PTRACE_SYSCALL resumes the child and makes it stop again at the next
	 * system-call entry or exit.
	 */
	if (ptrace(PTRACE_SYSCALL, child, NULL, NULL) < 0) {
		perror("ptrace(PTRACE_SYSCALL)");
		return EXIT_FAILURE;
	}

	for (;;) {
		/* Stop at system-call entry; end tracing once the tracee is gone. */
		if (wait(&status) < 0 || !WIFSTOPPED(status))
			break;

		/*
		 * If the register state at system-call entry is of interest,
		 * fetch it here with ptrace(PTRACE_GETREGS).
		 */

		/* Resume the child until the system call returns. */
		if (ptrace(PTRACE_SYSCALL, child, NULL, NULL) < 0)
			break;

		/* Stop at system-call exit. */
		if (wait(&status) < 0 || !WIFSTOPPED(status))
			break;

		/* Fetch the register state at system-call exit. */
		if (ptrace(PTRACE_GETREGS, child, NULL, &regs) < 0) {
			perror("ptrace(PTRACE_GETREGS)");
			break;
		}
		scret = regs.uregs[0]; /* r0 holds the system call return value */
		scno = regs.uregs[7];  /* r7 holds the system call number */

		/* Numbers missing from the table must not reach "%s" as NULL. */
		name = find_syscall_name((int)scno);
		if (name != NULL)
			printf("%s() = %d\n", name, (int)scret);
		else
			printf("syscall_%lu() = %d\n", scno, (int)scret);

		/* Resume the child until the next system-call entry. */
		if (ptrace(PTRACE_SYSCALL, child, NULL, NULL) < 0)
			break;
	}

	return EXIT_SUCCESS;
}
被追踪程序
hello
的代码如下:
#include <stdio.h>
/* Tracee of the demo: writes one greeting line to stdout and exits with 0. */
int main(void)
{
	fputs("Hello, World!\n", stdout);

	return 0;
}
编译:
arm-linux-gnueabihf-gcc -o syscall_trace syscall_trace.c
arm-linux-gnueabihf-gcc -o hello hello.c
然后将程序
syscall_trace
和
hello
均放入根文件系统的
/bin
目录下,用 QEMU 加载系统,系统运行起来后,运行
syscall_trace
程序,执行结果如下:
$ /bin/syscall_trace
brk() = 135168
uname() = 0
access() = -2
mmap2() = 1995878400
access() = -2
open() = 3
fstat64() = 0
mmap2() = 1995866112
close() = 0
access() = -2
open() = 3
read() = 512
lseek() = 894132
read() = 2960
lseek() = 888324
read() = 51
fstat64() = 0
mmap2() = 1994760192
mprotect() = 0
mmap2() = 1995702272
mmap2() = 1995714560
close() = 0
mmap2() = 1995862016
set_tls() = 0
mprotect() = 0
mprotect() = 0
munmap() = 0
clock_gettime() = 0
fstat64() = 0
ioctl() = 0
brk() = 135168
brk() = 274432
Hello, World!
write() = 33
4. 工作原理分析
我们以测试程序
syscall_trace
的代码
syscall_trace.c
为起点,逐步分析系统调用追踪的原理。
4.1 父子进程追踪与被追踪关系的建立
main()
...
if ((child = fork()) == 0) { /* 子进程建立,这里不展开具体细节,因为这不是我们关注的重点 */
/* 标记子进程为被 trace 状态,加入到父进程追踪进程列表 */
ptrace(PTRACE_TRACEME, 0, NULL, NULL)
/* 以下是内核空间细节 */
sys_ptrace(PTRACE_TRACEME, 0, NULL, NULL)
if (request == PTRACE_TRACEME) {
ret = ptrace_traceme();
if (!current->ptrace) { /* 进程当前没有被追踪 */
ret = security_ptrace_traceme(current->parent);
if (!ret && !(current->real_parent->flags & PF_EXITING)) {
current->ptrace = PT_PTRACED; /* 标记进程处于被追踪状态 */
ptrace_link(current, current->real_parent); /* 将当前进程的父进程设置为追踪进程 */
__ptrace_link(child, new_parent, __task_cred(new_parent))
list_add(&child->ptrace_entry, &new_parent->ptraced); /* 将子进程添加到父进程的追踪进程列表 */
child->parent = new_parent; /* 将子进程的 task_struct::parent 设置为追踪进程 */
}
}
...
goto out;
}
...
out:
return ret;
}
此时父进程与被追踪子进程的关系图如下:
4.2 子进程暂停自身执行并通知父进程
main()
...
if ((child = fork()) == 0) { /* 子进程 */
/* 标记子进程为被 trace 状态,加入到父进程追踪进程列表 */
ptrace(PTRACE_TRACEME, 0, NULL, NULL);
/*
* 在程序 /bin/hello 代码开始执行之前,子进程的 execl() 发送 SIGTRAP 信号给自身;
* 然后子进程在处理 SIGTRAP 信号时,给父进程发送 SIGCHLD 信号,同时暂停自身执行,
* 告知父进程自身的状态变化。
* 子进程在父进程将其唤醒之前,不会再继续运行。父进程后续通过 ptrace() 调用:
* ptrace(PTRACE_SYSCALL, child, NULL, NULL);
* 将子进程唤醒继续执行。
* 通过这样的方式进行父子进程同步,可以防止父进程在对子进程进行信息查询前,子进
* 程就已经运行或退出。
*/
execl("/bin/hello", "/bin/hello", NULL)
/* 以下是内核空间细节 */
/* @arch/arm/kernel/entry-common.S */
vector_swi:
...
adr tbl, sys_call_table /* r8 = 系统调用表 sys_call_table[] 的地址 */
get_thread_info tsk /* r9 = 进程的 thread_info */
...
/* 调用系统调用接口 */
invoke_syscall tbl, scno, r10, __ret_fast_syscall
sys_execve() /* @fs/exec.c */
...
do_execveat_common(AT_FDCWD, filename, argv, envp, 0)
/* 我们只关注 SIGTRAP 信号相关细节 */
...
retval = exec_binprm(bprm)
/* 搜寻合适的 binary handler 并用它加载程序 */
ret = search_binary_handler(bprm)
retval = load_elf_binary();
...
return retval;
if (ret >= 0) {
...
/* 进程给自身发送 SIGTRAP 信号 */
ptrace_event(PTRACE_EVENT_EXEC, old_vpid)
...
if ((current->ptrace & (PT_PTRACED|PT_SEIZED)) == PT_PTRACED)
send_sig(SIGTRAP, current, 0)
...
...
}
...
return retval;
}
此时,子进程已经有挂起的信号
SIGTRAP
待处理,接下来就是对该信号的处理了。子进程从系统调用
sys_execve()
返回用户空间时处理
SIGTRAP
信号,我们看一下子进程是如何处理
SIGTRAP
信号的:
do_work_pending()
...
if (thread_flags & _TIF_SIGPENDING) {
do_signal(regs, syscall)
...
get_signal(&ksig)
for (;;) {
struct k_sigaction *ka;
...
signr = dequeue_synchronous_signal(&ksig->info);
if (!signr)
signr = dequeue_signal(current, &current->blocked, &ksig->info);
if (!signr)
break; /* will return 0 */
/* 处理被追踪进程的 (SIGTRAP) 信号 */
if (unlikely(current->ptrace) && signr != SIGKILL) {
signr = ptrace_signal(signr, &ksig->info)
...
ptrace_stop(signr, CLD_TRAPPED, 0, info)
set_special_state(TASK_TRACED)
current->state = TASK_TRACED;
...
current->last_siginfo = info;
current->exit_code = exit_code; /* exit_code == SIGTRAP */
...
if (may_ptrace_stop()) {
/* 给父进程发送 SIGCHLD 信号 */
do_notify_parent_cldstop(current, true, why)
struct siginfo info;
...
info.si_signo = SIGCHLD;
...
info.si_code = why; /* why == CLD_TRAPPED */
switch (why) {
...
case CLD_TRAPPED:
info.si_status = tsk->exit_code & 0x7f;
break;
...
}
sighand = parent->sighand; /* 父进程的信号处理接口 */
/*
* 不给忽略了 SIGCHLD 的父进程发送该信号。
* 默认情况下进程的 SIGCHLD 处理接口为 SIG_DFL 。
*/
if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
!(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
__group_send_sig_info(SIGCHLD, &info, parent); /* 给父进程所在线程组发送 SIGCHLD 信号 */
__wake_up_parent(tsk, parent) /* 唤醒调用 wait() 等待子进程状态变化的父进程处理 SIGCHLD 信号 */
__wake_up_sync_key(&parent->signal->wait_chldexit,
TASK_INTERRUPTIBLE, 1, p)
...
/*
* 将子进程停下来,直到父进程将其唤醒继续执行。
* 在我们的场景下,父进程通过 ptrace(PTRACE_SYSCALL) 将子进程加入调度继续执行。
*/
freezable_schedule();
}
...
recalc_sigpending_tsk(current);
...
}
...
}
}
...
我们看到,被追踪子进程对
SIGTRAP
信号的处理是这样的:给父进程发送
SIGCHLD
信号,然后暂停自身执行,等待被唤醒。
上面的分析涉及到
系统调用
和
信号处理
的细节,可以分别参考博文
Linux系统调用实现简析
和
Linux信号处理简析
进行了解。
4.3 父进程追踪子进程的系统调用
4.3.1 父进程等待子进程execl()调用期间发送的SIGCHLD信号
章节
4.2
讲到被追踪的子进程向父进程发送
SIGCHLD
信号,同时暂停执行。我们来看父进程等待【子进程
execl()
调用期间发送的
SIGCHLD
信号】的具体流程:
if ((child = fork()) == 0) { /* 子进程 */
/* 标记子进程为被 trace 状态,加入到父进程追踪进程列表 */
ptrace(PTRACE_TRACEME, 0, NULL, NULL);
/*
* 在程序 /bin/hello 代码开始执行之前,子进程的 execl() 发送 SIGTRAP 信号给自身;
* 然后子进程在处理 SIGTRAP 信号时,给父进程发送 SIGCHLD 信号,同时暂停自身执行,
* 告知父进程自身的状态变化。
* 子进程在父进程将其唤醒之前,不会再继续运行。父进程后续通过 ptrace() 调用:
* ptrace(PTRACE_SYSCALL, child, NULL, NULL);
* 将子进程唤醒继续执行。
* 通过这样的方式进行父子进程同步,可以防止父进程在对子进程进行信息查询前,子进
* 程就已经运行或退出。
*/
execl("/bin/hello", "/bin/hello", NULL);
/* 永远不应该运行到这里 */
exit(0);
} else { /* 父进程 */
/*
* 子进程 execl() 给自身发送 SIGTRAP ,然后处理该信号时给父进程发送
* SIGCHLD 信号,然后暂停自身执行,等待父进程唤醒。
* 这里父进程等待的是子进程因 execl() 发送的 SIGCHLD 信号。
*/
wait(&status)
/* 下面是 glibc 和 内核空间的细节 */
__waitpid(WAIT_ANY, stat_loc, 0) /* WAIT_ANY == -1 */
__wait4(-1, stat_loc, 0, NULL)
/* 内核空间的细节 */
kernel_wait4(-1, stat_loc, 0, NULL)
...
if (upid == -1) /* 等待任一子进程 */
type = PIDTYPE_MAX;
...
wo.wo_type = type;
wo.wo_pid = pid;
wo.wo_flags = options | WEXITED;
...
ret = do_wait(&wo)
/*
* 子进程处理 SIGTRAP 时给父进程(即当前进程),发送 SIGCHLD
* 信号,然后通过 @wo->child_wait 唤醒父进程处理 SIGCHLD 信号,
* child_wait_callback() 也会被触发,它继续完成唤醒父进程的整个过程:
* child_wait_callback() -> default_wake_function() 。
*/
init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
wo->child_wait.private = current;
add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
repeat:
...
set_current_state(TASK_INTERRUPTIBLE);
tsk = current;
do { /* 遍历线程组中的所有进程,包括 @current */
retval = do_wait_thread(wo, tsk); /* 等待 【进程 @tsk 所有没被追踪的子进程】 的状态变更 */
...
/* 遍历 @tsk 的所有子进程 */
list_for_each_entry(p, &tsk->children, sibling) {
int ret = wait_consider_task(wo, 0, p);
...
ret = wait_task_stopped(wo, ptrace, p);
...
pid = task_pid_vnr(p);
...
/* 设置子进程的状态,从 wait() 返回 */
if (likely(!(wo->wo_flags & WNOWAIT)))
wo->wo_stat = (exit_code << 8) | 0x7f; /* (SIGTRAP << 8) | 0x7f */
...
return pid;
...
if (ret)
return ret;
}
return 0;
if (retval)
goto end;
retval = ptrace_do_wait(wo, tsk); /* 等待 【进程 @tsk 所有被追踪的子进程】 的状态变更 */
/* 类似于 do_wait_thread() */
if (retval)
goto end;
} while_each_thread(current, tsk);
...
notask:
retval = wo->notask_error;
if (!retval && !(wo->wo_flags & WNOHANG)) {
...
if (!signal_pending(current)) {
/* 没有等到子进程的状态变化,父进程进入可中断睡眠继续等待 */
schedule();
goto repeat;
}
}
/* 等待到子进程的状态变化,发起 wait() 等待的父进程返回用户空间 */
__set_current_state(TASK_RUNNING);
remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
return retval;
}
...
在
wait_task_stopped()
中等待到了子进程因
SIGTRAP
信号而发送的
SIGCHLD
信号。父进程发起
wait()
调用后,父子进程数据关系如下图所示:
4.3.2 父进程唤醒子进程继续执行
现在子进程处在
execl()
发送给自身的
SIGTRAP
而导致的暂停状态,要跟踪子进程的系统调用,得让它继续运行。父进程通过
ptrace(PTRACE_SYSCALL)
调用将子进程重新纳入调度执行,并通过对子进程设置
TIF_SYSCALL_TRACE
标记,指示开始对子进程系统调用的追踪,具体流程如下:
main()
if ((child = fork()) == 0) { /* 子进程 */
...
} else {
/*
* 子进程 execl() 给自身发送 SIGTRAP ,然后处理该信号时给父进程发送
* SIGCHLD 信号,然后暂停自身执行,等待父进程唤醒。
* 这里父进程等待的是子进程因 execl() 发送的 SIGCHLD 信号。
*/
wait(&status);
/*
* 发送 PTRACE_SYSCALL 命令给被跟踪子进程,将唤醒子进程继续执行。
* 同时 PTRACE_SYSCALL 命令会导致子进程在 系统调用进入或退出时,
* 给自身发送 SIGTRAP 信号,然后子进程在处理 SIGTRAP 信号时给父
* 进程发送 SIGCHLD 信号,然后暂停自身执行,等待父进程唤醒后继续
* 执行。
* 父进程通过 ptrace(PTRACE_SYSCALL) 唤醒子进程继续执行。
*/
ptrace(PTRACE_SYSCALL, child, NULL, NULL)
/* 以下是内核空间细节 */
sys_ptrace(PTRACE_SYSCALL, child, NULL, NULL)
ret = ptrace_check_attach(child, request == PTRACE_KILL ||
request == PTRACE_INTERRUPT)
/* 子进程被 ptrace 了 && 当前是其父进程对其发起了 PTRACE_SYSCALL */
if (child->ptrace && child->parent == current) {
/*
* ptrace_freeze_traced() 确保子进程 @child
* 只能被父进程(即当前进程)唤醒: 将其状态设置为 __TASK_TRACED 。
*/
if (ignore_state || ptrace_freeze_traced(child))
ret = 0;
}
if (!ret && !ignore_state) {
/* 等待子进程停下来 (从 CPU 的运行队列中移出) */
wait_task_inactive(child, __TASK_TRACED)
}
return ret;
ret = arch_ptrace(child, request, addr, data);
switch (request) {
...
default:
ret = ptrace_request(child, request, addr, data);
switch (request) {
...
case PTRACE_SYSCALL:
case PTRACE_CONT:
return ptrace_resume(child, request, data);
if (request == PTRACE_SYSCALL)
/*
* 启用子进程的系统调用追踪:
* 在系统调用出入口暂停子进程,方便父进程获取
* 调用信息,之后需要父进程来唤醒子进程。
*/
set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
else
...
...
child->exit_code = data;
wake_up_state(child, __TASK_TRACED); /* 唤醒子进程继续执行 */
...
return 0;
}
}
return ret;
for (;;) {
...
}
}
4.3.3 提取子进程系统调用进入退出时的信息
上小节中设置子进程的
TIF_SYSCALL_TRACE
标记后(对进程的
thread_info::flags
设置),每当子进程进入或退出系统调用,子进程都会通过给自己发送
SIGTRAP
信号暂停自身执行,同时给父进程发送
SIGCHLD
信号;而父进程则通过
wait()
等到了子进程发送的
SIGCHLD
信号,并在子进程暂停期间提取系统调用信息,然后通过
ptrace(PTRACE_SYSCALL)
调用重启子进程。
4.3.3.1 子进程进入系统调用暂停执行的流程
/* @arch/arm/kernel/entry-common.S */
ENTRY(vector_swi)
adr tbl, sys_call_table /* r8 = 系统调用表 sys_call_table[] 的地址 */
get_thread_info tsk /* r9 = 进程的 thread_info */
...
local_restart:
ldr r10, [tsk, #TI_FLAGS] /* r10 = 进程的 thread_info::flags */
...
tst r10, #_TIF_SYSCALL_WORK /* 检查系统调用追踪标记 */
bne __sys_trace /* 系统调用进入前追踪 */
...
__sys_trace:
bl syscall_trace_enter /* arch/arm/kernel/ptrace.c */
if (test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER)
tracehook_report_syscall_entry(regs)
ptrace_report_syscall(regs)
/*
* 被追踪子进程给自身发送 SIGTRAP 信号:
* 子进程处理该信号时,发送信号 SIGCHLD 唤醒 wait()
* 等待的父进程,然后暂停自身,直到父进程通过
* ptrace(PTRACE_SYSCALL) 调用唤醒它。
*/
ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0));
...
/* 调用系统调用 */
invoke_syscall tbl, scno, r10, __sys_trace_return, reload=1
...
4.3.3.2 子进程退出系统调用暂停执行的流程
/* @arch/arm/kernel/entry-common.S */
ENTRY(vector_swi)
ldr r10, [tsk, #TI_FLAGS] /* r10 = 进程的 thread_info::flags */
...
tst r10, #_TIF_SYSCALL_WORK /* 检查系统调用追踪标记 */
bne __sys_trace /* 进行系统调用进入前追踪 */
...
__sys_trace:
/* 系统调用进入前追踪 */
bl syscall_trace_enter /* arch/arm/kernel/ptrace.c */
...
/* 调用系统调用,返回到 __sys_trace_return 标号处 */
invoke_syscall tbl, scno, r10, __sys_trace_return, reload=1
...
__sys_trace_return:
bl syscall_trace_exit /* arch/arm/kernel/ptrace.c */
...
if (test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT)
tracehook_report_syscall_exit(regs, 0)
ptrace_report_syscall(regs)
/*
* 被 ptraced 子进程给自身发送 SIGTRAP 信号:
* 子进程处理该信号时,发送信号 SIGCHLD 唤醒 wait() 等待的父进程,
* 然后暂停自身,直到父进程通过 ptrace(PTRACE_SYSCALL) 调用唤醒它。
*/
ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0))
/* 返回用户空间前,处理挂起的信号 */
b ret_slow_syscall
4.3.3.3 子进程在系统调用进出口暂停后的重启流程
子进程在系统调用进出口暂停执行后的重启流程,参看章节
4.3.2
。
4.4 子进程退出与父进程的追踪终止
子进程退出时,最后的系统调用是
sys_exit_group()
,我们简略的看一下退出时的系统调用
sys_exit_group()
的追踪流程,同前面的有些不同,因为进程自身即将退出,它没法再去处理
SIGTRAP
信号了。
sys_exit_group(error_code)
do_group_exit((error_code & 0xff) << 8)
do_exit(exit_code)
...
tsk->exit_code = code; /* 设置进程退出码 */
exit_notify(tsk, group_dead)
...
if (unlikely(tsk->ptrace)) { /* 如果进程 @tsk 处于被追踪状态 */
/* 子进程直接给父进程发送 SIGCHLD ,不再通过 SIGTRAP 绕一圈:
* 进程正退出,没机会了! */
int sig = thread_group_leader(tsk) &&
thread_group_empty(tsk) &&
!ptrace_reparented(tsk) ? tsk->exit_signal : SIGCHLD;
autoreap = do_notify_parent(tsk, sig)
struct siginfo info;
...
info.si_signo = sig;
...
psig = tsk->parent->sighand; /* @tsk 父进程的信号处理数据 */
...
if (valid_signal(sig) && sig)
__group_send_sig_info(sig, &info, tsk->parent); /* 将信号 @sig 发送给 @tsk 的父进程 */
__wake_up_parent(tsk, tsk->parent); /* 唤醒 @tsk 的父进程处理信号 */
}
...
/*
* 设置进程状态为最终态: task_struct::state = TASK_DEAD ,
* 进程彻底终结放弃 CPU 。
*/
do_task_dead();
4.5 小结
上面测试程序的工作方式,类似于
strace /bin/hello
,都是在追踪程序进程内,通过
fork() + exec*()
发起被追踪子进程的方式。
5. 另一种场景下的实现
另外一种追踪系统调用的场景,是将追踪程序挂接到已运行程序上,如
strace -p <PID>
。我们来简单探讨这种方式的实现和原理。我们还是从一份测试代码开始:
/*
* syscall_trace.c
*
* This is a system call trace demo program for ARM32 architecture.
*
* Copyright (c) 2019 Leng Xujun <lengxujun2007@126.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
/*
* System call numbers differ between architectures; this is the table for
* the ARM32 architecture. The set of supported system calls also differs
* between kernel versions, so adjust the table to match the kernel in use.
*
* The table maps a system call number to its printable name and is
* terminated by an entry whose number is negative.
*/
static struct syscall {
int no; /* system call number; a negative value marks the end of the table */
char *name; /* system call name printed by the tracer */
} syscall_table[] = {
/* Architecture-independent system calls */
{0, "restart_syscall"},
{1, "exit"},
{2, "fork"},
{3, "read"},
{4, "write"},
{5, "open"},
{6, "close"},
{8, "creat"},
{9, "link"},
{10, "unlink"},
{11, "execve"},
{12, "chdir"},
{14, "mknod"},
{15, "chmod"},
{16, "lchown"},
{19, "lseek"},
{20, "getpid"},
{21, "mount"},
{23, "setuid"},
{24, "getuid"},
{26, "ptrace"},
{29, "pause"},
{33, "access"},
{34, "nice"},
{36, "sync"},
{37, "kill"},
{38, "rename"},
{39, "mkdir"},
{40, "rmdir"},
{41, "dup"},
{42, "pipe"},
{43, "times"},
{45, "brk"},
{46, "setgid"},
{47, "getgid"},
{49, "geteuid"},
{50, "getegid"},
{51, "acct"},
{52, "umount2"},
{54, "ioctl"},
{55, "fcntl"},
{57, "setpgid"},
{60, "umask"},
{61, "chroot"},
{62, "ustat"},
{63, "dup2"},
{64, "getppid"},
{65, "getpgrp"},
{66, "setsid"},
{67, "sigaction"},
{70, "setreuid"},
{71, "setregid"},
{72, "sigsuspend"},
{73, "sigpending"},
{74, "sethostname"},
{75, "setrlimit"},
{77, "getrusage"},
{78, "gettimeofday"},
{79, "settimeofday"},
{80, "getgroups"},
{81, "setgroups"},
{83, "symlink"},
{85, "readlink"},
{86, "uselib"},
{87, "swapon"},
{88, "reboot"},
{91, "munmap"},
{92, "truncate"},
{93, "ftruncate"},
{94, "fchmod"},
{95, "fchown"},
{96, "getpriority"},
{97, "setpriority"},
{99, "statfs"},
{100, "fstatfs"},
{103, "syslog"},
{104, "setitimer"},
{105, "getitimer"},
{106, "stat"},
{107, "lstat"},
{108, "fstat"},
{111, "vhangup"},
{114, "wait4"},
{115, "swapoff"},
{116, "sysinfo"},
{118, "fsync"},
{119, "sigreturn"},
{120, "clone"},
{121, "setdomainname"},
{122, "uname"},
{124, "adjtimex"},
{125, "mprotect"},
{126, "sigprocmask"},
{128, "init_module"},
{129, "delete_module"},
{131, "quotactl"},
{132, "getpgid"},
{133, "fchdir"},
{134, "bdflush"},
{135, "sysfs"},
{136, "personality"},
{138, "setfsuid"},
{139, "setfsgid"},
{140, "_llseek"},
{141, "getdents"},
{142, "_newselect"},
{143, "flock"},
{144, "msync"},
{145, "readv"},
{146, "writev"},
{147, "getsid"},
{148, "fdatasync"},
{149, "_sysctl"},
{150, "mlock"},
{151, "munlock"},
{152, "mlockall"},
{153, "munlockall"},
{154, "sched_setparam"},
{155, "sched_getparam"},
{156, "sched_setscheduler"},
{157, "sched_getscheduler"},
{158, "sched_yield"},
{159, "sched_get_priority_max"},
{160, "sched_get_priority_min"},
{161, "sched_rr_get_interval"},
{162, "nanosleep"},
{163, "mremap"},
{164, "setresuid"},
{165, "getresuid"},
{168, "poll"},
{169, "nfsservctl"},
{170, "setresgid"},
{171, "getresgid"},
{172, "prctl"},
{173, "rt_sigreturn"},
{174, "rt_sigaction"},
{175, "rt_sigprocmask"},
{176, "rt_sigpending"},
{177, "rt_sigtimedwait"},
{178, "rt_sigqueueinfo"},
{179, "rt_sigsuspend"},
{180, "pread64"},
{181, "pwrite64"},
{182, "chown"},
{183, "getcwd"},
{184, "capget"},
{185, "capset"},
{186, "sigaltstack"},
{187, "sendfile"},
{190, "vfork"},
{191, "ugetrlimit"},
{192, "mmap2"},
{193, "truncate64"},
{194, "ftruncate64"},
{195, "stat64"},
{196, "lstat64"},
{197, "fstat64"},
{198, "lchown32"},
{199, "getuid32"},
{200, "getgid32"},
{201, "geteuid32"},
{202, "getegid32"},
{203, "setreuid32"},
{204, "setregid32"},
{205, "getgroups32"},
{206, "setgroups32"},
{207, "fchown32"},
{208, "setresuid32"},
{209, "getresuid32"},
{210, "setresgid32"},
{211, "getresgid32"},
{212, "chown32"},
{213, "setuid32"},
{214, "setgid32"},
{215, "setfsuid32"},
{216, "setfsgid32"},
{217, "getdents64"},
{218, "pivot_root"},
{219, "mincore"},
{220, "madvise"},
{221, "fcntl64"},
{224, "gettid"},
{225, "readahead"},
{226, "setxattr"},
{227, "lsetxattr"},
{228, "fsetxattr"},
{229, "getxattr"},
{230, "lgetxattr"},
{231, "fgetxattr"},
{232, "listxattr"},
{233, "llistxattr"},
{234, "flistxattr"},
{235, "removexattr"},
{236, "lremovexattr"},
{237, "fremovexattr"},
{238, "tkill"},
{239, "sendfile64"},
{240, "futex"},
{241, "sched_setaffinity"},
{242, "sched_getaffinity"},
{243, "io_setup"},
{244, "io_destroy"},
{245, "io_getevents"},
{246, "io_submit"},
{247, "io_cancel"},
{248, "exit_group"},
{249, "lookup_dcookie"},
{250, "epoll_create"},
{251, "epoll_ctl"},
{252, "epoll_wait"},
{253, "remap_file_pages"},
{256, "set_tid_address"},
{257, "timer_create"},
{258, "timer_settime"},
{259, "timer_gettime"},
{260, "timer_getoverrun"},
{261, "timer_delete"},
{262, "clock_settime"},
{263, "clock_gettime"},
{264, "clock_getres"},
{265, "clock_nanosleep"},
{266, "statfs64"},
{267, "fstatfs64"},
{268, "tgkill"},
{269, "utimes"},
{270, "arm_fadvise64_64"},
{271, "pciconfig_iobase"},
{272, "pciconfig_read"},
{273, "pciconfig_write"},
{274, "mq_open"},
{275, "mq_unlink"},
{276, "mq_timedsend"},
{277, "mq_timedreceive"},
{278, "mq_notify"},
{279, "mq_getsetattr"},
{280, "waitid"},
{281, "socket"},
{282, "bind"},
{283, "connect"},
{284, "listen"},
{285, "accept"},
{286, "getsockname"},
{287, "getpeername"},
{288, "socketpair"},
{289, "send"},
{290, "sendto"},
{291, "recv"},
{292, "recvfrom"},
{293, "shutdown"},
{294, "setsockopt"},
{295, "getsockopt"},
{296, "sendmsg"},
{297, "recvmsg"},
{298, "semop"},
{299, "semget"},
{300, "semctl"},
{301, "msgsnd"},
{302, "msgrcv"},
{303, "msgget"},
{304, "msgctl"},
{305, "shmat"},
{306, "shmdt"},
{307, "shmget"},
{308, "shmctl"},
{309, "add_key"},
{310, "request_key"},
{311, "keyctl"},
{312, "semtimedop"},
{313, "vserver"},
{314, "ioprio_set"},
{315, "ioprio_get"},
{316, "inotify_init"},
{317, "inotify_add_watch"},
{318, "inotify_rm_watch"},
{319, "mbind"},
{320, "get_mempolicy"},
{321, "set_mempolicy"},
{322, "openat"},
{323, "mkdirat"},
{324, "mknodat"},
{325, "fchownat"},
{326, "futimesat"},
{327, "fstatat64"},
{328, "unlinkat"},
{329, "renameat"},
{330, "linkat"},
{331, "symlinkat"},
{332, "readlinkat"},
{333, "fchmodat"},
{334, "faccessat"},
{335, "pselect6"},
{336, "ppoll"},
{337, "unshare"},
{338, "set_robust_list"},
{339, "get_robust_list"},
{340, "splice"},
{341, "arm_sync_file_range"},
{342, "tee"},
{343, "vmsplice"},
{344, "move_pages"},
{345, "getcpu"},
{346, "epoll_pwait"},
{347, "kexec_load"},
{348, "utimensat"},
{349, "signalfd"},
{350, "timerfd_create"},
{351, "eventfd"},
{352, "fallocate"},
{353, "timerfd_settime"},
{354, "timerfd_gettime"},
{355, "signalfd4"},
{356, "eventfd2"},
{357, "epoll_create1"},
{358, "dup3"},
{359, "pipe2"},
{360, "inotify_init1"},
{361, "preadv"},
{362, "pwritev"},
{363, "rt_tgsigqueueinfo"},
{364, "perf_event_open"},
{365, "recvmmsg"},
{366, "accept4"},
{367, "fanotify_init"},
{368, "fanotify_mark"},
{369, "prlimit64"},
{370, "name_to_handle_at"},
{371, "open_by_handle_at"},
{372, "clock_adjtime"},
{373, "syncfs"},
{374, "sendmmsg"},
{375, "setns"},
{376, "process_vm_readv"},
{377, "process_vm_writev"},
{378, "kcmp"},
{379, "finit_module"},
{380, "sched_setattr"},
{381, "sched_getattr"},
{382, "renameat2"},
{383, "seccomp"},
{384, "getrandom"},
{385, "memfd_create"},
{386, "bpf"},
{387, "execveat"},
{388, "userfaultfd"},
{389, "membarrier"},
{390, "mlock2"},
{391, "copy_file_range"},
{392, "preadv2"},
{393, "pwritev2"},
{394, "pkey_mprotect"},
{395, "pkey_alloc"},
{396, "pkey_free"},
{397, "statx"},
/*
* Architecture-specific system calls.
* The numbers are defined right here, inside the initializer, as offsets
* from __ARM_NR_BASE; the macros stay visible for the rest of the file.
*/
#define __ARM_NR_BASE 0x0f0000
#define __ARM_NR_breakpoint (__ARM_NR_BASE+1)
#define __ARM_NR_cacheflush (__ARM_NR_BASE+2)
#define __ARM_NR_usr26 (__ARM_NR_BASE+3)
#define __ARM_NR_usr32 (__ARM_NR_BASE+4)
#define __ARM_NR_set_tls (__ARM_NR_BASE+5)
{__ARM_NR_breakpoint, "breakpoint"},
{__ARM_NR_cacheflush, "cacheflush"},
{__ARM_NR_usr26, "usr26"},
{__ARM_NR_usr32, "usr32"},
{__ARM_NR_set_tls, "set_tls"},
{-1, NULL} /* end-of-table sentinel */
};
/*
 * Translate a system call number into its name.
 *
 * syscall_table[] is scanned from the start until the terminating entry
 * (the first one with a negative number) is reached.
 *
 * @scno: system call number to look up.
 *
 * Returns the name stored in the matching table entry, or NULL when the
 * number is not listed.
 */
static char *find_syscall_name(int scno)
{
	struct syscall *entry = syscall_table;

	while (entry->no >= 0) {
		if (entry->no == scno)
			return entry->name;
		entry++;
	}

	return NULL;
}
/*
 * Set to 1 by signal_handler() to ask the trace loop to finish.
 * volatile sig_atomic_t is the object type the C standard allows a signal
 * handler to write to.
 */
static volatile sig_atomic_t stopped;

/*
 * Signal handler: request termination on SIGINT or SIGTERM.
 * Any other signal number leaves the flag untouched.
 *
 * @signo: number of the delivered signal.
 */
static void signal_handler(int signo)
{
	if (signo == SIGINT || signo == SIGTERM)
		stopped = 1;
}
/*
 * Attach to the running process given as <PID> and print every system call
 * it makes as "<name>() = <return value>" until SIGINT/SIGTERM is received
 * or the tracee terminates, then detach.
 *
 * The stop request is examined only while the tracee sits in a ptrace-stop,
 * because PTRACE_DETACH requires the tracee to be stopped.
 *
 * NOTE(review): PTRACE_O_TRACESYSGOOD is not set, so a signal-delivery stop
 * cannot be told apart from a system-call stop here; a tracee that receives
 * signals would throw the enter/exit pairing off. Acceptable for the demo.
 *
 * Exits with EXIT_FAILURE on a usage error or when attaching fails,
 * otherwise returns 0.
 */
int main(int argc, char *argv[])
{
	pid_t child;
	int status;
	int in_stop; /* non-zero while the tracee is known to be in a ptrace-stop */
	struct user_regs regs;
	unsigned long int scret, scno;
	const char *name;
	char *end;
	long pid;

	if (argc != 2) {
		printf("Usage: %s <PID>\n", argv[0]);
		exit(EXIT_FAILURE);
	}

	/* strtol() instead of atoi(): reject empty, non-numeric or non-positive input. */
	pid = strtol(argv[1], &end, 10);
	if (end == argv[1] || *end != '\0' || pid <= 0 || (long)(pid_t)pid != pid) {
		fprintf(stderr, "%s: invalid PID '%s'\n", argv[0], argv[1]);
		exit(EXIT_FAILURE);
	}
	child = (pid_t)pid;

	if (signal(SIGINT, signal_handler) == SIG_ERR ||
	    signal(SIGTERM, signal_handler) == SIG_ERR) {
		perror("signal");
		exit(EXIT_FAILURE);
	}

	if (ptrace(PTRACE_ATTACH, child, NULL, NULL) < 0) {
		perror("ptrace");
		exit(EXIT_FAILURE);
	}

	/* Wait for the stop triggered by the SIGSTOP that PTRACE_ATTACH sends. */
	if (wait(&status) < 0) {
		perror("wait");
		exit(EXIT_FAILURE);
	}
	in_stop = WIFSTOPPED(status);

	while (in_stop && !stopped) {
		/* Resume the tracee until it enters a system call. */
		if (ptrace(PTRACE_SYSCALL, child, NULL, NULL) < 0)
			break;
		in_stop = 0;
		if (wait(&status) < 0 || !WIFSTOPPED(status))
			break;
		in_stop = 1;

		/*
		 * If the register state at system-call entry is of interest,
		 * fetch it here with ptrace(PTRACE_GETREGS).
		 */

		/* Resume the tracee until the system call returns. */
		if (ptrace(PTRACE_SYSCALL, child, NULL, NULL) < 0)
			break;
		in_stop = 0;
		if (wait(&status) < 0 || !WIFSTOPPED(status))
			break;
		in_stop = 1;

		if (ptrace(PTRACE_GETREGS, child, NULL, &regs) < 0) {
			perror("ptrace(PTRACE_GETREGS)");
			break;
		}
		scret = regs.uregs[0]; /* r0 holds the system call return value */
		scno = regs.uregs[7];  /* r7 holds the system call number */

		/* Numbers missing from the table must not reach "%s" as NULL. */
		name = find_syscall_name((int)scno);
		if (name != NULL)
			printf("%s() = %d\n", name, (int)scret);
		else
			printf("syscall_%lu() = %d\n", scno, (int)scret);
	}

	/* Detach only from a stopped tracee; the request fails otherwise. */
	if (in_stop && ptrace(PTRACE_DETACH, child, NULL, NULL) < 0)
		perror("ptrace(PTRACE_DETACH)");

	return 0;
}
被追踪程序的代码:
/*
* tracee2.c
*/
#include <unistd.h>
#include <stdio.h>
/*
 * Tracee of the attach demo: writes one '@' to standard output every
 * second, forever.
 */
int main(void)
{
	static const char mark[] = "@";

	while (1) {
		write(fileno(stdout), mark, 1);
		sleep(1);
	}

	return 0;
}
编译:
arm-linux-gnueabihf-gcc -o syscall_trace syscall_trace.c # 编译 tracer 程序
arm-linux-gnueabihf-gcc -o tracee2 tracee2.c # 编译 tracee 程序
将
syscall_trace
和
tracee2
程序放到根文件系统的
/bin
目录,然后用 QEMU 启动系统,登录系统后运行测试程序:
$ tracee2 &
[1] 927
$ syscall_trace 927
nanosleep() = 0
@write() = 1
nanosleep() = 0
@write() = 1
nanosleep() = 0
@write() = 1
nanosleep() = 0
@write() = 1
nanosleep() = 0
@write() = 1
nanosleep() = 0
@write() = 1
nanosleep() = 0
@write() = 1
nanosleep() = 0
@write() = 1
^Cnanosleep() = 0
其中
@
是被追踪程序
tracee2
的输出内容。
程序
syscall_trace
中的
ptrace(PTRACE_ATTACH)
调用将向程序
tracee2
发送
SIGSTOP
信号;
tracee2
处理
SIGSTOP
信号时,向
syscall_trace
发送
SIGCHLD
信号,然后暂停执行,等待
syscall_trace
将其唤醒。
syscall_trace
后续通过
ptrace(PTRACE_SYSCALL)
唤醒
tracee2
,并开始对
tracee2
程序中系统调用的追踪。
接下来,我们来看
ptrace(PTRACE_ATTACH)
和
ptrace(PTRACE_DETACH)
的细节。
sys_ptrace(PTRACE_ATTACH, child, NULL, NULL)
...
child = ptrace_get_task_struct(pid); /* 被追踪进程 tracee2 的 task_struct */
if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
ret = ptrace_attach(child, request, addr, data)
task->ptrace = PT_PTRACED;
ptrace_link(task, current)
__ptrace_link(child, new_parent, __task_cred(new_parent))
list_add(&child->ptrace_entry, &new_parent->ptraced); /* 添加到 tracer 的 tracee 列表 */
child->parent = new_parent; /* 重置父进程为 tracer */
/* 给 tracee 进程 @task 发送 SIGSTOP 信号,让它暂停执行 */
if (!seize)
send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
if (!ret)
arch_ptrace_attach(child);
goto out_put_task_struct;
}
...
out_put_task_struct:
put_task_struct(child);
out:
return ret;
}
被追踪程序
tracee2
处理
SIGSTOP
信号:
do_work_pending()
if (thread_flags & _TIF_SIGPENDING) {
do_signal(regs, syscall)
get_signal(&ksig)
...
for (;;) {
...
signr = dequeue_synchronous_signal(&ksig->info);
if (!signr)
signr = dequeue_signal(current, &current->blocked, &ksig->info);
if (!signr)
break; /* will return 0 */
if (unlikely(current->ptrace) && signr != SIGKILL) {
signr = ptrace_signal(signr, &ksig->info)
ptrace_stop(signr, CLD_TRAPPED, 0, info)
...
/* 设置进程状态为 TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) */
set_special_state(TASK_TRACED);
...
if (may_ptrace_stop()) {
/* 给 tracer 程序 syscall_trace 发送 SIGCHLD 信号并唤醒它:syscall_trace 在 wait() 处等待 */
do_notify_parent_cldstop(current, true, why);
...
/*
* 将子进程停下来,直到父进程将其唤醒继续执行。
* 在我们的场景下,父进程通过 ptrace(PTRACE_SYSCALL) 将子进程加入调度继续执行。
*/
freezable_schedule();
}
}
}
}
ptrace(PTRACE_DETACH)
停止
syscall_trace
对
tracee2
的追踪:
sys_ptrace(PTRACE_DETACH, child, NULL, NULL)
...
child = ptrace_get_task_struct(pid); /* 被追踪进程 tracee2 的 task_struct */
...
ret = arch_ptrace(child, request, addr, data);
...
ret = ptrace_request(child, request, addr, data);
switch (request) {
...
case PTRACE_DETACH: /* detach a process that was attached. */
ret = ptrace_detach(child, data);
...
child->exit_code = data;
__ptrace_detach(current, child);
__ptrace_unlink(p);
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); /* 清除系统调用追踪状态 */
child->parent = child->real_parent; /* 恢复为真正的父进程(通常是 bash),而不是 tracer 进程 */
list_del_init(&child->ptrace_entry);
...
child->ptrace = 0; /* 移除被追踪状态 */
...
...
do_notify_parent(p, p->exit_signal); /* 给真正的父进程发送信号 ,告知自己的状态变化:不再被追踪了 */
break;
...
}
...
out_put_task_struct:
put_task_struct(child);
out:
return ret;
我们用下图来看
ptrace(PTRACE_ATTACH)
和
ptrace(PTRACE_DETACH)
对追踪程序
syscall_trace
和 被追踪进程
tracee2
进程关系的影响:
6. 参考资料