0%

Sysdig Source Code Analysis

一、前言

公司hids平台目前的事件监控维度相对较少,于是老大借鉴sysdig手撸了一个内核模块从kernel解析syscall事件,以此增加监控的事件类型,也为后续的产品检测能力增强提供了数据基础。但是dalao手撸的内核模块存在一些bug,在帮dalao找bug、保证内核模块稳定性的过程中深入了解了sysdig这个项目本身的一些代码层面的东西,同时也解开了之前看falco时候的许多代码执行流程上的困惑,非常愉悦,这边做下记录和总结。

二、架构

放一下sysdig整体架构图并做下大概说明:

主要是用户态、内核态两块,sysdig-probe在内核态通过tracepoint解析syscalls做事件收集,封装好的事件数据写入一个sysdig自己实现的ringbuffer;用户态scap从ringbuffer取数据放入特定结构体交给sinsp解析组成具体事件,sinsp将组装好的事件继续向上传递给sysdig CLI,由其根据用户输入的参数通过规则引擎对上传的事件进行过滤等,将用户想要的事件结果进行输出展示。sysdig最基本的流程和主要功能就是这样。

image-20210529182714338

流程理解了,现在有一些细节需要去深入,总结出如下几个问题:

1、内核模块是如何通过tracepoint收集事件的

2、sysdig中ringbuffer是如何实现的

3、scap如何从ringbuffer中取事件的,取出来的是什么,并且还做了什么操作

4、sinsp如何通过scap传递的数据解析出事件的,除了解析事件,还做了什么操作

主要就是如上几个问题,sysdig CLI的命令解析那边比较简单且暂时没有需求接触到,所以暂时不会进行深入分析。

三、源码分析

driver-main.c-sysdig_init

首先看driver的初始执行函数sysdig_init,其中在开头执行了get_tracepoint_handles函数,进入其中查看:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
int sysdig_init(void)
{
dev_t dev;
unsigned int cpu;
unsigned int num_cpus;
int ret;
int acrret = 0;
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0))
int hp_ret;
#endif
int j;
int n_created_devices = 0;
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
struct device *device = NULL;
#else
struct class_device *device = NULL;
#endif
pr_info("driver loading, " PROBE_NAME " " PROBE_VERSION "\n");

ret = get_tracepoint_handles();
if (ret < 0)
goto init_module_err;

num_cpus = 0;
for_each_possible_cpu(cpu) {
++num_cpus;
}

进入get_tracepoint_handles,发现其调用了for_each_kernel_tracepoint(visit_tracepoint, NULL);。在内核源码树里可以找到for_each_kernel_tracepoint函数的定义:它遍历内核中所有的tracepoint,并对每个tracepoint调用作为参数传入的回调函数。继续查看回调函数visit_tracepoint,它会把当前tracepoint的名字与sysdig指定的tracepoint名做比较,若一致,则将该内核tracepoint结构体指针保存到对应的全局变量中;这些指定的tracepoint即为sysdig需要监控的tracepoint,通过这种方式就拿到了所需tracepoint的结构体指针。for_each_kernel_tracepoint函数返回后,get_tracepoint_handles会检查这些全局指针是否都已成功获取:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/*
 * Look up the kernel tracepoints this driver hooks.
 *
 * for_each_kernel_tracepoint() runs visit_tracepoint() on every kernel
 * tracepoint; that callback stores the matching ones in the tp_* globals.
 * Afterwards each required global is checked.
 *
 * Returns 0 on success, or -ENOENT as soon as a mandatory tracepoint is
 * found to be missing. Missing page-fault tracepoints are not fatal: they
 * only set g_fault_tracepoint_disabled.
 */
static int get_tracepoint_handles(void)
{
	for_each_kernel_tracepoint(visit_tracepoint, NULL);

	/* Mandatory tracepoints: bail out on the first one that is absent. */
	if (tp_sys_enter == NULL) {
		pr_err("failed to find sys_enter tracepoint\n");
		return -ENOENT;
	}

	if (tp_sys_exit == NULL) {
		pr_err("failed to find sys_exit tracepoint\n");
		return -ENOENT;
	}

	if (tp_sched_process_exit == NULL) {
		pr_err("failed to find sched_process_exit tracepoint\n");
		return -ENOENT;
	}

#ifdef CAPTURE_CONTEXT_SWITCHES
	if (tp_sched_switch == NULL) {
		pr_err("failed to find sched_switch tracepoint\n");
		return -ENOENT;
	}
#endif

#ifdef CAPTURE_SIGNAL_DELIVERIES
	if (tp_signal_deliver == NULL) {
		pr_err("failed to find signal_deliver tracepoint\n");
		return -ENOENT;
	}
#endif

#ifdef CAPTURE_PAGE_FAULTS
	/* Optional tracepoints: only record that page-fault capture is off. */
	if (tp_page_fault_user == NULL) {
		pr_notice("failed to find page_fault_user tracepoint, disabling page-faults\n");
		g_fault_tracepoint_disabled = true;
	}

	if (tp_page_fault_kernel == NULL) {
		pr_notice("failed to find page_fault_kernel tracepoint, disabling page-faults\n");
		g_fault_tracepoint_disabled = true;
	}
#endif

	return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
/**
* for_each_kernel_tracepoint - iteration on all kernel tracepoints
* @fct: callback
* @priv: private data
*
* Thin wrapper: hands @fct and @priv to for_each_tracepoint_range() together
* with the __start___tracepoints_ptrs / __stop___tracepoints_ptrs bounds.
*/
void for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv),
void *priv)
{
for_each_tracepoint_range(__start___tracepoints_ptrs,
__stop___tracepoints_ptrs, fct, priv);
}
EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint);
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/*
 * Callback handed to for_each_kernel_tracepoint().
 *
 * Compares the name of @tp against the tracepoints the driver hooks and,
 * on a match, stores @tp in the corresponding tp_* global. Tracepoints with
 * any other name are ignored. @priv is unused.
 */
static void visit_tracepoint(struct tracepoint *tp, void *priv)
{
	const char *tp_name = tp->name;

	if (strcmp(tp_name, "sys_enter") == 0)
		tp_sys_enter = tp;
	else if (strcmp(tp_name, "sys_exit") == 0)
		tp_sys_exit = tp;
	else if (strcmp(tp_name, "sched_process_exit") == 0)
		tp_sched_process_exit = tp;
#ifdef CAPTURE_CONTEXT_SWITCHES
	else if (strcmp(tp_name, "sched_switch") == 0)
		tp_sched_switch = tp;
#endif
#ifdef CAPTURE_SIGNAL_DELIVERIES
	else if (strcmp(tp_name, "signal_deliver") == 0)
		tp_signal_deliver = tp;
#endif
#ifdef CAPTURE_PAGE_FAULTS
	else if (strcmp(tp_name, "page_fault_user") == 0)
		tp_page_fault_user = tp;
	else if (strcmp(tp_name, "page_fault_kernel") == 0)
		tp_page_fault_kernel = tp;
#endif
}

alloc_chrdev_region函数向内核动态申请设备号,其中参数含义如下,dev:函数向内核申请下来的设备号,baseminor:次设备号的起始,count:申请次设备号个数,name:设备名(cat /proc/devices中显示的设备名称);之后使用kmalloc_array函数以g_ppm_numdevs变量的值作为元素个数,ppm_device结构体的size作为元素大小创建结构体数组,返回一个指向该数组所在内存首地址的指针给g_ppm_devs;g_ppm_numdevs是前面通过for_each_possible_cpu统计出的cpu个数,放在这边应该是想为每个cpu创建一个对应的设备(及ringbuffer):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
	acrret = alloc_chrdev_region(&dev, 0, num_cpus + 1, PROBE_DEVICE_NAME);
if (acrret < 0) {
pr_err("could not allocate major number for %s\n", PROBE_DEVICE_NAME);
ret = -ENOMEM;
goto init_module_err;
}

g_ppm_class = class_create(THIS_MODULE, PROBE_DEVICE_NAME);
if (IS_ERR(g_ppm_class)) {
pr_err("can't allocate device class\n");
ret = -EFAULT;
goto init_module_err;
}

#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
g_ppm_class->devnode = ppm_devnode;
#endif

g_ppm_major = MAJOR(dev);
g_ppm_numdevs = num_cpus;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 4, 0)
g_ppm_devs = kmalloc(g_ppm_numdevs * sizeof(struct ppm_device), GFP_KERNEL);
#else
g_ppm_devs = kmalloc_array(g_ppm_numdevs, sizeof(struct ppm_device), GFP_KERNEL);
#endif
if (!g_ppm_devs) {
pr_err("can't allocate devices\n");
ret = -ENOMEM;
goto init_module_err;
}

使用cdev_init函数初始化cdev字符设备,第一个参数为将要被初始化的设备结构体指针,第二个参数为该设备对应的文件操作函数地址,cdev_init执行完成后,cdev即与file_operations完成绑定,完成绑定后,用户态应用程序可以通过cdev设备绑定的这些函数(系统调用)操纵此字符设备;使用cdev_add向系统添加初始化后的cdev设备以完成注册:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
for (j = 0; j < g_ppm_numdevs; ++j) {
cdev_init(&g_ppm_devs[j].cdev, &g_ppm_fops);
g_ppm_devs[j].dev = MKDEV(g_ppm_major, j);

if (cdev_add(&g_ppm_devs[j].cdev, g_ppm_devs[j].dev, 1) < 0) {
pr_err("could not allocate chrdev for %s\n", PROBE_DEVICE_NAME);
ret = -EFAULT;
goto init_module_err;
}

#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
device = device_create(
#else
device = class_device_create(
#endif
g_ppm_class, NULL, /* no parent device */
g_ppm_devs[j].dev,
NULL, /* no additional data */
PROBE_DEVICE_NAME "%d",
j);

if (IS_ERR(device)) {
pr_err("error creating the device for %s\n", PROBE_DEVICE_NAME);
cdev_del(&g_ppm_devs[j].cdev);
ret = -EFAULT;
goto init_module_err;
}

init_waitqueue_head(&g_ppm_devs[j].read_queue);
n_created_devices++;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
* cdev_init() - initialize a cdev structure
* @cdev: the structure to initialize
* @fops: the file_operations for this device
*
* Initializes @cdev, remembering @fops, making it ready to add to the
* system with cdev_add().
*/
void cdev_init(struct cdev *cdev, const struct file_operations *fops)
{
/* Start from an all-zero structure before setting individual members. */
memset(cdev, 0, sizeof *cdev);
INIT_LIST_HEAD(&cdev->list);
kobject_init(&cdev->kobj, &ktype_cdev_default);
/* Bind the file operations; this is the only use of @fops here. */
cdev->ops = fops;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/**
* cdev_add() - add a char device to the system
* @p: the cdev structure for the device
* @dev: the first device number for which this device is responsible
* @count: the number of consecutive minor numbers corresponding to this
* device
*
* cdev_add() adds the device represented by @p to the system, making it
* live immediately. A negative error code is returned on failure.
*/
int cdev_add(struct cdev *p, dev_t dev, unsigned count)
{
int error;

/* Record the device number range in the cdev itself. */
p->dev = dev;
p->count = count;

/* Register the range in cdev_map with @p as the associated data. */
error = kobj_map(cdev_map, dev, count, NULL,
exact_match, exact_lock, p);
if (error)
return error;

/* Only reached on success: take a reference on the parent kobject. */
kobject_get(p->kobj.parent);

return 0;
}

sysdig_init模块初始化函数的后半部分主要是一些初始化模块异常处理、字符设备卸载等操作,重要程度相对较低,暂不分析:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
	if (dpi_lookahead_init() != PPM_SUCCESS) {
pr_err("initializing lookahead-based snaplen failed\n");
ret = -EFAULT;
goto init_module_err;
}

/*
* Set up our callback in case we get a hotplug even while we are
* initializing the cpu structures
*/
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0))
hp_ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
"sysdig/probe:online",
sysdig_cpu_online,
sysdig_cpu_offline);
if (hp_ret <= 0) {
pr_err("error registering cpu hotplug callback\n");
ret = hp_ret;
goto init_module_err;
}
hp_state = hp_ret;
#else
register_cpu_notifier(&cpu_notifier);
#endif

/*
* All ok. Final initializations.
*/
g_tracepoint_registered = false;

return 0;

init_module_err:
for (j = 0; j < n_created_devices; ++j) {
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
device_destroy(
#else
class_device_destroy(
#endif
g_ppm_class, g_ppm_devs[j].dev);

cdev_del(&g_ppm_devs[j].cdev);
}

if (g_ppm_class)
class_destroy(g_ppm_class);

if (acrret == 0)
unregister_chrdev_region(dev, g_ppm_numdevs);

kfree(g_ppm_devs);

return ret;
}

之后主要是看用户态应用程序在对字符设备文件做特定系统调用时(open、ioctl、etc.)触发的内核回调了:

1
2
3
4
5
6
7
/*
 * File operations of the driver's character devices. Each member names the
 * handler for one operation on the device file (open, release, mmap, ioctl).
 */
static const struct file_operations g_ppm_fops = {
.open = ppm_open,
.release = ppm_release,
.mmap = ppm_mmap,
.unlocked_ioctl = ppm_ioctl,
.owner = THIS_MODULE,
};

ppm_open:

主要看下tracepoint注册相关(其实除了tracepoint注册,还有很多像ringbuffer初始化,consumer结构初始化、cpu调度、同步等等操作,但是太菜还不是很了解,这边主要看下函数的核心逻辑),都是调内核函数tracepoint_probe_register完成tracepoint注册回调,不再赘述:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
	reset_ring_buffer(ring);
ring->open = true;

if (!g_tracepoint_registered) {
pr_info("starting capture\n");
/*
* Enable the tracepoints
*/

#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
ret = compat_register_trace(syscall_exit_probe, "sys_exit", tp_sys_exit);
#else
ret = register_trace_syscall_exit(syscall_exit_probe);
#endif
if (ret) {
pr_err("can't create the sys_exit tracepoint\n");
goto err_sys_exit;
}

#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
ret = compat_register_trace(syscall_enter_probe, "sys_enter", tp_sys_enter);
#else
ret = register_trace_syscall_enter(syscall_enter_probe);
#endif
if (ret) {
pr_err("can't create the sys_enter tracepoint\n");
goto err_sys_enter;
}

ret = compat_register_trace(syscall_procexit_probe, "sched_process_exit", tp_sched_process_exit);
if (ret) {
pr_err("can't create the sched_process_exit tracepoint\n");
goto err_sched_procexit;
}

#ifdef CAPTURE_CONTEXT_SWITCHES
ret = compat_register_trace(sched_switch_probe, "sched_switch", tp_sched_switch);
if (ret) {
pr_err("can't create the sched_switch tracepoint\n");
goto err_sched_switch;
}
#endif

#ifdef CAPTURE_SIGNAL_DELIVERIES
ret = compat_register_trace(signal_deliver_probe, "signal_deliver", tp_signal_deliver);
if (ret) {
pr_err("can't create the signal_deliver tracepoint\n");
goto err_signal_deliver;
}
#endif
g_tracepoint_registered = true;
}

ret = 0;

goto cleanup_open;

#ifdef CAPTURE_SIGNAL_DELIVERIES
err_signal_deliver:
compat_unregister_trace(sched_switch_probe, "sched_switch", tp_sched_switch);
#endif
err_sched_switch:
compat_unregister_trace(syscall_procexit_probe, "sched_process_exit", tp_sched_process_exit);
err_sched_procexit:
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
compat_unregister_trace(syscall_enter_probe, "sys_enter", tp_sys_enter);
#else
unregister_trace_syscall_enter(syscall_enter_probe);
#endif
err_sys_enter:
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
compat_unregister_trace(syscall_exit_probe, "sys_exit", tp_sys_exit);
#else
unregister_trace_syscall_exit(syscall_exit_probe);
#endif
err_sys_exit:
ring->open = false;
err_init_ring_buffer:
check_remove_consumer(consumer, in_list);
cleanup_open:
mutex_unlock(&g_consumer_mutex);

return ret;
}
1
2
3
4
5
6
7
8
/*
 * Register @func as a probe on a tracepoint, hiding the kernel API change.
 *
 * Kernels older than 3.15 register by name (@probename) through
 * TRACEPOINT_PROBE_REGISTER; newer kernels register through the tracepoint
 * pointer (@tp) with tracepoint_probe_register() and NULL private data.
 * The underlying call's return value is returned unchanged.
 */
static int compat_register_trace(void *func, const char *probename, struct tracepoint *tp)
{
#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
return TRACEPOINT_PROBE_REGISTER(probename, func);
#else
return tracepoint_probe_register(tp, func, NULL);
#endif
}

选择最典型的sys_enter/sys_exit进行分析(两者的探针函数结构基本一致,下面贴出的代码是sys_exit对应的syscall_exit_probe):

sys_enter可以看作是一个总入口,用户态做任何系统调用都会经过sys_enter这个tracepoint(系统调用返回时则经过sys_exit)。探针函数通过所在上下文的regs和current(task_struct结构),调syscall_get_nr拿到当前所执行系统调用的系统调用号id;用id减去SYSCALL_TABLE_ID0得到table_index,以它作为cur_g_syscall_table(默认指向g_syscall_table)的下标,从表项的flags中拿到used、drop_flags标志,并取得事件类型(enter探针取enter_event_type,下面的exit探针取exit_event_type);将其它数据如事件类别category(系统调用|上下文切换|信号|缺页中断)、寄存器组结构regs、系统调用号id等赋给event_data事件结构体;最后调用record_event_all_consumers函数,猜测是将事件发送给对应consumer所在的ringbuffer处理:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
TRACEPOINT_PROBE(syscall_exit_probe, struct pt_regs *regs, long ret)
{
int id;
long table_index;
const struct syscall_evt_pair *cur_g_syscall_table = g_syscall_table;
const enum ppm_syscall_code *cur_g_syscall_code_routing_table = g_syscall_code_routing_table;
bool compat = false;
#ifdef __NR_socketcall
int socketcall_syscall = __NR_socketcall;
#else
int socketcall_syscall = -1;
#endif

id = syscall_get_nr(current, regs);

#if defined(CONFIG_X86_64) && defined(CONFIG_IA32_EMULATION)
/*
* When a process does execve from 64bit to 32bit, TS_COMPAT is marked true
* but the id of the syscall is __NR_execve, so to correctly parse it we need to
* use 64bit syscall table. On 32bit __NR_execve is equal to __NR_ia32_oldolduname
* which is a very old syscall, not used anymore by most applications
*/
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)
if (in_ia32_syscall() && id != __NR_execve) {
#else
if (unlikely((task_thread_info(current)->status & TS_COMPAT) && id != __NR_execve)) {
#endif
cur_g_syscall_table = g_syscall_ia32_table;
cur_g_syscall_code_routing_table = g_syscall_ia32_code_routing_table;
socketcall_syscall = __NR_ia32_socketcall;
compat = true;
}
#endif

g_n_tracepoint_hit_inc();

table_index = id - SYSCALL_TABLE_ID0;
if (likely(table_index >= 0 && table_index < SYSCALL_TABLE_SIZE)) {
struct event_data_t event_data;
int used = cur_g_syscall_table[table_index].flags & UF_USED;
enum syscall_flags drop_flags = cur_g_syscall_table[table_index].flags;
enum ppm_event_type type;

/*
* Simple mode event filtering
*/
if (g_simple_mode_enabled) {
if ((drop_flags & UF_SIMPLEDRIVER_KEEP) == 0) {
return;
}
}

#ifdef _HAS_SOCKETCALL
if (id == socketcall_syscall) {
used = true;
drop_flags = UF_NEVER_DROP;
type = PPME_GENERIC_X;
} else
type = cur_g_syscall_table[table_index].exit_event_type;
#else
type = cur_g_syscall_table[table_index].exit_event_type;
#endif

event_data.category = PPMC_SYSCALL;
event_data.event_info.syscall_data.regs = regs;
event_data.event_info.syscall_data.id = id;
event_data.event_info.syscall_data.cur_g_syscall_code_routing_table = cur_g_syscall_code_routing_table;
event_data.socketcall_syscall = socketcall_syscall;
event_data.compat = compat;

if (used)
record_event_all_consumers(type, drop_flags, &event_data);
else
record_event_all_consumers(PPME_GENERIC_X, UF_ALWAYS_DROP, &event_data);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/**
* syscall_get_nr - find what system call a task is executing
* @task: task of interest, must be blocked
* @regs: task_pt_regs() of @task
*
* If @task is executing a system call or is at system call
* tracing about to attempt one, returns the system call number.
* If @task is not executing a system call, i.e. it's blocked
* inside the kernel for a fault or signal, returns -1.
*
* Note this returns int even on 64-bit machines. Only 32 bits of
* system call number can be meaningful. If the actual arch value
* is 64 bits, this truncates to 32 bits so 0xffffffff means -1.
*
* It's only valid to call this when @task is known to be blocked.
*/
int syscall_get_nr(struct task_struct *task, struct pt_regs *regs);

record_event_all_consumers调用了record_event_consumer,首先对事件类型做过滤,检测对应类型的事件是否需要被丢弃;需要注意的是,这边对capture_enabled做了检查,而这个变量是需要用户态应用程序主动通过ioctl发送特定命令来置位的,具体可参考file_operations中的ppm_ioctl回调:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
if (!test_bit(event_type, g_events_mask))
return res;

if (event_type != PPME_DROP_E && event_type != PPME_DROP_X) {
if (consumer->need_to_insert_drop_e == 1)
record_drop_e(consumer, ns, drop_flags);
else if (consumer->need_to_insert_drop_x == 1)
record_drop_x(consumer, ns, drop_flags);

if (drop_event(consumer,
event_type,
drop_flags,
ns,
event_datap->event_info.syscall_data.regs))
return res;
}

/*
* FROM THIS MOMENT ON, WE HAVE TO BE SUPER FAST
*/
cpu = get_cpu();
ring = per_cpu_ptr(consumer->ring_buffers, cpu);
ASSERT(ring);

ring_info = ring->info;

if (!ring->capture_enabled) {
put_cpu();
return res;
}

ring_info->n_evts++;
if (event_datap->category == PPMC_CONTEXT_SWITCH && event_datap->event_info.context_data.sched_prev != NULL) {
if (event_type != PPME_SYSDIGEVENT_E && event_type != PPME_CPU_HOTPLUG_E) {
ASSERT(event_datap->event_info.context_data.sched_prev != NULL);
ASSERT(event_datap->event_info.context_data.sched_next != NULL);
ring_info->n_context_switches++;
}
}

对ringbuffer的剩余空间进行计算,需要注意的是,sysdig是从head指针指向的位置写,从tail指针指向的位置开始读,这和常用的读写标志应用的正好相反;ringbuffer是环形的,需要根据head和tail的相对位置将其分为两种状态对freespace进行计算,freespace计算出后,自然就能得到usedspace:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
head = ring_info->head;
ttail = ring_info->tail;

if (ttail > head)
freespace = ttail - head - 1;
else
freespace = RING_BUF_SIZE + ttail - head - 1;

usedspace = RING_BUF_SIZE - freespace - 1;
delta_from_end = RING_BUF_SIZE + (2 * PAGE_SIZE) - head - 1;

ASSERT(freespace <= RING_BUF_SIZE);
ASSERT(usedspace <= RING_BUF_SIZE);
ASSERT(ttail <= RING_BUF_SIZE);
ASSERT(head <= RING_BUF_SIZE);
ASSERT(delta_from_end < RING_BUF_SIZE + (2 * PAGE_SIZE));
ASSERT(delta_from_end > (2 * PAGE_SIZE) - 1);

新建一个专用于事件参数填充的结构体args,根据事件类型从g_event_info表中将对应事件的参数数量取出,并用参数个数*sizeof(u16)(即每个参数2字节)得到arg_data_offset;从代码注释看,这部分空间用来存放每个参数的长度,参数数据本身则从这个偏移之后开始写;之后判断freespace是否不小于ppm_evt_hdr结构体大小与arg_data_offset之和,即ringbuffer的剩余空间至少要能放下事件头和参数长度数组;若满足,则让hdr指向ringbuffer+head处,将ns、tid(current->pid)、event_type、nargs等evt_header(即事件头)字段写入ringbuffer,并把args.buffer设置为事件头之后的位置(ring->buffer + head + sizeof(struct ppm_evt_hdr));否则drop此事件:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
args.nargs = g_event_info[event_type].nparams;
args.arg_data_offset = args.nargs * sizeof(u16);

/*
* Make sure we have enough space for the event header.
* We need at least space for the header plus 16 bit per parameter for the lengths.
*/
if (likely(freespace >= sizeof(struct ppm_evt_hdr) + args.arg_data_offset)) {
/*
* Populate the header
*/
struct ppm_evt_hdr *hdr = (struct ppm_evt_hdr *)(ring->buffer + head);

#ifdef PPM_ENABLE_SENTINEL
hdr->sentinel_begin = ring->nevents;
#endif
hdr->ts = ns;
hdr->tid = current->pid;
hdr->type = event_type;
hdr->nparams = args.nargs;

/*
* Populate the parameters for the filler callback
*/
args.consumer = consumer;
args.buffer = ring->buffer + head + sizeof(struct ppm_evt_hdr);
#ifdef PPM_ENABLE_SENTINEL
args.sentinel = ring->nevents;
#endif
args.buffer_size = min(freespace, delta_from_end) - sizeof(struct ppm_evt_hdr); /* freespace is guaranteed to be bigger than sizeof(struct ppm_evt_hdr) */
args.event_type = event_type;

if (event_datap->category == PPMC_SYSCALL) {
args.regs = event_datap->event_info.syscall_data.regs;
args.syscall_id = event_datap->event_info.syscall_data.id;
args.cur_g_syscall_code_routing_table = event_datap->event_info.syscall_data.cur_g_syscall_code_routing_table;
args.compat = event_datap->compat;
} else {
args.regs = NULL;
args.syscall_id = -1;
args.cur_g_syscall_code_routing_table = NULL;
args.compat = false;
}

if (event_datap->category == PPMC_CONTEXT_SWITCH) {
args.sched_prev = event_datap->event_info.context_data.sched_prev;
args.sched_next = event_datap->event_info.context_data.sched_next;
} else {
args.sched_prev = NULL;
args.sched_next = NULL;
}

if (event_datap->category == PPMC_SIGNAL) {
args.signo = event_datap->event_info.signal_data.sig;
if (event_datap->event_info.signal_data.info == NULL) {
args.spid = (__kernel_pid_t) 0;
} else if (args.signo == SIGKILL) {
args.spid = event_datap->event_info.signal_data.info->_sifields._kill._pid;
} else if (args.signo == SIGTERM || args.signo == SIGHUP || args.signo == SIGINT ||
args.signo == SIGTSTP || args.signo == SIGQUIT) {
if (event_datap->event_info.signal_data.info->si_code == SI_USER ||
event_datap->event_info.signal_data.info->si_code == SI_QUEUE ||
event_datap->event_info.signal_data.info->si_code <= 0) {
args.spid = event_datap->event_info.signal_data.info->si_pid;
}
} else if (args.signo == SIGCHLD) {
args.spid = event_datap->event_info.signal_data.info->_sifields._sigchld._pid;
} else if (args.signo >= SIGRTMIN && args.signo <= SIGRTMAX) {
args.spid = event_datap->event_info.signal_data.info->_sifields._rt._pid;
} else {
args.spid = (__kernel_pid_t) 0;
}
} else {
args.signo = 0;
args.spid = (__kernel_pid_t) 0;
}
args.dpid = current->pid;

if (event_datap->category == PPMC_PAGE_FAULT)
args.fault_data = event_datap->event_info.fault_data;

args.curarg = 0;
args.arg_data_size = args.buffer_size - args.arg_data_offset;
args.nevents = ring->nevents;
args.str_storage = ring->str_storage;
args.enforce_snaplen = false;

根据事件类型从g_ppm_events表中找到对应事件的回调函数,将args结构体地址作为参数传入,回调函数会获取此事件的所有参数,并通过args写入ringbuffer;需要注意的是,这边用了多层宏,导致一开始较难发现实际指向的回调,认真分析后发现这些回调定义在ppm_fillers.c中:

1
2
3
4
5
6
if (likely(g_ppm_events[event_type].filler_callback)) {
cbres = g_ppm_events[event_type].filler_callback(&args);
} else {
pr_err("corrupted filler for event type %d: NULL callback\n", event_type);
ASSERT(0);
}
1
2
3
4
5
6
7
8
9
10
const struct ppm_event_entry g_ppm_events[PPM_EVENT_MAX] = {
[PPME_GENERIC_E] = {FILLER_REF(sys_generic)},
[PPME_GENERIC_X] = {FILLER_REF(sys_generic)},
[PPME_SYSCALL_OPEN_E] = {FILLER_REF(sys_empty)},
[PPME_SYSCALL_OPEN_X] = {FILLER_REF(sys_open_x)},
[PPME_SYSCALL_CLOSE_E] = {FILLER_REF(sys_single)},
[PPME_SYSCALL_CLOSE_X] = {FILLER_REF(sys_single_x)},
[PPME_SYSCALL_READ_E] = {FILLER_REF(sys_autofill), 2, APT_REG, {{0}, {2} } },
[PPME_SYSCALL_READ_X] = {FILLER_REF(sys_read_x)},
[PPME_SYSCALL_WRITE_E] = {FILLER_REF(sys_autofill), 2, APT_REG, {{0}, {2} } },
1
2
3
4
5
/*
 * FILLER_REF(x) expands to two comma-separated initializers: the filler
 * function f_x followed by the enum value PPM_FILLER_x. When neither
 * __KERNEL__ nor UDIG is defined the function slot is 0 instead.
 */
#if defined(__KERNEL__) || defined(UDIG)
#define FILLER_REF(x) f_##x, PPM_FILLER_##x
#else
#define FILLER_REF(x) 0, PPM_FILLER_##x
#endif /* __KERNEL__ */
1
2
3
4
5
6
7
8
9
10
/*
 * Generate the ppm_filler_id enum: FILLER_ENUM_FN turns a filler name x into
 * the enumerator PPM_FILLER_x. FILLER_LIST_MAPPER is defined elsewhere;
 * presumably it applies the given macro to every filler name - TODO confirm.
 * PPM_FILLER_MAX is the last enumerator.
 */
#define FILLER_ENUM_FN(x) PPM_FILLER_##x,
enum ppm_filler_id {
FILLER_LIST_MAPPER(FILLER_ENUM_FN)
PPM_FILLER_MAX
};
#undef FILLER_ENUM_FN

/*
 * Same list, second use: FILLER_PROTOTYPE_FN declares the prototype
 * int f_x(struct event_filler_arguments *args) for a filler name x.
 */
#define FILLER_PROTOTYPE_FN(x) int f_##x(struct event_filler_arguments *args);
FILLER_LIST_MAPPER(FILLER_PROTOTYPE_FN)
#undef FILLER_PROTOTYPE_FN

仔细看可以发现此回调中获取并写入ringbuffer的数据全部是event_table.c中g_event_info里定义的特定系统调用事件结构中的参数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/*
 * Filler for the openat exit event.
 *
 * Pushes, in this order, the syscall return value, dirfd, name, flags, mode
 * and the value of get_fd_dev() for the returned fd to the ring via
 * val_to_ring(), then finishes with add_sentinel().
 *
 * Returns the first non-PPM_SUCCESS result of val_to_ring(), otherwise the
 * result of add_sentinel().
 */
int f_sys_openat_x(struct event_filler_arguments *args)
{
	unsigned long dirfd;
	unsigned long name;
	unsigned long flags;
	unsigned long modes;
	int64_t retval;
	int res;

	/* Return value of the syscall. */
	retval = (int64_t)syscall_get_return_value(current, args->regs);
	res = val_to_ring(args, retval, 0, false, 0);
	if (unlikely(res != PPM_SUCCESS))
		return res;

	/* dirfd (argument 0); AT_FDCWD is replaced by PPM_AT_FDCWD. */
	syscall_get_arguments_deprecated(current, args->regs, 0, 1, &dirfd);
	if ((int)dirfd == AT_FDCWD)
		dirfd = PPM_AT_FDCWD;

	res = val_to_ring(args, dirfd, 0, false, 0);
	if (unlikely(res != PPM_SUCCESS))
		return res;

	/* name (argument 1); passed with fromuser set to true. */
	syscall_get_arguments_deprecated(current, args->regs, 1, 1, &name);
	res = val_to_ring(args, name, 0, true, 0);
	if (unlikely(res != PPM_SUCCESS))
		return res;

	/* flags (argument 2), converted by open_flags_to_scap() before the push. */
	syscall_get_arguments_deprecated(current, args->regs, 2, 1, &flags);
	res = val_to_ring(args, open_flags_to_scap(flags), 0, false, 0);
	if (unlikely(res != PPM_SUCCESS))
		return res;

	/* mode (argument 3), converted by open_modes_to_scap(). */
	syscall_get_arguments_deprecated(current, args->regs, 3, 1, &modes);
	res = val_to_ring(args, open_modes_to_scap(flags, modes), 0, false, 0);
	if (unlikely(res != PPM_SUCCESS))
		return res;

	/* dev, obtained from the returned fd. */
	res = val_to_ring(args, get_fd_dev(retval), 0, false, 0);
	if (unlikely(res != PPM_SUCCESS))
		return res;

	return add_sentinel(args);
}

需要特别说明的是val_to_ring这个参数值入ringbuffer函数,它主要由对应事件的回调函数调用并将每一个参数解析后写入ringbuffer;与前面的代码对应起来,其实就是在ringbuffer中本次事件的evt_header后依次写入每一个参数对应的字节码,写入的长度都在各分支中指定;每次入参到ringbuffer前都会检查本次写事件中ringbuffer的剩余长度是否足够支撑本次参数的写入,处于临界状态时就会返回PPM_FAILURE_BUFFER_FULL来drop掉此次事件,防止在临界状态下覆写tail后的数据造成ringbuffer数据错乱:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
/*
 * val_to_ring - append one parameter of the current event to args->buffer.
 * @args: filler state. buffer, curarg, nargs, event_type, arg_data_offset and
 *        arg_data_size are read here; curarg, arg_data_offset and
 *        arg_data_size are advanced on success.
 * @val: the value itself for the integer types, or an address for the
 *       string / buffer types.
 * @val_len: number of bytes at @val; only read by the PT_BYTEBUF,
 *           PT_SOCKADDR, PT_SOCKTUPLE and PT_FDLIST cases.
 * @fromuser: when true, @val is copied with ppm_strncpy_from_user() /
 *            ppm_copy_from_user() instead of strlcpy() / memcpy().
 * @dyn_idx: for PT_DYN parameters, index into the parameter's info table;
 *           it is also stored as one byte in front of the value.
 *
 * The u16 slot for the current argument (at buffer + curarg * sizeof(u16))
 * receives the number of bytes written; the value is written at
 * buffer + arg_data_offset.
 *
 * Returns PPM_SUCCESS, PPM_FAILURE_BUFFER_FULL when the value does not fit,
 * PPM_FAILURE_INVALID_USER_MEMORY when a user copy fails, or
 * PPM_FAILURE_BUG on too many arguments / bad dyn_idx / unknown type.
 */
int val_to_ring(struct event_filler_arguments *args, uint64_t val, u32 val_len, bool fromuser, u8 dyn_idx)
{
const struct ppm_param_info *param_info;
int len = -1;
/* Length slot of the current argument in the u16 array at the buffer start. */
u16 *psize = (u16 *)(args->buffer + args->curarg * sizeof(u16));
u32 max_arg_size = args->arg_data_size;

/* The filler tried to push more arguments than the event declares. */
if (unlikely(args->curarg >= args->nargs)) {
#ifndef UDIG
pr_err("(%u)val_to_ring: too many arguments for event #%u, type=%u, curarg=%u, nargs=%u tid:%u\n",
smp_processor_id(),
args->nevents,
(u32)args->event_type,
args->curarg,
args->nargs,
current->pid);
memory_dump(args->buffer - sizeof(struct ppm_evt_hdr), 32);
#endif
ASSERT(0);
return PPM_FAILURE_BUG;
}

if (unlikely(args->arg_data_size == 0))
return PPM_FAILURE_BUFFER_FULL;

/* A single argument is capped at PPM_MAX_ARG_SIZE bytes. */
if (max_arg_size > PPM_MAX_ARG_SIZE)
max_arg_size = PPM_MAX_ARG_SIZE;

param_info = &(g_event_info[args->event_type].params[args->curarg]);
if (param_info->type == PT_DYN && param_info->info != NULL) {
/*
 * Dynamic parameter: the real type comes from the info table entry
 * selected by dyn_idx, and dyn_idx itself is stored as a leading byte.
 */
const struct ppm_param_info *dyn_params;

if (unlikely(dyn_idx >= param_info->ninfo)) {
ASSERT(0);
return PPM_FAILURE_BUG;
}

#ifdef UDIG
dyn_params = (const struct ppm_param_info *)patch_pointer((uint8_t*)param_info->info);
#else
dyn_params = (const struct ppm_param_info *)param_info->info;
#endif

param_info = &dyn_params[dyn_idx];
if (likely(max_arg_size >= sizeof(u8))) {
*(u8 *)(args->buffer + args->arg_data_offset) = dyn_idx;
len = sizeof(u8);
} else {
return PPM_FAILURE_BUFFER_FULL;
}
/* Account for the index byte; the value length is added to *psize below. */
args->arg_data_offset += len;
args->arg_data_size -= len;
max_arg_size -= len;
*psize = (u16)len;
} else {
*psize = 0;
}

switch (param_info->type) {
case PT_CHARBUF:
case PT_FSPATH:
if (likely(val != 0)) {
if (fromuser) {
len = ppm_strncpy_from_user(args->buffer + args->arg_data_offset,
(const char __user *)(unsigned long)val, max_arg_size);

if (unlikely(len < 0))
return PPM_FAILURE_INVALID_USER_MEMORY;
} else {
len = strlcpy(args->buffer + args->arg_data_offset,
(const char *)(unsigned long)val,
max_arg_size);

/* ++len: count one more byte than strlcpy() returned, then clamp. */
if (++len > max_arg_size)
len = max_arg_size;
}

/*
* Make sure the string is null-terminated
*/
/*
 * NOTE(review): this stores at index len, i.e. one byte past the len
 * bytes accounted for; when len == max_arg_size that is outside the
 * max_arg_size window. Confirm the buffer has slack for this byte.
 */
*(char *)(args->buffer + args->arg_data_offset + len) = 0;
} else {
/*
* Handle NULL pointers
*/
len = strlcpy(args->buffer + args->arg_data_offset,
"(NULL)",
max_arg_size);

if (++len > max_arg_size)
len = max_arg_size;
}

break;
case PT_BYTEBUF:
if (likely(val != 0)) {
if (fromuser) {
/*
* Copy the lookahead portion of the buffer that we will use DPI-based
* snaplen calculation
*/
u32 dpi_lookahead_size = DPI_LOOKAHEAD_SIZE;

if (dpi_lookahead_size > val_len)
dpi_lookahead_size = val_len;

if (unlikely(dpi_lookahead_size >= max_arg_size))
return PPM_FAILURE_BUFFER_FULL;

/* A non-zero result from ppm_copy_from_user() is treated as failure. */
len = (int)ppm_copy_from_user(args->buffer + args->arg_data_offset,
(const void __user *)(unsigned long)val,
dpi_lookahead_size);

if (unlikely(len != 0))
return PPM_FAILURE_INVALID_USER_MEMORY;

/*
* Check if there's more to copy
*/
if (likely((dpi_lookahead_size != val_len))) {
/*
* Calculate the snaplen
*/
if (likely(args->enforce_snaplen)) {
/* Without UDIG this initial value is replaced by compute_snaplen(). */
u32 sl = args->consumer->snaplen;

#ifndef UDIG
sl = compute_snaplen(args, args->buffer + args->arg_data_offset, dpi_lookahead_size);
#endif
if (val_len > sl)
val_len = sl;
}

/* Truncate (rather than fail) when the rest does not fit. */
if (unlikely((val_len) >= max_arg_size))
val_len = max_arg_size;

/* Copy the remainder that follows the lookahead bytes. */
if (val_len > dpi_lookahead_size) {
len = (int)ppm_copy_from_user(args->buffer + args->arg_data_offset + dpi_lookahead_size,
(const void __user *)(unsigned long)val + dpi_lookahead_size,
val_len - dpi_lookahead_size);

if (unlikely(len != 0))
return PPM_FAILURE_INVALID_USER_MEMORY;
}
}

len = val_len;
} else {
if (likely(args->enforce_snaplen)) {
#ifdef UDIG
u32 sl = args->consumer->snaplen;
#else
u32 sl = compute_snaplen(args, (char *)(unsigned long)val, val_len);
#endif
if (val_len > sl)
val_len = sl;
}

/* Unlike the fromuser path, an oversized buffer fails here. */
if (unlikely(val_len >= max_arg_size))
return PPM_FAILURE_BUFFER_FULL;

memcpy(args->buffer + args->arg_data_offset,
(void *)(unsigned long)val, val_len);

len = val_len;
}
} else {
/*
* Handle NULL pointers
*/
len = 0;
}

break;
case PT_SOCKADDR:
case PT_SOCKTUPLE:
case PT_FDLIST:
/* Fixed-length blobs: copied whole or rejected, never truncated. */
if (likely(val != 0)) {
if (unlikely(val_len >= max_arg_size))
return PPM_FAILURE_BUFFER_FULL;

if (fromuser) {
len = (int)ppm_copy_from_user(args->buffer + args->arg_data_offset,
(const void __user *)(unsigned long)val,
val_len);

if (unlikely(len != 0))
return PPM_FAILURE_INVALID_USER_MEMORY;

len = val_len;
} else {
memcpy(args->buffer + args->arg_data_offset,
(void *)(unsigned long)val, val_len);

len = val_len;
}
} else {
/*
* Handle NULL pointers
*/
len = 0;
}

break;
/*
 * Integer cases below: @val is cast to the target width and stored
 * directly; each case fails with PPM_FAILURE_BUFFER_FULL if the width
 * does not fit in max_arg_size.
 */
case PT_FLAGS8:
case PT_UINT8:
case PT_SIGTYPE:
if (likely(max_arg_size >= sizeof(u8))) {
*(u8 *)(args->buffer + args->arg_data_offset) = (u8)val;
len = sizeof(u8);
} else {
return PPM_FAILURE_BUFFER_FULL;
}

break;
case PT_FLAGS16:
case PT_UINT16:
case PT_SYSCALLID:
if (likely(max_arg_size >= sizeof(u16))) {
*(u16 *)(args->buffer + args->arg_data_offset) = (u16)val;
len = sizeof(u16);
} else {
return PPM_FAILURE_BUFFER_FULL;
}

break;
case PT_FLAGS32:
case PT_UINT32:
case PT_MODE:
case PT_UID:
case PT_GID:
case PT_SIGSET:
if (likely(max_arg_size >= sizeof(u32))) {
*(u32 *)(args->buffer + args->arg_data_offset) = (u32)val;
len = sizeof(u32);
} else {
return PPM_FAILURE_BUFFER_FULL;
}

break;
case PT_RELTIME:
case PT_ABSTIME:
case PT_UINT64:
if (likely(max_arg_size >= sizeof(u64))) {
*(u64 *)(args->buffer + args->arg_data_offset) = (u64)val;
len = sizeof(u64);
} else {
return PPM_FAILURE_BUFFER_FULL;
}

break;
case PT_INT8:
if (likely(max_arg_size >= sizeof(s8))) {
*(s8 *)(args->buffer + args->arg_data_offset) = (s8)(long)val;
len = sizeof(s8);
} else {
return PPM_FAILURE_BUFFER_FULL;
}

break;
case PT_INT16:
if (likely(max_arg_size >= sizeof(s16))) {
*(s16 *)(args->buffer + args->arg_data_offset) = (s16)(long)val;
len = sizeof(s16);
} else {
return PPM_FAILURE_BUFFER_FULL;
}

break;
case PT_INT32:
if (likely(max_arg_size >= sizeof(s32))) {
*(s32 *)(args->buffer + args->arg_data_offset) = (s32)(long)val;
len = sizeof(s32);
} else {
return PPM_FAILURE_BUFFER_FULL;
}

break;
case PT_INT64:
case PT_ERRNO:
case PT_FD:
case PT_PID:
if (likely(max_arg_size >= sizeof(s64))) {
*(s64 *)(args->buffer + args->arg_data_offset) = (s64)(long)val;
len = sizeof(s64);
} else {
return PPM_FAILURE_BUFFER_FULL;
}

break;
default:
ASSERT(0);
#ifndef UDIG
pr_err("val_to_ring: invalid argument type %d. Event %u (%s) might have less parameters than what has been declared in nparams\n",
(int)g_event_info[args->event_type].params[args->curarg].type,
(u32)args->event_type,
g_event_info[args->event_type].name);
#endif
return PPM_FAILURE_BUG;
}

ASSERT(len <= PPM_MAX_ARG_SIZE);
ASSERT(len <= max_arg_size);

/* Commit: record the length and move on to the next argument slot. */
*psize += (u16)len;
args->curarg++;
args->arg_data_offset += len;
args->arg_data_size -= len;

return PPM_SUCCESS;
}

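回调结束回到main,计算事件总长度event_size = sizeof(struct ppm_evt_hdr) + args.arg_data_offset;此时arg_data_offset已经在val_to_ring中随每个参数的写入不断累加,等于参数长度数组(参数数量*2字节)加上所有参数数据的大小,所以event_size就是本次总共写入ringbuffer的数据大小,包括evt_header + evt_body;最后将其写入事件头的len字段(hdr当前仍指向ringbuffer中head所在的位置):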

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
	if (likely(cbres == PPM_SUCCESS)) {
/*
* Validate that the filler added the right number of parameters
*/
if (likely(args.curarg == args.nargs)) {
/*
* The event was successfully inserted in the buffer
*/
event_size = sizeof(struct ppm_evt_hdr) + args.arg_data_offset;
hdr->len = event_size;
drop = 0;
} else {
pr_err("corrupted filler for event type %d (added %u args, should have added %u)\n",
event_type,
args.curarg,
args.nargs);
ASSERT(0);
}
}
}

计算next = head + event_size,检查其是否达到或超过RING_BUF_SIZE;若超过,则将写到ringbuffer末尾缓冲区(cushion)中的超出部分拷贝到ringbuffer开头,并将next减去RING_BUF_SIZE实现回绕;经过smp_wmb()写屏障后,用next更新head指针,作为下一次写入ringbuffer的位置;ring中的nevents事件计数+1:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
if (likely(!drop)) {
res = 1;

next = head + event_size;

if (unlikely(next >= RING_BUF_SIZE)) {
/*
* If something has been written in the cushion space at the end of
* the buffer, copy it to the beginning and wrap the head around.
* Note, we don't check that the copy fits because we assume that
* filler_callback failed if the space was not enough.
*/
if (next > RING_BUF_SIZE) {
memcpy(ring->buffer,
ring->buffer + RING_BUF_SIZE,
next - RING_BUF_SIZE);
}

next -= RING_BUF_SIZE;
}

/*
* Make sure all the memory has been written in real memory before
* we update the head and the user space process (on another CPU)
* can access the buffer.
*/
smp_wmb();

ring_info->head = next;

++ring->nevents;
}

根据回调返回值的一系列异常处理(buffer满了、无效用户态内存等),对应状态计数器++:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
else {
if (cbres == PPM_SUCCESS) {
ASSERT(freespace < sizeof(struct ppm_evt_hdr) + args.arg_data_offset);
ring_info->n_drops_buffer++;
} else if (cbres == PPM_FAILURE_INVALID_USER_MEMORY) {
#ifdef _DEBUG
pr_err("Invalid read from user for event %d\n", event_type);
#endif
ring_info->n_drops_pf++;
} else if (cbres == PPM_FAILURE_BUFFER_FULL) {
ring_info->n_drops_buffer++;
} else {
ASSERT(false);
}
}

四、总结

有些东西不去看永远觉得难,认真钻研之后发现其实并没有非常难以理解;sysdig主要是事件相关的结构体太多,一步步去跟代码逻辑会觉得非常绕,沉下心来看就好了;另外在分析的过程中可以在内核模块代码中加一些调试语句,查看事件/结构体成员/etc.内容,能更快地去理解其含义;sysdig项目太大了,上面的分析过程也没有太细,基本上都是代码整体流程这一层的记录,主要是觉得每个点都记录得太细确实有点花费时间了,后续重温的话基本看整体执行流程就知道是怎么回事了。这篇只分析了driver,后面会抽空把scap和sinsp的补上;另外还打算出一篇讲ringbuffer的,以及最近用到的内核模块自动化编译工具driverkit……学海无涯,人生苦短。

五、参考链接

https://blog.csdn.net/zqixiao_09/article/details/50839042