3、系统调用之SYSCALL_DEFINE分析

  • Post author:
  • Post category:其他




前言

为什么要将系统调用定义成宏SYSCALL_DEFINEx?原因在于漏洞CVE-2009-0029、CVE-2010-3301的存在: 在Linux 2.6.28及以前版本的内核中,用户态将32位参数放入64位寄存器传入系统调用时,内核无法保证该参数已被正确地符号扩展,可能导致系统崩溃或提权漏洞。

内核开发者通过将系统调用的所有输入参数都先转化成long类型(64位),再强制转化到相应的类型来规避这个漏洞。



SYSCALL_DEFINEx的宏定义

// 定义位置:\include\linux\syscalls.h
#define SYSCALL_DEFINE0(sname)					\
	SYSCALL_METADATA(_##sname, 0);				\
	asmlinkage long sys_##sname(void)

#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

#define SYSCALL_DEFINEx(x, sname, ...)				\
	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)  
#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
		__attribute__((alias(__stringify(SyS##name))));		\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
	{								\
		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
		__MAP(x,__SC_TEST,__VA_ARGS__);				\
		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
		return ret;						\
	}								\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))

“##”是连接符,__VA_ARGS__代表前面“…”里面的可变参数

SYSCALL_DEFINEx里面的x代表的是系统调用的参数个数。



sys_socket系统调用的宏扩展

socket是我们常用的函数,其对应的系统调用在net/socket.c中定义,函数部分如下:

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	int retval;
	struct socket *sock;
	int flags;

	/* Check the SOCK_* constants for consistency.  */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	...
}
  • 1 SYSCALL_DEFINE3经预处理后,首先扩展为SYSCALL_DEFINEx(3, _socket, ...),其中与函数定义相关的部分如下:
__SYSCALL_DEFINEx(3, _socket, int, family, int, type, int, protocol)

SYSCALL_DEFINEx 会同时展开出 SYSCALL_METADATA 和 __SYSCALL_DEFINEx 两部分。

由于 SYSCALL_METADATA 只用于跟踪系统调用, 因此这里只关注 __SYSCALL_DEFINEx。

  • 2 __SYSCALL_DEFINEx(3, …)的宏再次扩展如下:
	asmlinkage long sys_socket(__MAP(x,__SC_DECL,int, family, int, type, int, protocol))	\
		__attribute__((alias(__stringify(SyS_socket))));		\
	static inline long SYSC_socket(__MAP(x,__SC_DECL,int, family, int, type, int, protocol));	\
	asmlinkage long SyS_socket(__MAP(x,__SC_LONG,int, family, int, type, int, protocol));	\
	asmlinkage long SyS_socket(__MAP(x,__SC_LONG,int, family, int, type, int, protocol))	\
	{								\
		long ret = SYSC_socket(__MAP(x,__SC_CAST,int, family, int, type, int, protocol));	\
		__MAP(x,__SC_TEST,int, family, int, type, int, protocol);				\
		__PROTECT(x, ret,__MAP(x,__SC_ARGS,int, family, int, type, int, protocol));	\
		return ret;						\
	}								\
	static inline long SYSC_socket(__MAP(x,__SC_DECL,int, family, int, type, int, protocol))

第一行的sys_socket只是函数声明;最后一行的SYSC_socket才是函数定义,因为它末尾没有分号,后面紧跟着SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)之后的函数体,所以它才是真正实现socket系统调用的函数定义。

SyS_socket里面又调用到了SYSC_socket了,为何要使用宏定义这样绕来绕去的?

关键在于__MAP、__SC_DECL、__SC_LONG、__SC_CAST、__SC_TEST、__SC_ARGS这几个宏。



一些奇怪的宏的扩展

// 定义位置:\include\linux\syscalls.h
#define __MAP0(m,...)
#define __MAP1(m,t,a) m(t,a)
#define __MAP2(m,t,a,...) m(t,a), __MAP1(m,__VA_ARGS__)
#define __MAP3(m,t,a,...) m(t,a), __MAP2(m,__VA_ARGS__)
#define __MAP4(m,t,a,...) m(t,a), __MAP3(m,__VA_ARGS__)
#define __MAP5(m,t,a,...) m(t,a), __MAP4(m,__VA_ARGS__)
#define __MAP6(m,t,a,...) m(t,a), __MAP5(m,__VA_ARGS__)
#define __MAP(n,...) __MAP##n(__VA_ARGS__)

#define __SC_DECL(t, a)	t a
#define __TYPE_IS_L(t)	(__same_type((t)0, 0L))
#define __TYPE_IS_UL(t)	(__same_type((t)0, 0UL))
// t为long long或unsigned long long 返回真,t为int或long则返回假
#define __TYPE_IS_LL(t) (__same_type((t)0, 0LL) || __same_type((t)0, 0ULL))
#define __SC_LONG(t, a) __typeof(__builtin_choose_expr(__TYPE_IS_LL(t), 0LL, 0L)) a
#define __SC_CAST(t, a)	(t) a
#define __SC_ARGS(t, a)	a
#define __SC_TEST(t, a) (void)BUILD_BUG_ON_ZERO(!__TYPE_IS_LL(t) && sizeof(t) > sizeof(long))
// 定义位置:\include\linux\bug.h
/* 当e为非0时,经过两次逻辑非操作得到的结果是1,加上负号就是-1,而struct的位域宽度不允许为负,因此编译时会报错;当e为0时,经过两次逻辑非操作仍然是0,-0的结果还是0,可以正常编译。这样就能在编译期发现错误 */
#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) 


__MAP(x,__SC_DECL,int, family, int, type, int, protocol)宏展开后: int family, int type,int protocol

__MAP3(__SC_DECL,int, family, int, type, int, protocol) ==> __SC_DECL(int, family),__MAP2(__SC_DECL,int, type, int, protocol) ==>__SC_DECL(int, family),__SC_DECL(int, type),__SC_DECL(int, protocol) ==> int family,int type,int protocol


__MAP(x,__SC_LONG,int, family, int, type, int, protocol)宏展开后: long family, long type,long protocol

__MAP3(__SC_LONG,int, family, int, type, int, protocol) ==> __SC_LONG(int, family),__SC_LONG(int, type),__SC_LONG(int, protocol) ==> long family, long type,long protocol

其中__typeof(__builtin_choose_expr(__TYPE_IS_LL(t), 0LL, 0L))的意思是:如果t是long long类型就得到long long类型,否则得到long类型(64位系统上为64bit)


__MAP(x,__SC_CAST,int, family, int, type, int, protocol)宏展开后: (int) family, (int) type,(int) protocol

__MAP(x,__SC_CAST,int, family, int, type, int, protocol) ==> __SC_CAST(int, family),__SC_CAST(int, type),__SC_CAST(int, protocol) ==> (int) family, (int) type,(int) protocol


__MAP(x,__SC_TEST,int, family, int, type, int, protocol)宏展开后: (void)BUILD_BUG_ON_ZERO(0), (void)BUILD_BUG_ON_ZERO(0), (void)BUILD_BUG_ON_ZERO(0)

__MAP(x,__SC_TEST,int, family, int, type, int, protocol) ==> __SC_TEST(int, family),__SC_TEST(int, type),__SC_TEST(int, protocol) ==> (void)BUILD_BUG_ON_ZERO(0), (void)BUILD_BUG_ON_ZERO(0), (void)BUILD_BUG_ON_ZERO(0)


__MAP(x,__SC_ARGS,int, family, int, type, int, protocol)宏展开后: family, type, protocol

==> __SC_ARGS(int, family),__SC_ARGS(int, type),__SC_ARGS(int, protocol) ==> family, type, protocol


__PROTECT(x, ret,__MAP(x,__SC_ARGS,int, family, int, type, int, protocol))宏展开后: asmlinkage_protect(x, ret, family, type, protocol)

asmlinkage_protect(x, ret, family, type, protocol) ==> __asm__ __volatile__ ("" : "=r" (ret) : "0" (ret), "m" (family), "m" (type), "m" (protocol));


这条汇编语句的主要作用是防止编译器破坏位于调用者栈上的系统调用参数(例如做尾调用优化,或把这些栈槽当作临时空间复用),同时避免系统调用的返回值ret(eax)被编译器优化掉,详情如下:

  • “”:Assembler Template(汇编模板)为空,即什么都不做,当然也就不会有任何优化操作了
  • “=r” (ret): OutputOperands(输出操作),从寄存器中读取系统调用返回值,并赋给ret变量
  • “0” (ret), “m” (family), “m” (type), “m” (protocol):InputOperands(输入操作)

    “0” (ret):从C变量ret中读取值,并存储到和输出操作同位置的寄存器中

    “m” (family), “m” (type), “m” (protocol):从内存中读取各变量值



扩展最初的宏

asmlinkage long sys_socket(__MAP(x,__SC_DECL,int, family, int, type, int, protocol))	\
		__attribute__((alias(__stringify(SyS_socket))));		\

扩展为:

asmlinkage long sys_socket(int family,int type,int protocol) __attribute__((alias("SyS_socket"))); // 将sys_socket设置为SyS_socket函数的别名,调用sys_socket就是调用SyS_socket

一些说明:

// 定义位置:\include\linux\stringify.h
#define __stringify_1(x...)	#x  // #的作用是将宏参数字符串化,如:#do ==> "do"
#define __stringify(x...)	__stringify_1(x)

因此:__stringify(SyS_socket) ==> “SyS_socket”

最后,完整的扩展为:

#define __SYSCALL_DEFINEx(3, _socket, int, family, int, type, int, protocol)  \
asmlinkage long sys_socket(int family,int type,int protocol) __attribute__((alias("SyS_socket")));\
static inline long SYSC_socket(int family,int type,int protocol);	\
asmlinkage long SyS_socket(long family, long type,long protocol);	\
	asmlinkage long SyS_socket(long family, long type,long protocol)	\
	{								                                        \
		long ret = SYSC_socket((int) family, (int) type,(int) protocol);	\
		(void)BUILD_BUG_ON_ZERO(0), (void)BUILD_BUG_ON_ZERO(0), (void)BUILD_BUG_ON_ZERO(0);				\
		__asm__ __volatile__ ("" : "=r" (ret) : "0" (ret), "m" (family), "m" (type), "m" (protocol));	\
		return ret;						\
	}								\
	static inline long SYSC_socket(int family,int type,int protocol)
	{
	int retval;
	struct socket *sock;
	int flags;

	/* Check the SOCK_* constants for consistency.  */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	...
}

调用sys_socket系统调用时,实际调用的是SyS_socket函数;SyS_socket函数内部又调用SYSC_socket函数(真正执行socket系统调用的代码),最后将未被编译器优化掉的返回值ret返回。



参考


Anatomy of a system call, additional content



版权声明:本文为cswhl原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。