usleep的–系统调用流程–及不准确的问题 – Android4.0.1

  • Post author:
  • Post category:其他



1.由于在不同的硬件平台上经常遇到usleep不准确的问题,比如usleep(2*1000),结果sleep了10ms,是不是有点过分,测试代码如下:

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>

/*
 * Measure how long usleep() actually sleeps.
 *
 * The requested delay starts at 5 ms and grows by 1 ms for each of the 60
 * outer passes; every delay is sampled 10 times.  Each sample brackets the
 * usleep() call with gettimeofday() and prints the requested delay next to
 * the measured one, both in milliseconds.
 *
 * gettimeofday() reads the wall clock, so the system time must not be
 * changed while the test is running.
 *
 * Returns 0 on completion.
 */
int main(int argc,char **argv)
{
    struct timeval oldTime, newTime;
    long long startMs, endMs;
    int iStime,i,j;

    iStime=5;
    for(i=0;i<60;i++)
    {
        for(j=0;j<10;j++)
        {
            gettimeofday( &oldTime, NULL );
            usleep( iStime * 1000 );
            gettimeofday( &newTime, NULL );

            /*
             * Widen tv_sec BEFORE multiplying: where long is 32 bits wide,
             * tv_sec*1000 overflows when it is evaluated in long and only
             * the result is cast to long long.
             */
            startMs = (long long)oldTime.tv_sec*1000 + oldTime.tv_usec/1000;
            endMs   = (long long)newTime.tv_sec*1000 + newTime.tv_usec/1000;

            printf("iStime:%d,actual time:%lld\n",iStime,endMs - startMs);
        }
        iStime++;
    }

    return 0;
}


当然为防止出现意外,禁止测试期间设置系统时间。



2. 根据以前的经验,此usleep不准主要是由于Kernel中系统timer的rating值过高引起的。



3. 下面从源码的角度分析一下usleep的实现细节,并进一步分析其原因。以下以Android4.0.1为例进行分析。注此问题主要与Kernel有关,与glibc或bionic无关,因为小弟最近搞Android,所以就以Android为例进行研究。



4. 首先找到usleep的源码:

//位于/bionic/libc/unistd/usleep.c
#include <time.h>
#include <errno.h>

/*
 * Sleep for `usec` microseconds by delegating to nanosleep().
 *
 * The interval is split into whole seconds plus a nanosecond remainder.
 * The same timespec is passed as both the request and the "remaining"
 * argument, so a retry after EINTR reuses whatever nanosleep() stored there.
 *
 * Returns 0 once nanosleep() succeeds, or -1 (errno left as set by
 * nanosleep()) for any failure other than EINTR.
 */
int usleep(unsigned long usec)
{
  struct timespec remaining;

  remaining.tv_sec = usec/1000000UL;

#ifdef __arm__
  /* ARM build: get the sub-second part by multiply-and-subtract, not modulo */
  remaining.tv_nsec = (usec - remaining.tv_sec*1000000UL)*1000;
#else
  remaining.tv_nsec = (usec % 1000000UL) * 1000UL;
#endif

  while ( nanosleep( &remaining, &remaining ) != 0 )
  {
    // Only an interrupted sleep (EINTR) is retried; anything else,
    // e.g. EINVAL, is reported to the caller.
    if ( errno != EINTR )
        return -1;
  }

  return 0;
}



它也很懒,只是调用了nanosleep,那就看看nanosleep的源码吧!不幸的是,只找到一个声明:extern int nanosleep(const struct timespec *, struct timespec *); 它位于/bionic/libc/include/sys/linux-unistd.h,并没有找到它的实现。其实看看Linux的系统调用表,就知道它是一个系统调用,那就分析一下它是如何进行系统调用的。以前只讲过原理,并没有实例,在此把它补上。



5. 寻找系统调用函数


如果这个函数没有实现,那肯定是不能调用的。就像MIT教授在公开课上所讲的,搞计算机的不像搞别的,做不了假,别人不管你怎么设计的,只看你实现的结果,很有道理。这也证明了搞if else的人不能作弊。那就从它的Android.mk入手吧,看看还Link了什么东东。打开libc的Android.mk发现,其中有一行


include $(LOCAL_PATH)/arch-$(TARGET_ARCH)/syscalls.mk


这就是关键所在,syscalls系统调用,不正是我们要找的吗?进入arch-arm/syscalls.mk一看,其中一大片.s,Search一下,看有没有nanosleep.s,还真有这么一行,真是大快人心:syscall_src += arch-arm/syscalls/nanosleep.S


赶紧去瞧瞧,ARM汇编水平不高,能看懂吗?先把代码贴上再说,不懂就问google.

/* autogenerated by gensyscalls.py */
#include <sys/linux-syscalls.h>

    /*
     * User-space stub for the nanosleep system call.
     *
     * Puts the system call number in r7, traps into the kernel with swi,
     * and then either returns the (non-negative) result in r0 or branches
     * to __set_syscall_errno when r0 came back negative.
     */
    .text
    .type nanosleep, #function
    .globl nanosleep
    .align 4
    .fnstart

nanosleep:
    .save   {r4, r7}                /* unwind annotation for the push below */
    stmfd   sp!, {r4, r7}           /* push r4 and r7; r7 is overwritten next */
    ldr     r7, =__NR_nanosleep     /* r7 = system call number */
    swi     #0                      /* software interrupt: enter the kernel */
    ldmfd   sp!, {r4, r7}           /* pop the saved r4 and r7 */
    movs    r0, r0                  /* set the condition flags from r0 */
    bxpl    lr                      /* r0 >= 0: return to the caller */
    b       __set_syscall_errno     /* r0 < 0: tail-branch to the errno helper (defined elsewhere) */
    .fnend


__NR_nanosleep是个什么东东,凭直觉,肯定在sys/linux-syscalls.h中有定义。打开/libc/include/sys/linux-syscalls.h并search __NR_nanosleep, 明白了,它定义了__NR_nanosleep的值为(__NR_SYSCALL_BASE + 162),其实就是定义了其系统调用号。这就与前一文swi连接起来了。上面的代码把系统调用号传递给r7,然后触发了一个软中断,从而进入内核态执行。



6. 软中断处理流程


根据常识,既然是软中断,就一定有一个对应的ISR,打开/kernel/arch/arm/kernel/entry-common.S,发现其中有一个ENTRY(vector_swi),这就是我们要找的ISR,其详细代码如下:

	/*
	 * SWI (system call) exception entry.
	 *
	 * Builds a register frame of S_FRAME_SIZE bytes on the stack (r0-r12,
	 * the calling sp/lr, the calling PC, the saved PSR and the original
	 * r0), works out the system call number according to the configured
	 * ABI, enables IRQs, and then dispatches through the system call table.
	 * Traced tasks are diverted to __sys_trace; numbers outside the table
	 * end up in arm_syscall or sys_ni_syscall.
	 */
	.align	5
ENTRY(vector_swi)
	sub	sp, sp, #S_FRAME_SIZE
	stmia	sp, {r0 - r12}			@ Calling r0 - r12
 ARM(	add	r8, sp, #S_PC		)
 ARM(	stmdb	r8, {sp, lr}^		)	@ Calling sp, lr
 THUMB(	mov	r8, sp			)
 THUMB(	store_user_sp_lr r8, r10, S_SP	)	@ calling sp, lr
	mrs	r8, spsr			@ called from non-FIQ mode, so ok.
	str	lr, [sp, #S_PC]			@ Save calling PC
	str	r8, [sp, #S_PSR]		@ Save CPSR
	str	r0, [sp, #S_OLD_R0]		@ Save OLD_R0
	zero_fp

	/*
	 * Get the system call number.
	 */

#if defined(CONFIG_OABI_COMPAT)

	/*
	 * If we have CONFIG_OABI_COMPAT then we need to look at the swi
	 * value to determine if it is an EABI or an old ABI call.
	 */
#ifdef CONFIG_ARM_THUMB
	tst	r8, #PSR_T_BIT
	movne	r10, #0				@ no thumb OABI emulation
	ldreq	r10, [lr, #-4]			@ get SWI instruction
#else
	ldr	r10, [lr, #-4]			@ get SWI instruction
  A710(	and	ip, r10, #0x0f000000		@ check for SWI		)
  A710(	teq	ip, #0x0f000000						)
  A710(	bne	.Larm710bug						)
#endif
#ifdef CONFIG_CPU_ENDIAN_BE8
	rev	r10, r10			@ little endian instruction
#endif

#elif defined(CONFIG_AEABI)

	/*
	 * Pure EABI user space always put syscall number into scno (r7).
	 */
  A710(	ldr	ip, [lr, #-4]			@ get SWI instruction	)
  A710(	and	ip, ip, #0x0f000000		@ check for SWI		)
  A710(	teq	ip, #0x0f000000						)
  A710(	bne	.Larm710bug						)

#elif defined(CONFIG_ARM_THUMB)

	/* Legacy ABI only, possibly thumb mode. */
	tst	r8, #PSR_T_BIT			@ this is SPSR from save_user_regs
	addne	scno, r7, #__NR_SYSCALL_BASE	@ put OS number in
	ldreq	scno, [lr, #-4]

#else

	/* Legacy ABI only. */
	ldr	scno, [lr, #-4]			@ get SWI instruction
  A710(	and	ip, scno, #0x0f000000		@ check for SWI		)
  A710(	teq	ip, #0x0f000000						)
  A710(	bne	.Larm710bug						)

#endif

#ifdef CONFIG_ALIGNMENT_TRAP
	ldr	ip, __cr_alignment
	ldr	ip, [ip]
	mcr	p15, 0, ip, c1, c0		@ update control register
#endif
	enable_irq

	get_thread_info tsk
	adr	tbl, sys_call_table		@ load syscall table pointer
	ldr	ip, [tsk, #TI_FLAGS]		@ check for syscall tracing

#if defined(CONFIG_OABI_COMPAT)
	/*
	 * If the swi argument is zero, this is an EABI call and we do nothing.
	 *
	 * If this is an old ABI call, get the syscall number into scno and
	 * get the old ABI syscall table address.
	 */
	bics	r10, r10, #0xff000000
	eorne	scno, r10, #__NR_OABI_SYSCALL_BASE
	ldrne	tbl, =sys_oabi_call_table
#elif !defined(CONFIG_AEABI)
	bic	scno, scno, #0xff000000		@ mask off SWI op-code
	eor	scno, scno, #__NR_SYSCALL_BASE	@ check OS number
#endif

	stmdb	sp!, {r4, r5}			@ push fifth and sixth args
	tst	ip, #_TIF_SYSCALL_TRACE		@ are we tracing syscalls?
	bne	__sys_trace

	/*
	 * In-range numbers (carry clear after the cmp) load pc straight from
	 * the table entry at tbl + scno * 4; lr is preset so that the called
	 * routine returns to ret_fast_syscall.
	 */
	cmp	scno, #NR_syscalls		@ check upper syscall limit
	adr	lr, BSYM(ret_fast_syscall)	@ return address
	ldrcc	pc, [tbl, scno, lsl #2]		@ call sys_* routine

	/*
	 * Out-of-range numbers fall through to here: numbers at or above
	 * (__ARM_NR_BASE - __NR_SYSCALL_BASE) branch to arm_syscall,
	 * everything else goes to sys_ni_syscall.
	 */
	add	r1, sp, #S_OFF
2:	mov	why, #0				@ no longer a real syscall
	cmp	scno, #(__ARM_NR_BASE - __NR_SYSCALL_BASE)
	eor	r0, scno, #__NR_SYSCALL_BASE	@ put OS number back
	bcs	arm_syscall	
	b	sys_ni_syscall			@ not private func
ENDPROC(vector_swi)



7. 找与nanosleep对应的处理函数


从上面的代码中可以看出,它将调用

sys_call_table

中的某个函数。在同一个文件中寻找sys_call_table,其代码如下:

	/*
	 * System call table: the symbol is typed as a data object and its
	 * entries are pulled in from calls.S.  vector_swi indexes this table
	 * with the system call number to find the routine to run.
	 */
	.type	sys_call_table, #object
ENTRY(sys_call_table)
#include "calls.S"



看看linux/arch/arm/kernel/calls.S中的内容:

/* 0 */		CALL(sys_restart_syscall)
		CALL(sys_exit)
		CALL(sys_fork_wrapper)
		CALL(sys_read)
		CALL(sys_write)
                ...
/* 160 */	CALL(sys_sched_get_priority_min)
		CALL(sys_sched_rr_get_interval)
		CALL(sys_nanosleep)
		CALL(sys_mremap)
		CALL(sys_setresuid16)



原来nanosleep系统调用在Kernel中的函数为sys_nanosleep,现在去分析一下它是如何实现高精度sleep的:是忙等(执行nop指令),还是闲等(让出CPU使用权)呢?马上就会有答案了。不过由于小弟知识有限,事情没那么简单,我找了2个小时也没有找到答案,惭愧啊!



8. 先看看熟悉的系统调用open吧!


也不幸运,没有sys_open这样的函数。反正知道这个东东在fs/open.c中,基本原理应该是一样的。在此文件中找到了下面这个函数:


SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)



linux/syscalls.h定义如下:


asmlinkage long sys_open(const char __user *filename,int flags, int mode);

(asmlinkage就是一个extern "C")



这兄弟俩长得太像了,再看看SYSCALL_DEFINE3的定义,看看能不能找到二者的关系。


哈哈哈哈哈哈…..,终于在linux/syscalls.h中找到答案了,SYSCALL_DEFINE3的定义如下:

/*
 * Innermost step: emits the function header
 *     asmlinkage long sys<name>(<parameter list>)
 * where the parameter list is produced by __SC_DECL<x> from the remaining
 * arguments (__SC_DECL<x> is defined elsewhere).
 */
#define __SYSCALL_DEFINEx(x, name, ...)	asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
/* Forwards the argument count, the name and the argument list unchanged. */
#define SYSCALL_DEFINEx(x, sname, ...)	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
/*
 * Three-argument entry point: prefixes the name with '_' so that the final
 * symbol becomes sys_<name>, e.g. SYSCALL_DEFINE3(open, ...) -> sys_open.
 */
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) 



把SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)还原就变成了:



asmlinkage long sys_open(const char __user *filename,int flags, int mode);是不是与要找的函数一模一样呢?终于找到如何看这个代码的方法了!



9. 继续找sys_nanosleep的实现代码


先看看linux/kernel/hrtimer.c中的comments:


 *  High-resolution kernel timers
 *
 *  In contrast to the low-resolution timeout API implemented in
 *  kernel/timer.c, hrtimers provide finer resolution and accuracy
 *  depending on system configuration and capabilities.
 *
 *  These timers are currently used for:
 *   - itimers
 *   - POSIX timers
 *   - nanosleep
 *   - precise in-kernel timing


看到上面的nanosleep了吗?说明有机会找到了。



SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,  struct timespec __user *, rmtp)

这不就是我要找的吗? 由于这是一个宏,在SourceInsight中查找函数nanosleep是找不到的,search字符串nanosleep是可行的。其代码如下:

/*
 * nanosleep system call entry.
 *
 * Copies the requested interval from user space, validates it, and hands it
 * to hrtimer_nanosleep() as a relative sleep on CLOCK_MONOTONIC.
 *
 * Returns -EFAULT when the request cannot be copied in, -EINVAL when
 * timespec_valid() rejects it, otherwise the result of hrtimer_nanosleep().
 */
SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
		struct timespec __user *, rmtp)
{
	struct timespec req;

	/* Fetch the caller's request; a non-zero result means the copy failed. */
	if (copy_from_user(&req, rqtp, sizeof(req)) != 0)
		return -EFAULT;

	if (timespec_valid(&req))
		return hrtimer_nanosleep(&req, rmtp, HRTIMER_MODE_REL,
					 CLOCK_MONOTONIC);

	return -EINVAL;
}



hrtimer_nanosleep实现如下:

/*
 * Sleep on a high-resolution timer.
 *
 * @rqtp:    requested expiry time (kernel copy)
 * @rmtp:    user-space buffer passed to update_rmtp() when the sleep is cut
 *           short; may be NULL
 * @mode:    HRTIMER_MODE_REL or HRTIMER_MODE_ABS
 * @clockid: clock the timer is initialised on
 *
 * An on-stack hrtimer is armed with the current task's timer slack (no slack
 * for real-time tasks) and do_nanosleep() is called to wait on it.
 *
 * Returns 0 when do_nanosleep() reports success.  Otherwise:
 *   - absolute sleeps return -ERESTARTNOHAND without touching @rmtp;
 *   - if @rmtp is set and update_rmtp() returns a value <= 0, that value is
 *     returned;
 *   - in all remaining cases the restart block of the current thread is
 *     filled in and -ERESTART_RESTARTBLOCK is returned.
 */
long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
		       const enum hrtimer_mode mode, const clockid_t clockid)
{
	struct restart_block *restart;
	struct hrtimer_sleeper t;
	int ret = 0;
	unsigned long slack;

	/* Real-time tasks get no slack on their expiry time. */
	slack = current->timer_slack_ns;
	if (rt_task(current))
		slack = 0;

	hrtimer_init_on_stack(&t.timer, clockid, mode);
	hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
	if (do_nanosleep(&t, mode))
		goto out;

	/* Absolute timers do not update the rmtp value and restart: */
	if (mode == HRTIMER_MODE_ABS) {
		ret = -ERESTARTNOHAND;
		goto out;
	}

	if (rmtp) {
		ret = update_rmtp(&t.timer, rmtp);
		if (ret <= 0)
			goto out;
	}

	/* Record what hrtimer_nanosleep_restart needs to resume this sleep. */
	restart = &current_thread_info()->restart_block;
	restart->fn = hrtimer_nanosleep_restart;
	restart->nanosleep.index = t.timer.base->index;
	restart->nanosleep.rmtp = rmtp;
	restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);

	ret = -ERESTART_RESTARTBLOCK;
out:
	destroy_hrtimer_on_stack(&t.timer);
	return ret;
}
/*
 * Block the current task on the hrtimer embedded in @t.
 *
 * Each pass marks the task TASK_INTERRUPTIBLE, starts the timer and calls
 * schedule() unless t->task has already been cleared.  The loop repeats
 * while t->task is still set and no signal is pending for the task.
 *
 * Returns non-zero when t->task is NULL on exit, zero otherwise.
 *
 * NOTE(review): apart from the hrtimer_active() check below, t->task is
 * presumably cleared by the wake-up callback installed by
 * hrtimer_init_sleeper(); that code is not visible here - confirm.
 */
static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
{
	hrtimer_init_sleeper(t, current);

	do {
		/* Change state before arming the timer, then decide whether to sleep. */
		set_current_state(TASK_INTERRUPTIBLE);
		hrtimer_start_expires(&t->timer, mode);
		/* Timer is not active after starting it: skip schedule() below. */
		if (!hrtimer_active(&t->timer))
			t->task = NULL;

		if (likely(t->task))
			schedule();

		hrtimer_cancel(&t->timer);
		/* Any further pass re-arms the timer in absolute mode. */
		mode = HRTIMER_MODE_ABS;

	} while (t->task && !signal_pending(current));

	__set_current_state(TASK_RUNNING);

	return t->task == NULL;
}


调用流程如下:




nanosleep()
  –> sys_nanosleep()
  –> hrtimer_nanosleep()
  –> do_nanosleep()
  –> hrtimer_start()
  –> enqueue_hrtimer()
  –> hrtimer_enqueue_reprogram()
  –> hrtimer_reprogram()
  –> int tick_program_event(ktime_t expires, int force)
       (struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 获得clock_event_device)
  –> int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, int force)
  –> int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,ktime_t now)
  –> dev->set_next_event((unsigned long) clc, dev)
       <在注册的clock_event_device中提供此函数,其主要功能是设置相关寄存器,以设置此超时事件>






版权声明:本文为MyArrow原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。