J.A. Magallon wrote:
>On 2002.04.30 Wanghong Yuan wrote:
>
>>Is there any package or program, which can be used to accurately measure the
>>CPU cycles used by a program? I think the following code can only provide an
>>inaccurate one, beause it may count on the waiting time of the program.
>>
>>gettimeofday(t1)
>>runprogam
>>gettimeofday(t2)
>>t2-t1
>>
>>If I need to instrument the kernel, which part I should investigate? Thanks
>>a lot in advance
>>
>
>man getrusage
>
getrusage is grossly inaccurate.  If a process is running when the timer 
interrupt hits, it's assigned an entire timer interrupt's worth of CPU, 
even if it's only run one instruction.  I've attached an old patch that 
makes getrusage farily accurate.  It's not going to be accurate to the 
CPU cycle, but it's not too bad.  It does increase the overhead for 
system calls and interrupts.  It works on newer x86 and PowerPC.
-Corey
--------------070202060507000102030408
Content-Type: text/plain;
 name="highres.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="highres.patch"
--- ./fs/proc/proc_misc.c.highres	Tue Nov 20 23:29:09 2001
+++ ./fs/proc/proc_misc.c	Sat Feb  9 17:30:04 2002
@@ -101,7 +101,8 @@
 	int len;
 
 	uptime = jiffies;
-	idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
+	idle = (tms_time_to_ticks(init_tasks[0]->times.tms_utime)
+		+ tms_time_to_ticks(init_tasks[0]->times.tms_stime));
 
 	/* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
 	   that would overflow about every five days at HZ == 100.
--- ./fs/proc/array.c.highres	Thu Oct 11 11:00:01 2001
+++ ./fs/proc/array.c	Sat Feb  9 17:30:04 2002
@@ -358,10 +358,10 @@
 		task->cmin_flt,
 		task->maj_flt,
 		task->cmaj_flt,
-		task->times.tms_utime,
-		task->times.tms_stime,
-		task->times.tms_cutime,
-		task->times.tms_cstime,
+		tms_time_to_ticks(task->times.tms_utime),
+		tms_time_to_ticks(task->times.tms_stime),
+		tms_time_to_ticks(task->times.tms_cutime),
+		tms_time_to_ticks(task->times.tms_cstime),
 		priority,
 		nice,
 		0UL /* removed */,
@@ -682,8 +682,8 @@
 
 	len = sprintf(buffer,
 		"cpu  %lu %lu\n",
-		task->times.tms_utime,
-		task->times.tms_stime);
+		tms_time_to_ticks(task->times.tms_utime),
+		tms_time_to_ticks(task->times.tms_stime));
 		
 	for (i = 0 ; i < smp_num_cpus; i++)
 		len += sprintf(buffer + len, "cpu%d %lu %lu\n",
--- ./fs/binfmt_elf.c.highres	Thu Jan 10 15:52:13 2002
+++ ./fs/binfmt_elf.c	Sat Feb  9 17:30:04 2002
@@ -1105,14 +1105,10 @@
 	psinfo.pr_ppid = prstatus.pr_ppid = current->p_pptr->pid;
 	psinfo.pr_pgrp = prstatus.pr_pgrp = current->pgrp;
 	psinfo.pr_sid = prstatus.pr_sid = current->session;
-	prstatus.pr_utime.tv_sec = CT_TO_SECS(current->times.tms_utime);
-	prstatus.pr_utime.tv_usec = CT_TO_USECS(current->times.tms_utime);
-	prstatus.pr_stime.tv_sec = CT_TO_SECS(current->times.tms_stime);
-	prstatus.pr_stime.tv_usec = CT_TO_USECS(current->times.tms_stime);
-	prstatus.pr_cutime.tv_sec = CT_TO_SECS(current->times.tms_cutime);
-	prstatus.pr_cutime.tv_usec = CT_TO_USECS(current->times.tms_cutime);
-	prstatus.pr_cstime.tv_sec = CT_TO_SECS(current->times.tms_cstime);
-	prstatus.pr_cstime.tv_usec = CT_TO_USECS(current->times.tms_cstime);
+	prstatus.pr_utime = tms_time_to_timeval(current->times.tms_utime);
+	prstatus.pr_stime = tms_time_to_timeval(current->times.tms_stime);
+	prstatus.pr_cutime = tms_time_to_timeval(current->times.tms_cutime);
+	prstatus.pr_cstime = tms_time_to_timeval(current->times.tms_cstime);
 
 	/*
 	 * This transfers the registers from regs into the standard
--- ./mm/oom_kill.c.highres	Sat Nov  3 19:05:25 2001
+++ ./mm/oom_kill.c	Sat Feb  9 17:30:04 2002
@@ -72,7 +72,9 @@
 	 * very well in practice. This is not safe against jiffie wraps
 	 * but we don't care _that_ much...
 	 */
-	cpu_time = (p->times.tms_utime + p->times.tms_stime) >> (SHIFT_HZ + 3);
+	cpu_time = (tms_time_to_ticks(p->times.tms_utime)
+		    + tms_time_to_ticks(p->times.tms_stime))
+			>> (SHIFT_HZ + 3);
 	run_time = (jiffies - p->start_time) >> (SHIFT_HZ + 10);
 
 	points /= int_sqrt(cpu_time);
--- ./arch/ppc/config.in.highres	Fri Nov 16 12:10:08 2001
+++ ./arch/ppc/config.in	Sat Feb  9 17:30:04 2002
@@ -237,6 +237,11 @@
   source drivers/zorro/Config.in
 fi
 
+bool 'High resolution CPU usage and virtual itimers' CONFIG_HIGHRES_CPU_USAGE
+if [ "$CONFIG_HIGHRES_CPU_USAGE" = "y" ]; then
+  bool 'Does the virtual itimer count system CPU usage?' CONFIG_VIRT_ITIMER_COUNTS_SYSTEM
+fi
+
 endmenu
 source drivers/mtd/Config.in
 source drivers/pnp/Config.in
--- ./arch/ppc/kernel/entry.S.highres	Mon Nov 26 07:29:17 2001
+++ ./arch/ppc/kernel/entry.S	Sat Feb  9 17:30:04 2002
@@ -298,6 +298,10 @@
 ret_to_user_hook:
 	nop
 restore:
+#ifdef CONFIG_HIGHRES_CPU_USAGE
+	mr	r3,r2
+	bl	tms_leave_system
+#endif
 	lwz	r3,_XER(r1)
 	mtspr	XER,r3
 	REST_10GPRS(9,r1)
--- ./arch/ppc/kernel/time.c.highres	Mon Oct  8 13:43:01 2001
+++ ./arch/ppc/kernel/time.c	Sat Feb  9 17:30:04 2002
@@ -243,6 +243,35 @@
 	tv->tv_usec = usec;
 }
 
+#ifdef CONFIG_HIGHRES_CPU_USAGE
+unsigned long curr_timestamp(void)
+{
+        return get_native_tbl();
+}
+
+unsigned long timestamp_usec_diff(unsigned long ts1, unsigned long ts2)
+{
+        unsigned long val;
+	unsigned long usec;
+
+	if (__USE_RTC()) {
+	        int delta = ts1 - ts2;
+		val = delta<0 ? delta + 1000000000 : delta;
+	} else {
+        	val = ts1 - ts2;
+	}
+	usec = mulhwu(tb_to_us, val);
+	if (usec > 1000000) {
+		/* We had some wierd overflow due to the timebases being
+		   changed, just ignore it. */
+		printk("tb went backwards? ts1=%lu, ts2=%ld\n", ts1, ts2);
+		usec = 0;
+	}
+
+	return usec;
+}
+#endif
+
 void do_settimeofday(struct timeval *tv)
 {
 	unsigned long flags;
--- ./arch/ppc/kernel/head.S.highres	Fri Nov  2 19:43:54 2001
+++ ./arch/ppc/kernel/head.S	Sat Feb  9 17:30:04 2002
@@ -771,6 +771,12 @@
 2:	addi	r2,r23,-THREAD		/* set r2 to current */
 	tovirt(r2,r2)
 	mflr	r23
+#ifdef CONFIG_HIGHRES_CPU_USAGE
+	bl	3f
+	.long	highres_start
+3:	mflr	r25
+	lwz	r25,0(r25)
+#endif
 	andi.	r24,r23,0x3f00		/* get vector offset */
 	stw	r24,TRAP(r21)
 	li	r22,0
@@ -784,11 +790,50 @@
 	lwz	r24,0(r23)		/* virtual address of handler */
 	lwz	r23,4(r23)		/* where to go when done */
 	FIX_SRR1(r20,r22)
+#ifdef CONFIG_HIGHRES_CPU_USAGE
+	mtspr	SRR0,r25
+#else
 	mtspr	SRR0,r24
+#endif
 	mtspr	SRR1,r20
+#ifndef CONFIG_HIGHRES_CPU_USAGE
 	mtlr	r23
+#endif
 	SYNC
 	RFI				/* jump to handler, enable MMU */
+
+#ifdef CONFIG_HIGHRES_CPU_USAGE
+highres_start:
+	subi	r1,r1,52
+	stw	r12,44(r1)
+	stw	r11,40(r1)
+	stw	r10,36(r1)
+	stw	r9,32(r1)
+	stw	r8,28(r1)
+	stw	r7,24(r1)
+	stw	r6,20(r1)
+	stw	r5,16(r1)
+	stw	r4,12(r1)
+	stw	r3,8(r1)
+	stw	r0,48(r1)
+	mr	r3,r2
+	bl	tms_enter_system
+	mtlr	r23
+	mtctr	r24
+	lwz	r12,44(r1)
+	lwz	r11,40(r1)
+	lwz	r10,36(r1)
+	lwz	r9,32(r1)
+	lwz	r8,28(r1)
+	lwz	r7,24(r1)
+	lwz	r6,20(r1)
+	lwz	r5,16(r1)
+	lwz	r4,12(r1)
+	lwz	r3,8(r1)
+	lwz	r0,48(r1)
+	addi	r1,r1,52
+	bctr
+#endif
 
 /*
  * On kernel stack overflow, load up an initial stack pointer
--- ./arch/i386/config.in.highres	Sat Feb  9 20:48:32 2002
+++ ./arch/i386/config.in	Sat Feb  9 21:04:26 2002
@@ -279,6 +279,9 @@
    bool '    Use real mode APM BIOS call to power off' CONFIG_APM_REAL_MODE_POWER_OFF
 fi
 
+dep_bool 'High resolution CPU usage and virtual itimers' CONFIG_HIGHRES_CPU_USAGE $CONFIG_X86_TSC
+dep_bool 'Does the virtual itimer count system CPU usage?' CONFIG_VIRT_ITIMER_COUNTS_SYSTEM $CONFIG_HIGHRES_CPU_USAGE
+
 endmenu
 
 source drivers/mtd/Config.in
--- ./arch/i386/kernel/entry.S.highres	Sat Feb  9 20:48:02 2002
+++ ./arch/i386/kernel/entry.S	Mon Feb 11 14:55:52 2002
@@ -81,6 +81,27 @@
 
 ENOSYS = 38
 
+#define GET_CURRENT(reg) \
+	movl $-8192, reg; \
+	andl %esp, reg
+
+#ifdef CONFIG_HIGHRES_CPU_USAGE
+#define HIGHRES_START_CODE \
+	pushl %eax;        \
+	GET_CURRENT(%ebx); \
+	pushl %ebx;        \
+	call  SYMBOL_NAME(tms_enter_system); \
+	addl  $4,%esp;     \
+	popl %eax; 
+#define HIGHRES_END_CODE \
+	GET_CURRENT(%ebx); \
+	pushl %ebx;        \
+	call  SYMBOL_NAME(tms_leave_system); \
+	addl  $4,%esp; 
+#else
+#define HIGHRES_START_CODE
+#define HIGHRES_END_CODE
+#endif
 
 #define SAVE_ALL \
 	cld; \
@@ -95,9 +116,11 @@
 	pushl %ebx; \
 	movl $(__KERNEL_DS),%edx; \
 	movl %edx,%ds; \
-	movl %edx,%es;
+	movl %edx,%es; \
+	HIGHRES_START_CODE;
 
 #define RESTORE_ALL	\
+	HIGHRES_END_CODE; \
 	popl %ebx;	\
 	popl %ecx;	\
 	popl %edx;	\
@@ -128,10 +151,6 @@
 	.long 3b,6b;	\
 .previous
 
-#define GET_CURRENT(reg) \
-	movl $-8192, reg; \
-	andl %esp, reg
-
 ENTRY(lcall7)
 	pushfl			# We get a different stack layout with call gates,
 	pushl %eax		# which has to be cleaned up later..
@@ -286,6 +305,9 @@
 	movl $(__KERNEL_DS),%edx
 	movl %edx,%ds
 	movl %edx,%es
+	pushl %edi
+	HIGHRES_START_CODE
+	popl %edi
 	GET_CURRENT(%ebx)
 	call *%edi
 	addl $8,%esp
--- ./arch/i386/kernel/time.c.highres	Sat Feb  9 20:46:39 2002
+++ ./arch/i386/kernel/time.c	Mon Feb 11 10:13:28 2002
@@ -715,3 +715,34 @@
 	setup_irq(0, &irq0);
 #endif
 }
+
+#ifdef CONFIG_HIGHRES_CPU_USAGE
+unsigned long curr_timestamp(void)
+{
+	register unsigned long eax, edx;
+	rdtsc(eax,edx);
+	return eax;
+}
+
+unsigned long timestamp_usec_diff(unsigned long ts1, unsigned long ts2)
+{
+	register unsigned long eax, rv;
+
+	/* Read the Time Stamp Counter */
+
+	eax = ts1 - ts2;
+	__asm__("mull %2"
+		:"=a" (eax), "=d" (rv)
+		:"rm" (fast_gettimeoffset_quotient),
+		 "0" (eax));
+
+	if (rv >= 1000000) {
+		/* We had some wierd overflow due to the timebases being
+		   changed, just ignore it. */
+		printk("tb went backwards? ts1=%lu, ts2=%ld\n", ts1, ts2);
+		rv = 0;
+	}
+
+	return rv;
+}
+#endif
--- ./kernel/acct.c.highres	Mon Mar 19 14:35:08 2001
+++ ./kernel/acct.c	Sat Feb  9 17:30:04 2002
@@ -296,8 +296,8 @@
 
 	ac.ac_btime = CT_TO_SECS(current->start_time) + (xtime.tv_sec - (jiffies / HZ));
 	ac.ac_etime = encode_comp_t(jiffies - current->start_time);
-	ac.ac_utime = encode_comp_t(current->times.tms_utime);
-	ac.ac_stime = encode_comp_t(current->times.tms_stime);
+	ac.ac_utime = encode_comp_t(tms_time_to_ticks(current->times.tms_utime));
+	ac.ac_stime = encode_comp_t(tms_time_to_ticks(current->times.tms_stime));
 	ac.ac_uid = current->uid;
 	ac.ac_gid = current->gid;
 	ac.ac_tty = (current->tty) ? kdev_t_to_nr(current->tty->device) : 0;
--- ./kernel/timer.c.highres	Mon Oct  8 12:41:41 2001
+++ ./kernel/timer.c	Mon Feb 11 15:23:55 2002
@@ -518,6 +518,8 @@
 	}
 }
 
+#ifndef CONFIG_HIGHRES_CPU_USAGE
+
 static inline void do_process_times(struct task_struct *p,
 	unsigned long user, unsigned long system)
 {
@@ -549,6 +551,144 @@
 	}
 }
 
+#else /* CONFIG_HIGHRES_CPU_USAGE */
+
+/* This spin lock protects the virtual itimer and tms structures while
+   we are messing with it. */
+rwlock_t virt_itimer_lock = RW_LOCK_UNLOCKED;
+
+/* This is assumed to be called more than once a second. */
+static inline void add_usec_to_timeval(struct timeval *tv,
+				       unsigned long  usecs)
+{
+	tv->tv_usec += usecs;
+	if (tv->tv_usec >= 1000000) {
+		tv->tv_usec -= 1000000;
+		tv->tv_sec += 1;
+	}
+}
+
+/* virt_itimer_lock should be held when this is called. */
+static inline void tms_check_virt(struct task_struct *p,
+				  unsigned long      usecs)
+{
+	struct timeval *it_virt = &(p->it_virt_value);
+
+	if (it_virt->tv_sec || it_virt->tv_usec) {
+		/* This is called every tick, so we don't have to
+                   worry about large values of usecs.  It should be
+                   less than 12ms or so, certainly less than a
+                   second. */
+		if ((it_virt->tv_sec == 0) && (it_virt->tv_usec <= usecs)) {
+			*it_virt = p->it_virt_incr;
+			send_sig(SIGVTALRM, p, 1);
+		} else {
+			it_virt->tv_usec -= usecs;
+			if (it_virt->tv_usec < 0) {
+				it_virt->tv_usec += 1000000;
+				it_virt->tv_sec--;
+			}
+		}
+	}
+}
+
+void tms_task_stopped_running(struct task_struct *p)
+{
+	unsigned long timestamp;
+	unsigned long usecs;
+	unsigned long flags;
+
+	write_lock_irqsave(&virt_itimer_lock, flags);
+	timestamp = curr_timestamp();
+
+	/* We have to be running in system code to call this.  But
+	   when called from a kernel thread, the nest count won't be
+	   correct, so fix it. */
+	if (!p->system_code_nest)
+		p->system_code_nest++;
+
+	usecs = timestamp_usec_diff(timestamp, p->last_timestamp);
+	add_usec_to_timeval(&p->times.tms_stime, usecs);
+
+#ifdef CONFIG_VIRT_ITIMER_COUNTS_SYSTEM
+	tms_check_virt(p, usecs);
+#endif
+	write_unlock_irqrestore(&virt_itimer_lock, flags);
+}
+
+void tms_task_started_running(struct task_struct *p)
+{
+	unsigned long flags;
+
+	write_lock_irqsave(&virt_itimer_lock, flags);
+	p->last_timestamp = curr_timestamp();
+	write_unlock_irqrestore(&virt_itimer_lock, flags);
+}
+
+void tms_update_task_times(struct task_struct *p)
+{
+	unsigned long timestamp;
+	unsigned long usecs;
+	unsigned long flags;
+
+	write_lock_irqsave(&virt_itimer_lock, flags);
+	timestamp = curr_timestamp();
+
+	/* We have to be running in system code to call this. */
+	usecs = timestamp_usec_diff(timestamp, p->last_timestamp);
+	add_usec_to_timeval(&p->times.tms_stime, usecs);
+#ifdef CONFIG_VIRT_ITIMER_COUNTS_SYSTEM
+	tms_check_virt(p, usecs);
+#endif
+	p->last_timestamp = timestamp;
+	write_unlock_irqrestore(&virt_itimer_lock, flags);
+}
+
+asmlinkage void tms_enter_system(struct task_struct *p)
+{
+	unsigned long timestamp;
+	unsigned long usecs;
+	unsigned long flags;
+
+	write_lock_irqsave(&virt_itimer_lock, flags);
+	timestamp = curr_timestamp();
+	if (p->system_code_nest) {
+		p->system_code_nest++;
+	} else {
+		/* Leaving userland. */
+		usecs = timestamp_usec_diff(timestamp, p->last_timestamp);
+		
+		add_usec_to_timeval(&p->times.tms_utime, usecs);
+		tms_check_virt(p, usecs);
+		p->last_timestamp = timestamp;
+		p->system_code_nest = 1;
+	}
+	write_unlock_irqrestore(&virt_itimer_lock, flags);
+}
+
+asmlinkage void tms_leave_system(struct task_struct *p)
+{
+	unsigned long timestamp;
+	unsigned long usecs;
+	unsigned long flags;
+
+	write_lock_irqsave(&virt_itimer_lock, flags);
+	timestamp = curr_timestamp();
+
+	p->system_code_nest--;
+	if (! p->system_code_nest) {
+		/* Returning to userland. */
+		usecs = timestamp_usec_diff(timestamp, p->last_timestamp);
+		add_usec_to_timeval(&p->times.tms_stime, usecs);
+#ifdef CONFIG_VIRT_ITIMER_COUNTS_SYSTEM
+		tms_check_virt(p, usecs);
+#endif
+		p->last_timestamp = timestamp;
+	}
+	write_unlock_irqrestore(&virt_itimer_lock, flags);
+}
+#endif /* CONFIG_HIGHRES_CPU_USAGE */
+
 static inline void do_it_prof(struct task_struct *p)
 {
 	unsigned long it_prof = p->it_prof_value;
@@ -567,8 +707,12 @@
 {
 	p->per_cpu_utime[cpu] += user;
 	p->per_cpu_stime[cpu] += system;
+#ifndef CONFIG_HIGHRES_CPU_USAGE
 	do_process_times(p, user, system);
 	do_it_virt(p, user);
+#else
+	tms_update_task_times(p);
+#endif
 	do_it_prof(p);
 }	
 
--- ./kernel/fork.c.highres	Wed Nov 21 12:18:42 2001
+++ ./kernel/fork.c	Sat Feb  9 17:30:04 2002
@@ -626,15 +626,25 @@
 	p->sigpending = 0;
 	init_sigpending(&p->pending);
 
-	p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
-	p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
+	p->it_real_value = p->it_prof_value = 0;
+	p->it_real_incr = p->it_prof_incr = 0;
+#ifndef CONFIG_HIGHRES_CPU_USAGE
+	p->it_virt_value = p->it_virt_incr = 0;
+#else
+	p->it_virt_value.tv_sec = p->it_virt_value.tv_usec = 0;
+	p->it_virt_incr.tv_sec = p->it_virt_incr.tv_usec = 0;
+	/* We start out in system code. */
+	p->system_code_nest = 1;
+#endif
 	init_timer(&p->real_timer);
 	p->real_timer.data = (unsigned long) p;
 
 	p->leader = 0;		/* session leadership doesn't inherit */
 	p->tty_old_pgrp = 0;
-	p->times.tms_utime = p->times.tms_stime = 0;
-	p->times.tms_cutime = p->times.tms_cstime = 0;
+	init_tms_time(&(p->times.tms_utime));
+	init_tms_time(&(p->times.tms_stime));
+	init_tms_time(&(p->times.tms_cutime));
+	init_tms_time(&(p->times.tms_cstime));
 #ifdef CONFIG_SMP
 	{
 		int i;
--- ./kernel/sys.c.highres	Tue Sep 18 16:10:43 2001
+++ ./kernel/sys.c	Sat Feb  9 17:30:04 2002
@@ -789,6 +789,15 @@
 	return old_fsgid;
 }
 
+static inline void tms_internal_to_tms(struct internal_tms *internal,
+				       struct tms          *external)
+{
+	external->tms_utime = tms_time_to_ticks(internal->tms_utime);
+	external->tms_stime = tms_time_to_ticks(internal->tms_stime);
+	external->tms_cutime = tms_time_to_ticks(internal->tms_cutime);
+	external->tms_cstime = tms_time_to_ticks(internal->tms_cstime);
+}
+
 asmlinkage long sys_times(struct tms * tbuf)
 {
 	/*
@@ -797,9 +806,12 @@
 	 *	atomically safe type this is just fine. Conceptually its
 	 *	as if the syscall took an instant longer to occur.
 	 */
-	if (tbuf)
-		if (copy_to_user(tbuf, ¤t->times, sizeof(struct tms)))
+	if (tbuf) {
+		struct tms external;
+		tms_internal_to_tms(¤t->times, &external);
+		if (copy_to_user(tbuf, &external, sizeof(struct tms)))
 			return -EFAULT;
+	}
 	return jiffies;
 }
 
@@ -1157,32 +1169,35 @@
 	memset((char *) &r, 0, sizeof(r));
 	switch (who) {
 		case RUSAGE_SELF:
-			r.ru_utime.tv_sec = CT_TO_SECS(p->times.tms_utime);
-			r.ru_utime.tv_usec = CT_TO_USECS(p->times.tms_utime);
-			r.ru_stime.tv_sec = CT_TO_SECS(p->times.tms_stime);
-			r.ru_stime.tv_usec = CT_TO_USECS(p->times.tms_stime);
+		        r.ru_utime = tms_time_to_timeval(p->times.tms_utime);
+		        r.ru_stime = tms_time_to_timeval(p->times.tms_stime);
 			r.ru_minflt = p->min_flt;
 			r.ru_majflt = p->maj_flt;
 			r.ru_nswap = p->nswap;
 			break;
 		case RUSAGE_CHILDREN:
-			r.ru_utime.tv_sec = CT_TO_SECS(p->times.tms_cutime);
-			r.ru_utime.tv_usec = CT_TO_USECS(p->times.tms_cutime);
-			r.ru_stime.tv_sec = CT_TO_SECS(p->times.tms_cstime);
-			r.ru_stime.tv_usec = CT_TO_USECS(p->times.tms_cstime);
+		        r.ru_utime = tms_time_to_timeval(p->times.tms_cutime);
+		        r.ru_stime = tms_time_to_timeval(p->times.tms_cstime);
 			r.ru_minflt = p->cmin_flt;
 			r.ru_majflt = p->cmaj_flt;
 			r.ru_nswap = p->cnswap;
 			break;
 		default:
-			r.ru_utime.tv_sec = CT_TO_SECS(p->times.tms_utime + p->times.tms_cutime);
-			r.ru_utime.tv_usec = CT_TO_USECS(p->times.tms_utime + p->times.tms_cutime);
-			r.ru_stime.tv_sec = CT_TO_SECS(p->times.tms_stime + p->times.tms_cstime);
-			r.ru_stime.tv_usec = CT_TO_USECS(p->times.tms_stime + p->times.tms_cstime);
+		{
+		        tms_time tmsv;
+
+			tmsv = add_tms_times(p->times.tms_utime,
+					     p->times.tms_cutime);
+		        r.ru_utime = tms_time_to_timeval(tmsv);
+			tmsv = add_tms_times(p->times.tms_stime,
+					     p->times.tms_cstime);
+		        r.ru_stime = tms_time_to_timeval(tmsv);
+
 			r.ru_minflt = p->min_flt + p->cmin_flt;
 			r.ru_majflt = p->maj_flt + p->cmaj_flt;
 			r.ru_nswap = p->nswap + p->cnswap;
 			break;
+		}
 	}
 	return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
 }
--- ./kernel/itimer.c.highres	Thu Jun 29 12:07:36 2000
+++ ./kernel/itimer.c	Sat Feb  9 17:30:04 2002
@@ -58,20 +58,35 @@
 			if ((long) val <= 0)
 				val = 1;
 		}
+		jiffiestotv(val, &value->it_value);
+		jiffiestotv(interval, &value->it_interval);
 		break;
 	case ITIMER_VIRTUAL:
+#ifndef CONFIG_HIGHRES_CPU_USAGE
 		val = current->it_virt_value;
 		interval = current->it_virt_incr;
+		jiffiestotv(val, &value->it_value);
+		jiffiestotv(interval, &value->it_interval);
+#else
+		{
+			unsigned long flags;
+
+			read_lock_irqsave(&virt_itimer_lock, flags);
+			value->it_value = current->it_virt_value;
+			value->it_interval = current->it_virt_incr;
+			read_unlock_irqrestore(&virt_itimer_lock, flags);
+		}
+#endif
 		break;
 	case ITIMER_PROF:
 		val = current->it_prof_value;
 		interval = current->it_prof_incr;
+		jiffiestotv(val, &value->it_value);
+		jiffiestotv(interval, &value->it_interval);
 		break;
 	default:
 		return(-EINVAL);
 	}
-	jiffiestotv(val, &value->it_value);
-	jiffiestotv(interval, &value->it_interval);
 	return 0;
 }
 
@@ -110,12 +125,12 @@
 	register unsigned long i, j;
 	int k;
 
-	i = tvtojiffies(&value->it_interval);
-	j = tvtojiffies(&value->it_value);
 	if (ovalue && (k = do_getitimer(which, ovalue)) < 0)
 		return k;
 	switch (which) {
 		case ITIMER_REAL:
+			i = tvtojiffies(&value->it_interval);
+			j = tvtojiffies(&value->it_value);
 			del_timer_sync(¤t->real_timer);
 			current->it_real_value = j;
 			current->it_real_incr = i;
@@ -128,12 +143,28 @@
 			add_timer(¤t->real_timer);
 			break;
 		case ITIMER_VIRTUAL:
+#ifndef CONFIG_HIGHRES_CPU_USAGE
+			i = tvtojiffies(&value->it_interval);
+			j = tvtojiffies(&value->it_value);
 			if (j)
 				j++;
 			current->it_virt_value = j;
 			current->it_virt_incr = i;
+#else
+			{
+				unsigned long flags;
+
+				write_lock_irqsave(&virt_itimer_lock, flags);
+				current->it_virt_value = value->it_value;
+				current->it_virt_incr = value->it_interval;
+				write_unlock_irqrestore(&virt_itimer_lock,
+							flags);
+			}
+#endif
 			break;
 		case ITIMER_PROF:
+			i = tvtojiffies(&value->it_interval);
+			j = tvtojiffies(&value->it_value);
 			if (j)
 				j++;
 			current->it_prof_value = j;
--- ./kernel/exit.c.highres	Wed Nov 21 16:42:27 2001
+++ ./kernel/exit.c	Sat Feb  9 17:30:04 2002
@@ -535,8 +535,19 @@
 				}
 				goto end_wait4;
 			case TASK_ZOMBIE:
-				current->times.tms_cutime += p->times.tms_utime + p->times.tms_cutime;
-				current->times.tms_cstime += p->times.tms_stime + p->times.tms_cstime;
+				current->times.tms_cutime
+				    = add_tms_times(current->times.tms_utime,
+						    p->times.tms_utime);
+				current->times.tms_cutime
+				    = add_tms_times(current->times.tms_cutime,
+						    p->times.tms_cutime);
+				current->times.tms_cstime
+				    = add_tms_times(current->times.tms_stime,
+						    p->times.tms_stime);
+				current->times.tms_cstime
+				    = add_tms_times(current->times.tms_cstime,
+						    p->times.tms_cstime);
+
 				read_unlock(&tasklist_lock);
 				retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
 				if (!retval && stat_addr)
--- ./kernel/signal.c.highres	Wed Nov 21 18:26:27 2001
+++ ./kernel/signal.c	Sat Feb  9 17:30:04 2002
@@ -742,8 +742,8 @@
 	info.si_uid = tsk->uid;
 
 	/* FIXME: find out whether or not this is supposed to be c*time. */
-	info.si_utime = tsk->times.tms_utime;
-	info.si_stime = tsk->times.tms_stime;
+	info.si_utime = tms_time_to_ticks(tsk->times.tms_utime);
+	info.si_stime = tms_time_to_ticks(tsk->times.tms_stime);
 
 	status = tsk->exit_code & 0x7f;
 	why = SI_KERNEL;	/* shouldn't happen */
--- ./kernel/sched.c.highres	Thu Jan 10 15:52:14 2002
+++ ./kernel/sched.c	Sat Feb  9 17:30:04 2002
@@ -694,6 +694,8 @@
 	 * This just switches the register state and the
 	 * stack.
 	 */
+	tms_task_stopped_running(prev);
+	tms_task_started_running(next);
 	switch_to(prev, next, prev);
 	__schedule_tail(prev);
 
--- ./Documentation/Configure.help.highres	Thu Jan 10 15:52:12 2002
+++ ./Documentation/Configure.help	Sat Feb  9 17:30:04 2002
@@ -19794,6 +19794,19 @@
 
   "Area6" will work for most boards. For ADX, select "Area5".
 
+High resolution CPU usage and virtual itimers
+CONFIG_HIGHRES_CPU_USAGE
+  If you say Y here, the cpu usage tracked for programs and virtual itimers
+  will be much higher precision.  The downside of this is that the system
+  call overhead will go up some (every entry into and out of the kernel must
+  be tracked).  Also, the architecture must support high-res timers of
+  some kind and the support must be added to the architecture code.
+
+Does the virtual itimer count system CPU usage?
+CONFIG_VIRT_ITIMER_COUNTS_SYSTEM
+  If highres CPU is counted, this sets whether virtual itimers count system
+  CPU used towards the amount of CPU until the virtual timer goes off.
+
 #
 # m68k-specific kernel options
 # Documented by Chris Lawrence <mailto:quango@themall.net> et al.
--- ./include/linux/sched.h.highres	Sun Jan 13 14:52:51 2002
+++ ./include/linux/sched.h	Sun Feb 10 20:19:05 2002
@@ -278,6 +278,124 @@
 extern struct user_struct root_user;
 #define INIT_USER (&root_user)
 
+#ifndef CONFIG_HIGHRES_CPU_USAGE
+typedef clock_t tms_time;
+#else
+typedef struct timeval tms_time;
+#endif
+
+struct internal_tms {
+        tms_time tms_utime;
+	tms_time tms_stime;
+	tms_time tms_cutime;
+	tms_time tms_cstime;
+};
+
+#ifndef CONFIG_HIGHRES_CPU_USAGE
+static inline clock_t tms_time_to_ticks(tms_time tv)
+{
+	return tv;
+}
+
+static inline tms_time add_tms_times(tms_time tv1, tms_time tv2)
+{
+	return tv1 + tv2;
+}
+
+static inline void init_tms_time(tms_time *tv)
+{
+	*tv = 0;
+}
+
+static inline struct timeval tms_time_to_timeval(tms_time tv)
+{
+	struct timeval rv;
+
+	rv.tv_sec = CT_TO_SECS(tv);
+	rv.tv_usec = CT_TO_USECS(tv);
+	return rv;
+}
+
+/* Called when a running task is stopped. */
+#define tms_task_stopped_running(p)
+
+/* Called when a task that was not running is started. */
+#define tms_task_started_running(p)
+
+/* Called periodically in case a task is taking all the CPU to avoid
+   curr_usec_time() overflowing. */
+#define tms_update_task_times(p)
+
+#else
+
+/* This spin lock protects the virtual itimer structure while we are
+   messing with it. */
+extern rwlock_t virt_itimer_lock;
+
+static inline clock_t tms_time_to_ticks(tms_time tv)
+{
+	clock_t ticks;
+
+	ticks = tv.tv_sec * HZ;
+	ticks += tv.tv_usec / (1000000 / HZ);
+	return ticks;
+}
+
+static inline tms_time add_tms_times(tms_time tv1,
+				     tms_time tv2)
+{
+	tms_time rv;
+
+	rv.tv_sec = tv1.tv_sec + tv2.tv_sec;
+	rv.tv_usec = tv1.tv_usec + tv2.tv_usec;
+	if (rv.tv_usec >= 1000000) {
+		rv.tv_usec -= 1000000;
+		rv.tv_sec += 1;
+	}
+
+	return rv;
+}
+
+static inline void init_tms_time(tms_time *tv)
+{
+	tv->tv_sec = 0;
+	tv->tv_usec = 0;
+}
+
+static inline struct timeval tms_time_to_timeval(tms_time tv)
+{
+	return tv;
+}
+
+/* Return a current timestamp from the architecture. */
+extern unsigned long curr_timestamp(void);
+
+/* Return the difference in microseconds of ts1-ts2.  This is called
+   by the highres usage time code to get microsecond resolution from
+   some highres clock. */
+extern unsigned long timestamp_usec_diff(unsigned long ts1, unsigned long ts2);
+
+struct task_struct;
+
+/* Called when a running task is stopped. */
+void tms_task_stopped_running(struct task_struct *p);
+
+/* Called when a task that was not running is started. */
+void tms_task_started_running(struct task_struct *p);
+
+/* Called periodically in case a task is taking all the CPU to avoid
+   curr_usec_time() overflowing. */
+void tms_update_task_times(struct task_struct *p);
+
+/* Called when we enter system code for a process. */
+void tms_enter_system(struct task_struct *p);
+
+/* Called when leave system code for a process. */
+void tms_leave_system(struct task_struct *p);
+
+#endif
+
+
 struct task_struct {
 	/*
 	 * offsets of these are hardcoded elsewhere - touch with care
@@ -355,10 +473,22 @@
 	wait_queue_head_t wait_chldexit;	/* for wait4() */
 	struct completion *vfork_done;		/* for vfork() */
 	unsigned long rt_priority;
-	unsigned long it_real_value, it_prof_value, it_virt_value;
-	unsigned long it_real_incr, it_prof_incr, it_virt_incr;
+	unsigned long it_real_value, it_prof_value;
+#ifndef CONFIG_HIGHRES_CPU_USAGE
+        unsigned long it_virt_value;
+#else
+        struct timeval it_virt_value;
+        unsigned long last_timestamp;   /* The last usec time taken for the process. */
+        unsigned long system_code_nest; /* How nested am I in system code? */
+#endif
+        unsigned long it_real_incr, it_prof_incr;
+#ifndef CONFIG_HIGHRES_CPU_USAGE
+        unsigned long it_virt_incr;
+#else
+        struct timeval it_virt_incr;
+#endif
 	struct timer_list real_timer;
-	struct tms times;
+	struct internal_tms times;
 	unsigned long start_time;
 	long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS];
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
@@ -451,6 +581,12 @@
 #define DEF_NICE	(0)
 
 
+#ifdef CONFIG_HIGHRES_CPU_USAGE
+#define INIT_SYSTEM_CODE_NEST	system_code_nest:		1,
+#else
+#define INIT_SYSTEM_CODE_NEST
+#endif
+
 /*
  * The default (Linux) execution domain.
  */
@@ -501,6 +637,7 @@
     blocked:		{{0}},						\
     alloc_lock:		SPIN_LOCK_UNLOCKED,				\
     journal_info:	NULL,						\
+    INIT_SYSTEM_CODE_NEST						\
 }
 
 
--- ./include/asm-i386/hw_irq.h.highres	Mon Feb 11 15:10:39 2002
+++ ./include/asm-i386/hw_irq.h	Mon Feb 11 15:19:50 2002
@@ -95,6 +95,22 @@
 #define __STR(x) #x
 #define STR(x) __STR(x)
 
+#define GET_CURRENT \
+	"movl %esp, %ebx\n\t" \
+	"andl $-8192, %ebx\n\t"
+
+#ifdef CONFIG_HIGHRES_CPU_USAGE
+#define HIGHRES_START_CODE \
+	"pushl %eax\n\t"		          \
+	GET_CURRENT				  \
+	"pushl %ebx\n\t"			  \
+	"call  " SYMBOL_NAME_STR(tms_enter_system) "\n\t" \
+	"addl  $4,%esp\n\t"			  \
+	"popl %eax\n\t" 
+#else
+#define HIGHRES_START_CODE
+#endif
+
 #define SAVE_ALL \
 	"cld\n\t" \
 	"pushl %es\n\t" \
@@ -108,14 +124,11 @@
 	"pushl %ebx\n\t" \
 	"movl $" STR(__KERNEL_DS) ",%edx\n\t" \
 	"movl %edx,%ds\n\t" \
-	"movl %edx,%es\n\t"
+	"movl %edx,%es\n\t" \
+        HIGHRES_START_CODE
 
 #define IRQ_NAME2(nr) nr##_interrupt(void)
 #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
-
-#define GET_CURRENT \
-	"movl %esp, %ebx\n\t" \
-	"andl $-8192, %ebx\n\t"
 
 /*
  *	SMP has a few special interrupts for IPI messages
--------------070202060507000102030408--
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/