<!-- received="Sun Oct 10 08:13:59 1999 EET DST" -->
<!-- sent="Sun, 10 Oct 1999 06:04:44 +0100" -->
<!-- name="Artur Skawina" -->
<!-- email="skawina@geocities.com" -->
<!-- subject="Re: SCHED_YIELD again" -->
<!-- id="" -->
<!-- inreplyto="SCHED_YIELD again" -->
<title>Linux-kernel mailing list archive 1999-41: Re: SCHED_YIELD again</title>
<body bgcolor="#FFFFFF"><font face="Arial,Helvetica">
<h1>Re: SCHED_YIELD again</h1>
<b>Artur Skawina</b> (<a href="mailto:skawina@geocities.com"><i>skawina@geocities.com</i></a>)<br>
<i>Sun, 10 Oct 1999 06:04:44 +0100</i>
<p>
<ul>
<li> <b>Messages sorted by:</b> <a href="date.html#49">[ date ]</a><a href="index.html#49">[ thread ]</a><a href="subject.html#49">[ subject ]</a><a href="author.html#49">[ author ]</a>
<!-- next="start" -->
<li> <b>Next message:</b> <a href="0050.html">Alexander Viro: "Re: [PATCH] devfs v123 available"</a>
<li> <b>Previous message:</b> <a href="0048.html">Brandon S. Allbery KF8NH: "Re: [linux-usb] Re: USB device allocation"</a>
<li> <b>Maybe in reply to:</b> <a href="0008.html">Borislav Deianov: "SCHED_YIELD again"</a>
<!-- nextthread="start" -->
<li> <b>Next in thread:</b> <a href="0112.html">Artur Skawina: "Re: SCHED_YIELD again"</a>
<!-- reply="end" -->
</ul>
<hr>
<!-- body="start" -->
Borislav Deianov wrote:<br>
<i>&gt; </i><br>
<i>&gt; There's a slight change in behaviour: if a process is preempted</i><br>
<i>&gt; (without sched_yield) and there's another process with exactly the</i><br>
<i>&gt; same priority, the other process gets to run. In the old code the</i><br>
<i>&gt; previous process runs again. This shouldn't be a problem.</i><br>
<p>
Consider what happens with 2+ equal priority SCHED_FIFO processes...<br>
<p>
[I didn't look at the patch closely; there might be more problems.<br>
 Fixing the scheduler w/o (1) introducing new bugs, and (2) making<br>
 it slower isn't as simple as it seems at first sight. Also, there<br>
 are more problems with the stock scheduler than just yield()<br>
 (like completely broken SCHED_RR and wrong SCHED_YIELD assumptions).<br>
 I'll attach my current snapshot, which fixes all known issues<br>
 (incl. all reported here in the last few months), and doesn't<br>
 change behaviour at all (except when the old one was wrong).<br>
 [Yep, the idle support is in there too -- removing it wouldn't<br>
 change _anything_ in the scheduler itself; as this patch isn't<br>
 production-ready yet anyway, there's no point.]<br>
 The only issue left is, IIRC, the (external) SCHED_YIELD assumptions.]<br>
 <br>
<p>
diff -urNp /img/linux-2.3.19/include/linux/sched.h linux-2.3.19as/include/linux/sched.h<br>
--- /img/linux-2.3.19/include/linux/sched.h	Sat Sep 11 00:07:24 1999<br>
+++ linux-2.3.19as/include/linux/sched.h	Wed Oct  6 19:52:01 1999<br>
@@ -109,6 +109,7 @@ extern int last_pid;<br>
 #define SCHED_OTHER		0<br>
 #define SCHED_FIFO		1<br>
 #define SCHED_RR		2<br>
+#define SCHED_IDLE		4<br>
 <br>
 /*<br>
  * This is an additional bit set when we want to<br>
@@ -311,7 +312,8 @@ struct task_struct {<br>
 <br>
 	wait_queue_head_t wait_chldexit;	/* for wait4() */<br>
 	struct semaphore *vfork_sem;		/* for vfork() */<br>
-	unsigned long policy, rt_priority;<br>
+	unsigned long policy;<br>
+	long rt_priority;<br>
 	unsigned long it_real_value, it_prof_value, it_virt_value;<br>
 	unsigned long it_real_incr, it_prof_incr, it_virt_incr;<br>
 	struct timer_list real_timer;<br>
@@ -364,6 +366,7 @@ struct task_struct {<br>
 					/* Not implemented yet, only for 486*/<br>
 #define PF_STARTING	0x00000002	/* being created */<br>
 #define PF_EXITING	0x00000004	/* getting shut down */<br>
+#define PF_IDLE		0x00000008	/* set for a SCHED_IDLE process */<br>
 #define PF_PTRACED	0x00000010	/* set if ptrace (0) has been called */<br>
 #define PF_TRACESYS	0x00000020	/* tracing system calls */<br>
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */<br>
diff -urNp /img/linux-2.3.19/kernel/sched.c linux-2.3.19as/kernel/sched.c<br>
--- /img/linux-2.3.19/kernel/sched.c	Fri Aug 27 13:13:14 1999<br>
+++ linux-2.3.19as/kernel/sched.c	Wed Oct  6 19:31:12 1999<br>
@@ -15,6 +15,8 @@<br>
  *				Copyright (C) 1998  Andrea Arcangeli<br>
  *  1998-12-28  Implemented better SMP scheduling by Ingo Molnar<br>
  *  1999-03-10	Improved NTP compatibility by Ulrich Windl<br>
+ *  1999-07-29  SCHED_IDLE support by Artur Skawina<br>
+ *  1999-07-30  Fixed sched_yield() by Artur Skawina<br>
  */<br>
 <br>
 /*<br>
@@ -155,17 +157,27 @@ void scheduling_functions_start_here(voi<br>
  *	 +1000: realtime process, select this.<br>
  */<br>
 <br>
+#define GOODNESS_MIN    (-1000)  /* goodness value of the real idle task(s) */<br>
+#define GOODNESS_MAX    1000     /* max 'normal' goodness; RT processes have more */<br>
+<br>
 static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)<br>
 {<br>
 	int weight;<br>
 <br>
-	/*<br>
-	 * Realtime process, select the first one on the<br>
-	 * runqueue (taking priorities within processes<br>
-	 * into account).<br>
-	 */<br>
 	if (p-&gt;policy != SCHED_OTHER) {<br>
-		weight = 1000 + p-&gt;rt_priority;<br>
+		/* We get here if:<br>
+		 *  o This is a FIFO, RR, or IDLE process.<br>
+		 *      Return the static priority (set by user).<br>
+		 *      For realtime this is 1001...1100, for idle -999...-900.<br>
+		 *  o This process came from sched_yield() [has SCHED_YIELD flag].<br>
+		 *      This process wants to give others a chance to run.<br>
+		 *      It is still runnable, has already been moved to the end<br>
+		 *      of the runqueue, and won't get any preferential treatment.<br>
+		 *      For SCHED_OTHER processes return 0 (note this also<br>
+		 *      causes counter recalculation if this process wins).<br>
+		 *      For FIFO, RR, and IDLE processes return static priority.<br>
+		 */<br>
+		weight = p-&gt;rt_priority;<br>
 		goto out;<br>
 	}<br>
 <br>
@@ -197,24 +209,6 @@ out:<br>
 }<br>
 <br>
 /*<br>
- * subtle. We want to discard a yielded process only if it's being<br>
- * considered for a reschedule. Wakeup-time 'queries' of the scheduling<br>
- * state do not count. Another optimization we do: sched_yield()-ed<br>
- * processes are runnable (and thus will be considered for scheduling)<br>
- * right when they are calling schedule(). So the only place we need<br>
- * to care about SCHED_YIELD is when we calculate the previous process'<br>
- * goodness ...<br>
- */<br>
-static inline int prev_goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)<br>
-{<br>
-	if (p-&gt;policy &amp; SCHED_YIELD) {<br>
-		p-&gt;policy &amp;= ~SCHED_YIELD;<br>
-		return 0;<br>
-	}<br>
-	return goodness(p, this_cpu, this_mm);<br>
-}<br>
-<br>
-/*<br>
  * the 'goodness value' of replacing a process on a given CPU.<br>
  * positive value means 'replace', zero or negative means 'dont'.<br>
  */<br>
@@ -575,8 +569,11 @@ asmlinkage void schedule(void)<br>
 tq_scheduler_back:<br>
 <br>
 	prev = current;<br>
+#ifdef __SMP__<br>
 	this_cpu = prev-&gt;processor;<br>
-<br>
+#else<br>
+	this_cpu = smp_processor_id(); /* it's not like we have more than one */<br>
+#endif<br>
 	if (in_interrupt())<br>
 		goto scheduling_in_interrupt;<br>
 <br>
@@ -596,7 +593,7 @@ handle_bh_back:<br>
 	spin_lock_irq(&amp;runqueue_lock);<br>
 <br>
 	/* move an exhausted RR process to be last.. */<br>
-	if (prev-&gt;policy == SCHED_RR)<br>
+	if (prev-&gt;policy&amp;(SCHED_RR|SCHED_IDLE))<br>
 		goto move_rr_last;<br>
 move_rr_back:<br>
 <br>
@@ -621,24 +618,25 @@ repeat_schedule:<br>
 	 * Default process to select..<br>
 	 */<br>
 	next = idle_task(this_cpu);<br>
-	c = -1000;<br>
+	c = GOODNESS_MIN;<br>
+	<br>
 	if (prev-&gt;state == TASK_RUNNING)<br>
 		goto still_running;<br>
 still_running_back:<br>
 <br>
-	tmp = runqueue_head.next;<br>
-	while (tmp != &amp;runqueue_head) {<br>
+	for (tmp = runqueue_head.next; tmp != &amp;runqueue_head; tmp = tmp-&gt;next) {<br>
 		p = list_entry(tmp, struct task_struct, run_list);<br>
 		if (can_schedule(p)) {<br>
 			int weight = goodness(p, this_cpu, prev-&gt;active_mm);<br>
 			if (weight &gt; c)<br>
 				c = weight, next = p;<br>
 		}<br>
-		tmp = tmp-&gt;next;<br>
 	}<br>
 <br>
+	prev-&gt;policy &amp;= ~SCHED_YIELD;<br>
+<br>
 	/* Do we need to re-calculate counters? */<br>
-	if (!c)<br>
+	if (c==0)<br>
 		goto recalculate;<br>
 	/*<br>
 	 * from this point on nothing can prevent us from<br>
@@ -739,8 +737,15 @@ recalculate:<br>
 	goto repeat_schedule;<br>
 <br>
 still_running:<br>
-	c = prev_goodness(prev, this_cpu, prev-&gt;active_mm);<br>
-	next = prev;<br>
+	/* If last process is still running prefer it over others<br>
+	 * with equal goodness. Except if it has just called<br>
+	 * sched_yield(), then treat it like any other process.<br>
+	 */<br>
+	if (!(prev-&gt;policy&amp;SCHED_YIELD))<br>
+	{<br>
+		c = goodness(prev, this_cpu, prev-&gt;active_mm);<br>
+		next = prev;<br>
+	}<br>
 	goto still_running_back;<br>
 <br>
 handle_bh:<br>
@@ -752,8 +757,9 @@ handle_tq_scheduler:<br>
 	goto tq_scheduler_back;<br>
 <br>
 move_rr_last:<br>
-	if (!prev-&gt;counter) {<br>
+	if (prev-&gt;counter==0) {<br>
 		prev-&gt;counter = prev-&gt;priority;<br>
+		prev-&gt;policy |= SCHED_YIELD;<br>
 		move_last_runqueue(prev);<br>
 	}<br>
 	goto move_rr_back;<br>
@@ -1515,13 +1521,13 @@ static int setscheduler(pid_t pid, int p<br>
 	else {<br>
 		retval = -EINVAL;<br>
 		if (policy != SCHED_FIFO &amp;&amp; policy != SCHED_RR &amp;&amp;<br>
-				policy != SCHED_OTHER)<br>
+				policy != SCHED_OTHER &amp;&amp; policy != SCHED_IDLE)<br>
 			goto out_unlock;<br>
 	}<br>
 	<br>
 	/*<br>
-	 * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid<br>
-	 * priority for SCHED_OTHER is 0.<br>
+	 * Valid priorities for SCHED_FIFO, SCHED_RR and SCHED_IDLE<br>
+	 * are 1..99, valid priority for SCHED_OTHER is 0.<br>
 	 */<br>
 	retval = -EINVAL;<br>
 	if (lp.sched_priority &lt; 0 || lp.sched_priority &gt; 99)<br>
@@ -1530,7 +1536,7 @@ static int setscheduler(pid_t pid, int p<br>
 		goto out_unlock;<br>
 <br>
 	retval = -EPERM;<br>
-	if ((policy == SCHED_FIFO || policy == SCHED_RR) &amp;&amp; <br>
+	if ((policy == SCHED_FIFO || policy == SCHED_RR || policy == SCHED_IDLE) &amp;&amp; <br>
 	    !capable(CAP_SYS_NICE))<br>
 		goto out_unlock;<br>
 	if ((current-&gt;euid != p-&gt;euid) &amp;&amp; (current-&gt;euid != p-&gt;uid) &amp;&amp;<br>
@@ -1538,8 +1544,22 @@ static int setscheduler(pid_t pid, int p<br>
 		goto out_unlock;<br>
 <br>
 	retval = 0;<br>
-	p-&gt;policy = policy;<br>
+	/* prevent changing policy of an already idle process to SCHED_IDLE */<br>
+	if ( (policy!=SCHED_IDLE) || !(p-&gt;flags&amp;PF_IDLE) )<br>
+		p-&gt;policy = policy;<br>
 	p-&gt;rt_priority = lp.sched_priority;<br>
+	switch ( policy ) /* move rt_priority into proper range, update flags */<br>
+	{<br>
+	case SCHED_IDLE:	p-&gt;rt_priority  += GOODNESS_MIN;<br>
+				p-&gt;flags        |= PF_IDLE;<br>
+				break;<br>
+	case SCHED_RR:<br>
+	case SCHED_FIFO:	p-&gt;rt_priority  += GOODNESS_MAX;<br>
+	default:<br>
+	/*case SCHED_OTHER:*/<br>
+				p-&gt;flags        &amp;= ~PF_IDLE;<br>
+	}<br>
+	<br>
 	if (task_on_runqueue(p))<br>
 		move_first_runqueue(p);<br>
 <br>
@@ -1580,7 +1600,7 @@ asmlinkage long sys_sched_getscheduler(p<br>
 	if (!p)<br>
 		goto out_unlock;<br>
 			<br>
-	retval = p-&gt;policy;<br>
+	retval = (p-&gt;flags&amp;PF_IDLE) ? SCHED_IDLE : p-&gt;policy;<br>
 <br>
 out_unlock:<br>
 	read_unlock(&amp;tasklist_lock);<br>
@@ -1605,6 +1625,12 @@ asmlinkage long sys_sched_getparam(pid_t<br>
 	if (!p)<br>
 		goto out_unlock;<br>
 	lp.sched_priority = p-&gt;rt_priority;<br>
+	switch ( p-&gt;policy )<br>
+	{<br>
+	default: /* case SCHED_OTHER: */  if ( !(p-&gt;flags&amp;PF_IDLE) ) break;<br>
+	case SCHED_IDLE:                  lp.sched_priority -= GOODNESS_MIN; break;<br>
+	case SCHED_RR: case SCHED_FIFO:   lp.sched_priority -= GOODNESS_MAX; break;<br>
+	}<br>
 	read_unlock(&amp;tasklist_lock);<br>
 <br>
 	/*<br>
@@ -1622,12 +1648,25 @@ out_unlock:<br>
 <br>
 asmlinkage long sys_sched_yield(void)<br>
 {<br>
-	spin_lock_irq(&amp;runqueue_lock);<br>
-	if (current-&gt;policy == SCHED_OTHER)<br>
+/*<br>
+ * A thread calling sched_yield() wants to give up its timeslice and let<br>
+ * other equal priority threads to run.<br>
+ * We optimize by ignoring the request completely when the number of<br>
+ * currently runnable processes is &lt;= the number of available CPUs.<br>
+ * IOW on UP sched_yield() does nothing if this is the only process; on<br>
+ * MP we don't reschedule if there are no processes waiting for a CPU.<br>
+ * [note 'nr_running' is accessed w/o obtaining the lock, but that's ok<br>
+ * - an application can not use yield() as a synchronization mechanism<br>
+ * anyway -- if we sometimes miss or do an extra reschedule that's fine]<br>
+ */<br>
+	if ( nr_running&gt;smp_num_cpus )<br>
+	{<br>
+		spin_lock_irq(&amp;runqueue_lock);<br>
 		current-&gt;policy |= SCHED_YIELD;<br>
-	current-&gt;need_resched = 1;<br>
-	move_last_runqueue(current);<br>
-	spin_unlock_irq(&amp;runqueue_lock);<br>
+		current-&gt;need_resched = 1;<br>
+		move_last_runqueue(current);<br>
+		spin_unlock_irq(&amp;runqueue_lock);<br>
+	}<br>
 	return 0;<br>
 }<br>
 <br>
@@ -1638,6 +1677,7 @@ asmlinkage long sys_sched_get_priority_m<br>
 	switch (policy) {<br>
 	case SCHED_FIFO:<br>
 	case SCHED_RR:<br>
+	case SCHED_IDLE:<br>
 		ret = 99;<br>
 		break;<br>
 	case SCHED_OTHER:<br>
@@ -1654,6 +1694,7 @@ asmlinkage long sys_sched_get_priority_m<br>
 	switch (policy) {<br>
 	case SCHED_FIFO:<br>
 	case SCHED_RR:<br>
+	case SCHED_IDLE:<br>
 		ret = 1;<br>
 		break;<br>
 	case SCHED_OTHER:<br>
@@ -1664,13 +1705,25 @@ asmlinkage long sys_sched_get_priority_m<br>
 <br>
 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)<br>
 {<br>
-	struct timespec t;<br>
+	struct timespec    t;<br>
+	struct task_struct *p;<br>
+	int                retval;<br>
 <br>
-	t.tv_sec = 0;<br>
-	t.tv_nsec = 150000;<br>
-	if (copy_to_user(interval, &amp;t, sizeof(struct timespec)))<br>
-		return -EFAULT;<br>
-	return 0;<br>
+	if (pid &lt; 0)<br>
+		return -EINVAL;<br>
+<br>
+	retval = -ESRCH;<br>
+<br>
+	read_lock(&amp;tasklist_lock);<br>
+	p = find_process_by_pid(pid);<br>
+	if (p)<br>
+		jiffies_to_timespec(p-&gt;priority, &amp;t);<br>
+	read_unlock(&amp;tasklist_lock);<br>
+<br>
+	if (p)<br>
+		retval = copy_to_user(interval, &amp;t, sizeof(struct timespec)) ? -EFAULT : 0;<br>
+<br>
+	return retval;<br>
 }<br>
 <br>
 asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)<br>
@@ -1686,7 +1739,7 @@ asmlinkage long sys_nanosleep(struct tim<br>
 <br>
 <br>
 	if (t.tv_sec == 0 &amp;&amp; t.tv_nsec &lt;= 2000000L &amp;&amp;<br>
-	    current-&gt;policy != SCHED_OTHER)<br>
+	    (current-&gt;policy &amp; (SCHED_FIFO|SCHED_RR)))<br>
 	{<br>
 		/*<br>
 		 * Short delay requests up to 2 ms will be handled with<br>
@@ -1694,10 +1747,18 @@ asmlinkage long sys_nanosleep(struct tim<br>
 		 *<br>
 		 * Its important on SMP not to do this holding locks.<br>
 		 */<br>
-		udelay((t.tv_nsec + 999) / 1000);<br>
+		for ( expire=t.tv_nsec; ((long)expire)&gt;0; expire-=10000 )<br>
+		{<br>
+			if ( current-&gt;need_resched ) /* there may be a higher priority */<br>
+			{                            /*  RT thread waiting...          */<br>
+				t.tv_nsec = expire;  /* substract the time we've slept */<br>
+				goto schedule;       /* let the scheduler to its job   */<br>
+			}<br>
+			udelay(10);                  /* busy loop for ~10us (10000ns) */<br>
+		}<br>
 		return 0;<br>
 	}<br>
-<br>
+schedule:<br>
 	expire = timespec_to_jiffies(&amp;t) + (t.tv_sec || t.tv_nsec);<br>
 <br>
 	current-&gt;state = TASK_INTERRUPTIBLE;<br>
@@ -1831,12 +1892,10 @@ void __init sched_init(void)<br>
 	 * process right in SMP mode.<br>
 	 */<br>
 	int cpu=hard_smp_processor_id();<br>
-	int nr;<br>
 <br>
 	init_task.processor=cpu;<br>
 <br>
-	for(nr = 0; nr &lt; PIDHASH_SZ; nr++)<br>
-		pidhash[nr] = NULL;<br>
+	memset( pidhash, 0, PIDHASH_SZ*sizeof(pidhash[0])); /* clear pidhash[] */<br>
 <br>
 	init_bh(TIMER_BH, timer_bh);<br>
 	init_bh(TQUEUE_BH, tqueue_bh);<br>
<p>
<p>
-<br>
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in<br>
the body of a message to majordomo@vger.rutgers.edu<br>
Please read the FAQ at <a href="http://www.tux.org/lkml/">http://www.tux.org/lkml/</a><br>
<!-- body="end" -->
<hr>
<p>
<ul>
<!-- next="start" -->
<li> <b>Next message:</b> <a href="0050.html">Alexander Viro: "Re: [PATCH] devfs v123 available"</a>
<li> <b>Previous message:</b> <a href="0048.html">Brandon S. Allbery KF8NH: "Re: [linux-usb] Re: USB device allocation"</a>
<li> <b>Maybe in reply to:</b> <a href="0008.html">Borislav Deianov: "SCHED_YIELD again"</a>
<!-- nextthread="start" -->
<li> <b>Next in thread:</b> <a href="0112.html">Artur Skawina: "Re: SCHED_YIELD again"</a>
<!-- reply="end" -->
</ul>
</font></body>
