OK.
> I guess the store could get reordered (and so delayed) a bit, but a wmb
> is relatively expensive on some architectures.
> 
> > This is actually fairly pointless, because these fields are write-mostly and
> > read-rarely.  But we need a spinlock anyway because of the concurrent
> > modifiers problem.
> 
> It would be interesting to compare a spinlock or rwlock against a frlock
> in a write-mostly situation. Actually isn't it going to be slower because
> we have to take a spinlock to serialise around the frlock write path?
A little bit, if it is very write-mostly.  But the gains on the read side
will compensate.
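To make the tradeoff concrete, here is a minimal sketch of how the two
sides pair up with the fr_* API from the patch below (the `shared'
structure and its fields are made up for illustration):

	static frlock_t shared_lock = FR_LOCK_UNLOCKED;
	static struct { unsigned long a, b; } shared;

	/* Writers serialise on the internal spinlock and bump the
	 * sequence counters on either side of the update. */
	void update_shared(unsigned long a, unsigned long b)
	{
		fr_write_lock(&shared_lock);
		shared.a = a;
		shared.b = b;
		fr_write_unlock(&shared_lock);
	}

	/* Readers take no lock: snapshot post_sequence, read the
	 * data, retry if pre_sequence has moved past the snapshot. */
	unsigned long read_shared(void)
	{
		unsigned seq;
		unsigned long sum;

		do {
			seq = fr_read_begin(&shared_lock);
			sum = shared.a + shared.b;
		} while (seq != fr_read_end(&shared_lock));

		return sum;
	}

So the write side pays the spin_lock()/spin_unlock() plus two counter
increments and two wmb()s, which is the overhead in question above; the
read side is just two counter loads and two rmb()s when no write
intervenes.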
Here is Stephen's implementation:
 linux/frlock.h |  100 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 linux/time.h   |    3 +
 time.c         |   31 +++++++++--------
 timer.c        |   22 ++++++------
 4 files changed, 131 insertions(+), 25 deletions(-)
diff -puN /dev/null include/linux/frlock.h
--- /dev/null	2002-08-30 16:31:37.000000000 -0700
+++ linux-hype-akpm/include/linux/frlock.h	2003-01-25 18:35:14.000000000 -0800
@@ -0,0 +1,100 @@
+#ifndef __LINUX_FRLOCK_H
+#define __LINUX_FRLOCK_H
+
+/*
+ * Fast read-write spinlocks.
+ *
+ * Fast reader/writer locks without starving writers. This type of
+ * lock is for data where the reader wants a consistent set of
+ * information and is willing to retry if the information changes.
+ * Readers never block but they may have to retry if a write is in
+ * progress. Writers do not wait for readers.
+ *
+ * Generalization of the sequence variables used for gettimeofday
+ * on x86-64 by Andrea Arcangeli.
+ *
+ * This is not as cache friendly as brlock. Also, this will not work
+ * for data that contains pointers, because any writer could
+ * invalidate a pointer that a reader was following.
+ *
+ * Expected reader usage:
+ *
+ *	do {
+ *		seq = fr_read_begin(&lock);
+ *		...
+ *	} while (seq != fr_read_end(&lock));
+ *
+ * On non-SMP the spin locks disappear but the writer still needs
+ * to increment the sequence variables because an interrupt routine could
+ * change the state of the data.
+ */
+
+#include <linux/config.h>
+#include <linux/spinlock.h>
+
+typedef struct {
+	spinlock_t lock;
+	unsigned pre_sequence;
+	unsigned post_sequence;
+} frlock_t;
+
+#define FR_LOCK_UNLOCKED	(frlock_t) { SPIN_LOCK_UNLOCKED, 0, 0 }
+#define frlock_init(x)		do { *(x) = FR_LOCK_UNLOCKED; } while (0)
+
+static inline void fr_write_lock(frlock_t *rw)
+{
+	spin_lock(&rw->lock);
+	rw->pre_sequence++;
+	wmb();
+}
+
+static inline void fr_write_unlock(frlock_t *rw)
+{
+	wmb();
+	rw->post_sequence++;
+	spin_unlock(&rw->lock);
+}
+
+static inline int fr_write_trylock(frlock_t *rw)
+{
+	int ret = spin_trylock(&rw->lock);
+
+	if (ret) {
+		++rw->pre_sequence;
+		wmb();
+	}
+	return ret;
+}
+
+/* Snapshot post_sequence before the read side enters its critical section. */
+static inline unsigned fr_read_begin(frlock_t *rw)
+{
+	unsigned ret = rw->post_sequence;
+	rmb();
+	return ret;
+}
+
+static inline unsigned fr_read_end(frlock_t *rw)
+{
+	rmb();
+	return rw->pre_sequence;
+}
+
+/*
+ * Possible sw/hw IRQ protected versions of the interfaces.
+ */
+#define fr_write_lock_irqsave(lock, flags) \
+	do { local_irq_save(flags); fr_write_lock(lock); } while (0)
+#define fr_write_lock_irq(lock) \
+	do { local_irq_disable(); fr_write_lock(lock); } while (0)
+#define fr_write_lock_bh(lock) \
+	do { local_bh_disable(); fr_write_lock(lock); } while (0)
+
+#define fr_write_unlock_irqrestore(lock, flags) \
+	do { fr_write_unlock(lock); local_irq_restore(flags); } while (0)
+#define fr_write_unlock_irq(lock) \
+	do { fr_write_unlock(lock); local_irq_enable(); } while (0)
+#define fr_write_unlock_bh(lock) \
+	do { fr_write_unlock(lock); local_bh_enable(); } while (0)
+
+#endif /* __LINUX_FRLOCK_H */
diff -puN include/linux/time.h~frlock-xtime include/linux/time.h
--- linux-hype/include/linux/time.h~frlock-xtime	2003-01-25 18:11:45.000000000 -0800
+++ linux-hype-akpm/include/linux/time.h	2003-01-25 18:11:45.000000000 -0800
@@ -25,6 +25,7 @@ struct timezone {
 #ifdef __KERNEL__
 
 #include <linux/spinlock.h>
+#include <linux/frlock.h>
 
 /*
  * Change timeval to jiffies, trying to avoid the
@@ -120,7 +121,7 @@ mktime (unsigned int year, unsigned int 
 }
 
 extern struct timespec xtime;
-extern rwlock_t xtime_lock;
+extern frlock_t xtime_lock;
 
 static inline unsigned long get_seconds(void)
 { 
diff -puN kernel/time.c~frlock-xtime kernel/time.c
--- linux-hype/kernel/time.c~frlock-xtime	2003-01-25 18:11:45.000000000 -0800
+++ linux-hype-akpm/kernel/time.c	2003-01-25 18:11:45.000000000 -0800
@@ -27,7 +27,6 @@
 #include <linux/timex.h>
 #include <linux/errno.h>
 #include <linux/smp_lock.h>
-
 #include <asm/uaccess.h>
 
 /* 
@@ -38,7 +37,7 @@ struct timezone sys_tz;
 
 /* The xtime_lock is not only serializing the xtime read/writes but it's also
    serializing all accesses to the global NTP variables now. */
-extern rwlock_t xtime_lock;
+extern frlock_t xtime_lock;
 extern unsigned long last_time_offset;
 
 #if !defined(__alpha__) && !defined(__ia64__)
@@ -80,7 +79,7 @@ asmlinkage long sys_stime(int * tptr)
 		return -EPERM;
 	if (get_user(value, tptr))
 		return -EFAULT;
-	write_lock_irq(&xtime_lock);
+	fr_write_lock_irq(&xtime_lock);
 	xtime.tv_sec = value;
 	xtime.tv_nsec = 0;
 	last_time_offset = 0;
@@ -88,7 +87,7 @@ asmlinkage long sys_stime(int * tptr)
 	time_status |= STA_UNSYNC;
 	time_maxerror = NTP_PHASE_LIMIT;
 	time_esterror = NTP_PHASE_LIMIT;
-	write_unlock_irq(&xtime_lock);
+	fr_write_unlock_irq(&xtime_lock);
 	return 0;
 }
 
@@ -96,13 +95,13 @@ asmlinkage long sys_stime(int * tptr)
 
 asmlinkage long sys_gettimeofday(struct timeval *tv, struct timezone *tz)
 {
-	if (tv) {
+	if (likely(tv != NULL)) {
 		struct timeval ktv;
 		do_gettimeofday(&ktv);
 		if (copy_to_user(tv, &ktv, sizeof(ktv)))
 			return -EFAULT;
 	}
-	if (tz) {
+	if (unlikely(tz != NULL)) {
 		if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
 			return -EFAULT;
 	}
@@ -127,10 +126,10 @@ asmlinkage long sys_gettimeofday(struct 
  */
 inline static void warp_clock(void)
 {
-	write_lock_irq(&xtime_lock);
+	fr_write_lock_irq(&xtime_lock);
 	xtime.tv_sec += sys_tz.tz_minuteswest * 60;
 	last_time_offset = 0;
-	write_unlock_irq(&xtime_lock);
+	fr_write_unlock_irq(&xtime_lock);
 }
 
 /*
@@ -235,7 +234,7 @@ int do_adjtimex(struct timex *txc)
 		    txc->tick > 1100000/USER_HZ)
 			return -EINVAL;
 
-	write_lock_irq(&xtime_lock);
+	fr_write_lock_irq(&xtime_lock);
 	result = time_state;	/* mostly `TIME_OK' */
 
 	/* Save for later - semantics of adjtime is to return old value */
@@ -386,7 +385,7 @@ leave:	if ((time_status & (STA_UNSYNC|ST
 	txc->errcnt	   = pps_errcnt;
 	txc->stbcnt	   = pps_stbcnt;
 	last_time_offset = 0;
-	write_unlock_irq(&xtime_lock);
+	fr_write_unlock_irq(&xtime_lock);
 	do_gettimeofday(&txc->time);
 	return(result);
 }
@@ -409,9 +408,13 @@ asmlinkage long sys_adjtimex(struct time
 struct timespec current_kernel_time(void)
 {
         struct timespec now;
-        unsigned long flags;
-        read_lock_irqsave(&xtime_lock,flags);
-	now = xtime;
-        read_unlock_irqrestore(&xtime_lock,flags);
+        unsigned long seq;
+
+	do {
+		seq = fr_read_begin(&xtime_lock);
+
+		now = xtime;
+	} while (seq != fr_read_end(&xtime_lock));
+
 	return now; 
 }
diff -puN kernel/timer.c~frlock-xtime kernel/timer.c
--- linux-hype/kernel/timer.c~frlock-xtime	2003-01-25 18:11:45.000000000 -0800
+++ linux-hype-akpm/kernel/timer.c	2003-01-25 18:11:45.000000000 -0800
@@ -758,7 +758,7 @@ unsigned long wall_jiffies;
  * This read-write spinlock protects us from races in SMP while
  * playing with xtime and avenrun.
  */
-rwlock_t xtime_lock __cacheline_aligned_in_smp = RW_LOCK_UNLOCKED;
+frlock_t xtime_lock __cacheline_aligned_in_smp = FR_LOCK_UNLOCKED;
 unsigned long last_time_offset;
 
 /*
@@ -798,8 +798,7 @@ static inline void update_times(void)
 }
   
 /*
- * The 64-bit jiffies value is not atomic - you MUST NOT read it
- * without holding read_lock_irq(&xtime_lock).
+ * The 64-bit jiffies value is not atomic.
  * jiffies is defined in the linker script...
  */
 
@@ -1087,18 +1086,21 @@ asmlinkage long sys_sysinfo(struct sysin
 	struct sysinfo val;
 	unsigned long mem_total, sav_total;
 	unsigned int mem_unit, bitcount;
+	unsigned long seq;
 
 	memset((char *)&val, 0, sizeof(struct sysinfo));
 
-	read_lock_irq(&xtime_lock);
-	val.uptime = jiffies / HZ;
+	do {
+		seq = fr_read_begin(&xtime_lock);
 
-	val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
-	val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
-	val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+		val.uptime = jiffies / HZ;
 
-	val.procs = nr_threads;
-	read_unlock_irq(&xtime_lock);
+		val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
+		val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
+		val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+
+		val.procs = nr_threads;
+	} while (seq != fr_read_end(&xtime_lock));
 
 	si_meminfo(&val);
 	si_swapinfo(&val);
_
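The arch do_gettimeofday() implementations would want the same
conversion on their read side; as a sketch only (not part of this
patch), the i386 version would come out looking roughly like this, with
do_gettimeoffset() being the existing arch-specific helper:

	void do_gettimeofday(struct timeval *tv)
	{
		unsigned seq;
		unsigned long sec, usec;

		do {
			seq = fr_read_begin(&xtime_lock);
			usec = do_gettimeoffset();	/* arch-specific */
			sec = xtime.tv_sec;
			usec += xtime.tv_nsec / 1000;
		} while (seq != fr_read_end(&xtime_lock));

		while (usec >= 1000000) {
			usec -= 1000000;
			sec++;
		}

		tv->tv_sec = sec;
		tv->tv_usec = usec;
	}

Note that the reader no longer needs to disable interrupts at all.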