OK, it's writepage-versus-writepage and/or writepage-versus-truncate. 
There's no locking between the updaters and i_blocks goes negative.
This is good ;)  We have done well.
I went for the new frlocks which I dragged into -mm to fix up the xtime_lock
problems.  It's not 100% appropriate, but it'll do just fine.
Stephen, we can use that frlock in the inode to protect access to the 64-bit
i_size on 32-bit platforms as well.  However in the case of i_size, the
updaters do not need to take the frlock's spinlock, because writers are
serialised inode->i_sem.
So could you please take a look at adding new writer-side primitives to
frlocks to cover this requirement?  See i_size_read/i_size_write from the -aa
kernel:
+static inline void i_size_write(struct inode * inode, loff_t i_size)
+{
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#ifdef __ARCH_HAS_GET_SET_64BIT
+	set_64bit((unsigned long long *) &inode->i_size, (unsigned long long) i_size);
+#else
+	inode->i_size_version2++;
+	wmb();
+	inode->i_size = i_size;
+	wmb();
+	inode->i_size_version1++;
+	wmb(); /* make it visible ASAP */
+#endif
+#elif BITS_PER_LONG==64 || !defined(CONFIG_SMP)
+	inode->i_size = i_size;
+#endif
+}
Something like this might suffice:
diff -puN include/linux/frlock.h~frlock_write_begin include/linux/frlock.h
--- 25/include/linux/frlock.h~frlock_write_begin	2003-01-25 18:59:27.000000000 -0800
+++ 25-akpm/include/linux/frlock.h	2003-01-25 19:01:27.000000000 -0800
@@ -80,6 +80,22 @@ static inline unsigned fr_read_end(frloc
 	return rw->pre_sequence;
 }
 
+static inline unsigned fr_write_begin(frlock_t *rw)
+{
+	unsigned ret = rw->pre_sequence++;
+	wmb();
+	return ret;
+}	
+
+static inline unsigned fr_write_end(frlock_t *rw) 
+{
+	unsgned ret;
+	wmb();
+	ret = ++(rw->post_sequence);
+	wmb();
+	return ret;
+}
+
 /*
  * Possible sw/hw IRQ protected versions of the interfaces.
  */
Thanks.
Since Jan removed the lock_kernel()s in inode_add_bytes() and
inode_sub_bytes(), these functions have been racy.
One problematic workload has been discovered in which concurrent writepage
and truncate on SMP quickly causes i_blocks to go negative.  writepage() does
not take i_sem, and it seems that for ext2, there are no other locks in
force when inode_add_bytes() is called.
Putting the BKL back in there is not acceptable.  To fix this race I have
used the new frlocks.
This is actually fairly pointless, because these fields are write-mostly and
read-rarely.  But we need a spinlock anyway because of the concurrent
modifiers problem.
And we need this frlock in the inode to solve the problem of the nonatomicity
of i_size.  (But some aditional frlock work is needed first: in the case of
i_size we do not need to take the spinlock because of i_sem protection).
The splitting of the used disk space into i_blocks and i_bytes is silly - we
should nuke all that and just have a bare loff_t i_usedbytes.   Later.
 inode.c    |    1 +
 stat.c     |   46 ++++++++++++++++++++++++++++++++++++++++++++++
 linux/fs.h |   52 +++++++++++++++++-----------------------------------
 ksyms.c    |    4 ++++
 4 files changed, 68 insertions(+), 35 deletions(-)
diff -puN include/linux/fs.h~inode-accounting-race-fix include/linux/fs.h
--- linux-hype/include/linux/fs.h~inode-accounting-race-fix	2003-01-25 18:36:25.000000000 -0800
+++ linux-hype-akpm/include/linux/fs.h	2003-01-25 18:36:25.000000000 -0800
@@ -18,6 +18,7 @@
 #include <linux/stat.h>
 #include <linux/cache.h>
 #include <linux/radix-tree.h>
+#include <linux/frlock.h>
 #include <asm/atomic.h>
 
 struct iovec;
@@ -371,9 +372,19 @@ struct inode {
 	struct timespec		i_ctime;
 	unsigned int		i_blkbits;
 	unsigned long		i_blksize;
-	unsigned long		i_blocks;
 	unsigned long		i_version;
+
+	/*
+	 * i_bytes and i_blocks represent the amount of allocated disk space:
+	 * (i_blocks * 512 + i_bytes) (FIXME: just use an loff_t here).  They
+	 * are protected by i_frlock.
+	 */
+	unsigned long		i_blocks;
 	unsigned short          i_bytes;
+
+	/* Protects i_blocks, i_bytes and, later, i_size */
+	frlock_t		i_frlock;
+
 	struct semaphore	i_sem;
 	struct inode_operations	*i_op;
 	struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
@@ -400,7 +411,7 @@ struct inode {
 	void			*i_security;
 	__u32			i_generation;
 	union {
-		void				*generic_ip;
+		void		*generic_ip;
 	} u;
 };
 
@@ -412,39 +423,6 @@ struct fown_struct {
 	void *security;
 };
 
-static inline void inode_add_bytes(struct inode *inode, loff_t bytes)
-{
-	inode->i_blocks += bytes >> 9;
-	bytes &= 511;
-	inode->i_bytes += bytes;
-	if (inode->i_bytes >= 512) {
-		inode->i_blocks++;
-		inode->i_bytes -= 512;
-	}
-}
-
-static inline void inode_sub_bytes(struct inode *inode, loff_t bytes)
-{
-	inode->i_blocks -= bytes >> 9;
-	bytes &= 511;
-	if (inode->i_bytes < bytes) {
-		inode->i_blocks--;
-		inode->i_bytes += 512;
-	}
-	inode->i_bytes -= bytes;
-}
-
-static inline loff_t inode_get_bytes(struct inode *inode)
-{
-	return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes;
-}
-
-static inline void inode_set_bytes(struct inode *inode, loff_t bytes)
-{
-	inode->i_blocks = bytes >> 9;
-	inode->i_bytes = bytes & 511;
-}
-
 /*
  * Track a single file's readahead state
  */
@@ -1277,6 +1255,10 @@ extern int page_symlink(struct inode *in
 extern struct inode_operations page_symlink_inode_operations;
 extern void generic_fillattr(struct inode *, struct kstat *);
 extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+void inode_add_bytes(struct inode *inode, loff_t bytes);
+void inode_sub_bytes(struct inode *inode, loff_t bytes);
+loff_t inode_get_bytes(struct inode *inode);
+void inode_set_bytes(struct inode *inode, loff_t bytes);
 
 extern int vfs_readdir(struct file *, filldir_t, void *);
 
diff -puN fs/stat.c~inode-accounting-race-fix fs/stat.c
--- linux-hype/fs/stat.c~inode-accounting-race-fix	2003-01-25 18:36:25.000000000 -0800
+++ linux-hype-akpm/fs/stat.c	2003-01-25 18:36:25.000000000 -0800
@@ -316,3 +316,49 @@ asmlinkage long sys_fstat64(unsigned lon
 }
 
 #endif /* LFS-64 */
+
+void inode_add_bytes(struct inode *inode, loff_t bytes)
+{
+	fr_write_lock(&inode->i_frlock);
+	inode->i_blocks += bytes >> 9;
+	bytes &= 511;
+	inode->i_bytes += bytes;
+	if (inode->i_bytes >= 512) {
+		inode->i_blocks++;
+		inode->i_bytes -= 512;
+	}
+	fr_write_unlock(&inode->i_frlock);
+}
+
+void inode_sub_bytes(struct inode *inode, loff_t bytes)
+{
+	fr_write_lock(&inode->i_frlock);
+	inode->i_blocks -= bytes >> 9;
+	bytes &= 511;
+	if (inode->i_bytes < bytes) {
+		inode->i_blocks--;
+		inode->i_bytes += 512;
+	}
+	inode->i_bytes -= bytes;
+	fr_write_unlock(&inode->i_frlock);
+}
+
+loff_t inode_get_bytes(struct inode *inode)
+{
+	unsigned seq;
+	loff_t ret;
+
+	do {
+		seq = fr_read_begin(&inode->i_frlock);
+		ret = (((loff_t)inode->i_blocks) << 9) + inode->i_bytes;
+	} while (seq != fr_read_end(&inode->i_frlock));
+	return ret;
+}
+
+void inode_set_bytes(struct inode *inode, loff_t bytes)
+{
+	fr_write_lock(&inode->i_frlock);
+	inode->i_blocks = bytes >> 9;
+	inode->i_bytes = bytes & 511;
+	fr_write_unlock(&inode->i_frlock);
+}
diff -puN kernel/ksyms.c~inode-accounting-race-fix kernel/ksyms.c
--- linux-hype/kernel/ksyms.c~inode-accounting-race-fix	2003-01-25 18:36:25.000000000 -0800
+++ linux-hype-akpm/kernel/ksyms.c	2003-01-25 18:36:25.000000000 -0800
@@ -272,6 +272,10 @@ EXPORT_SYMBOL(vfs_fstat);
 EXPORT_SYMBOL(vfs_stat);
 EXPORT_SYMBOL(vfs_lstat);
 EXPORT_SYMBOL(vfs_getattr);
+EXPORT_SYMBOL(inode_add_bytes);
+EXPORT_SYMBOL(inode_sub_bytes);
+EXPORT_SYMBOL(inode_get_bytes);
+EXPORT_SYMBOL(inode_set_bytes);
 EXPORT_SYMBOL(lock_rename);
 EXPORT_SYMBOL(unlock_rename);
 EXPORT_SYMBOL(generic_read_dir);
diff -puN fs/inode.c~inode-accounting-race-fix fs/inode.c
--- linux-hype/fs/inode.c~inode-accounting-race-fix	2003-01-25 18:36:25.000000000 -0800
+++ linux-hype-akpm/fs/inode.c	2003-01-25 18:36:25.000000000 -0800
@@ -176,6 +176,7 @@ void inode_init_once(struct inode *inode
 	spin_lock_init(&inode->i_data.private_lock);
 	INIT_LIST_HEAD(&inode->i_data.i_mmap);
 	INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
+	frlock_init(&inode->i_frlock);
 }
 
 static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
_
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/