Re: [Ext2-devel] [RFC] probable bug in current ext3/jbd

Alex Tomas (bzzz@tmi.comex.ru)
Fri, 23 May 2003 11:08:44 +0000


hi!

here is a small patch intended to fix the race with b_committed_data.
also, as Andrew asked, I introduced a new bit-based spinlock and
converted pte_chain_lock()/pte_chain_unlock() to use it.
the patch has been tested with dbench 1/2/4/6/8/16/24 clients.
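
to make the race concrete: the commit path frees the committed-data
copy while the allocator may still be scanning it. sketched from the
two paths the patch touches (the function names are from the patch,
the interleaving itself is illustrative):

	CPU 0: journal_commit_transaction()     CPU 1: ext3_test_allocatable()
	-----------------------------------     ------------------------------
	                                        if (bh2jh(bh)->b_committed_data)
	kfree(jh->b_committed_data);
	jh->b_committed_data = NULL;
	                                        return !ext3_test_bit(nr,
	                                            bh2jh(bh)->b_committed_data);
	                                        /* scans freed memory */

with jbd_lock_bh() taken around both sides, the kfree()/reassignment
and the bitmap scan can no longer overlap.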

with best regards, Alex

diff -puNr linux-2.5.69-mm6/include/linux/rmap-locking.h b_commited_data-race/include/linux/rmap-locking.h
--- linux-2.5.69-mm6/include/linux/rmap-locking.h Fri May 16 17:59:38 2003
+++ b_commited_data-race/include/linux/rmap-locking.h Fri May 23 10:23:44 2003
@@ -10,32 +10,8 @@
struct pte_chain;
extern kmem_cache_t *pte_chain_cache;

-static inline void pte_chain_lock(struct page *page)
-{
-	/*
-	 * Assuming the lock is uncontended, this never enters
-	 * the body of the outer loop. If it is contended, then
-	 * within the inner loop a non-atomic test is used to
-	 * busywait with less bus contention for a good time to
-	 * attempt to acquire the lock bit.
-	 */
-	preempt_disable();
-#ifdef CONFIG_SMP
-	while (test_and_set_bit(PG_chainlock, &page->flags)) {
-		while (test_bit(PG_chainlock, &page->flags))
-			cpu_relax();
-	}
-#endif
-}
-
-static inline void pte_chain_unlock(struct page *page)
-{
-#ifdef CONFIG_SMP
-	smp_mb__before_clear_bit();
-	clear_bit(PG_chainlock, &page->flags);
-#endif
-	preempt_enable();
-}
+#define pte_chain_lock(page) bb_spin_lock(PG_chainlock, &page->flags)
+#define pte_chain_unlock(page) bb_spin_unlock(PG_chainlock, &page->flags)

struct pte_chain *pte_chain_alloc(int gfp_flags);
void __pte_chain_free(struct pte_chain *pte_chain);
diff -puNr linux-2.5.69-mm6/include/linux/jbd.h b_commited_data-race/include/linux/jbd.h
--- linux-2.5.69-mm6/include/linux/jbd.h Fri May 16 17:59:41 2003
+++ b_commited_data-race/include/linux/jbd.h Fri May 23 10:28:17 2003
@@ -292,6 +292,7 @@ enum jbd_state_bits {
	BH_Revoked,		/* Has been revoked from the log */
	BH_RevokeValid,		/* Revoked flag is valid */
	BH_JBDDirty,		/* Is dirty but journaled */
+	BH_JLock,		/* Buffer is locked for short time */
};

BUFFER_FNS(JBD, jbd)
@@ -308,6 +309,9 @@ static inline struct journal_head *bh2jh
{
	return bh->b_private;
}
+
+#define jbd_lock_bh(bh) bb_spin_lock(BH_JLock, &bh->b_state)
+#define jbd_unlock_bh(bh) bb_spin_unlock(BH_JLock, &bh->b_state)

#define HAVE_JOURNAL_CALLBACK_STATUS
/**
diff -puNr linux-2.5.69-mm6/fs/jbd/commit.c b_commited_data-race/fs/jbd/commit.c
--- linux-2.5.69-mm6/fs/jbd/commit.c Fri May 16 18:03:20 2003
+++ b_commited_data-race/fs/jbd/commit.c Fri May 23 10:29:20 2003
@@ -686,12 +686,14 @@ skip_commit: /* The journal should be un
		 * Otherwise, we can just throw away the frozen data now.
		 */
		if (jh->b_committed_data) {
+			jbd_lock_bh(jh2bh(jh));
			kfree(jh->b_committed_data);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
			}
+			jbd_unlock_bh(jh2bh(jh));
		} else if (jh->b_frozen_data) {
			kfree(jh->b_frozen_data);
			jh->b_frozen_data = NULL;
diff -puNr linux-2.5.69-mm6/fs/ext3/balloc.c b_commited_data-race/fs/ext3/balloc.c
--- linux-2.5.69-mm6/fs/ext3/balloc.c Sat May 17 17:52:42 2003
+++ b_commited_data-race/fs/ext3/balloc.c Fri May 23 10:55:55 2003
@@ -185,11 +185,14 @@ do_more:
	if (err)
		goto error_return;

+	jbd_lock_bh(bitmap_bh);
+
	for (i = 0; i < count; i++) {
		/*
		 * An HJ special. This is expensive...
		 */
#ifdef CONFIG_JBD_DEBUG
+		jbd_unlock_bh(bitmap_bh);
		{
			struct buffer_head *debug_bh;
			debug_bh = sb_find_get_block(sb, block + i);
@@ -202,6 +205,7 @@ do_more:
				__brelse(debug_bh);
			}
		}
+		jbd_lock_bh(bitmap_bh);
#endif
		/* @@@ This prevents newly-allocated data from being
		 * freed and then reallocated within the same
@@ -243,6 +247,7 @@ do_more:
			dquot_freed_blocks++;
		}
	}
+	jbd_unlock_bh(bitmap_bh);

	spin_lock(sb_bgl_lock(sbi, block_group));
	gdp->bg_free_blocks_count =
@@ -289,11 +294,12 @@ error_return:
 * data-writes at some point, and disable it for metadata allocations or
 * sync-data inodes.
 */
-static int ext3_test_allocatable(int nr, struct buffer_head *bh)
+static inline int ext3_test_allocatable(int nr, struct buffer_head *bh,
+					int have_access)
{
	if (ext3_test_bit(nr, bh->b_data))
		return 0;
-	if (!buffer_jbd(bh) || !bh2jh(bh)->b_committed_data)
+	if (!have_access || !buffer_jbd(bh) || !bh2jh(bh)->b_committed_data)
		return 1;
	return !ext3_test_bit(nr, bh2jh(bh)->b_committed_data);
}
@@ -305,8 +311,8 @@ static int ext3_test_allocatable(int nr,
 * the initial goal; then for a free byte somewhere in the bitmap; then
 * for any free bit in the bitmap.
 */
-static int find_next_usable_block(int start,
-		struct buffer_head *bh, int maxblocks)
+static int find_next_usable_block(int start, struct buffer_head *bh,
+				  int maxblocks, int have_access)
{
	int here, next;
	char *p, *r;
@@ -322,7 +328,8 @@ static int find_next_usable_block(int st
		 */
		int end_goal = (start + 63) & ~63;
		here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
-		if (here < end_goal && ext3_test_allocatable(here, bh))
+		if (here < end_goal &&
+		    ext3_test_allocatable(here, bh, have_access))
			return here;

		ext3_debug ("Bit not found near goal\n");
@@ -345,7 +352,7 @@ static int find_next_usable_block(int st
	r = memscan(p, 0, (maxblocks - here + 7) >> 3);
	next = (r - ((char *) bh->b_data)) << 3;

-	if (next < maxblocks && ext3_test_allocatable(next, bh))
+	if (next < maxblocks && ext3_test_allocatable(next, bh, have_access))
		return next;

	/* The bitmap search --- search forward alternately
@@ -357,13 +364,13 @@ static int find_next_usable_block(int st
					maxblocks, here);
		if (next >= maxblocks)
			return -1;
-		if (ext3_test_allocatable(next, bh))
+		if (ext3_test_allocatable(next, bh, have_access))
			return next;

-		J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data);
-		here = ext3_find_next_zero_bit
-			((unsigned long *) bh2jh(bh)->b_committed_data,
-			 maxblocks, next);
+		if (have_access)
+			here = ext3_find_next_zero_bit
+				((unsigned long *) bh2jh(bh)->b_committed_data,
+				 maxblocks, next);
	}
	return -1;
}
@@ -402,17 +409,18 @@ ext3_try_to_allocate(struct super_block

	*errp = 0;

-	if (goal >= 0 && ext3_test_allocatable(goal, bitmap_bh))
+	if (goal >= 0 && ext3_test_allocatable(goal, bitmap_bh, 0))
		goto got;

repeat:
	goal = find_next_usable_block(goal, bitmap_bh,
-			EXT3_BLOCKS_PER_GROUP(sb));
+			EXT3_BLOCKS_PER_GROUP(sb), have_access);
	if (goal < 0)
		goto fail;

	for (i = 0;
-	     i < 7 && goal > 0 && ext3_test_allocatable(goal - 1, bitmap_bh);
+	     i < 7 && goal > 0 &&
+	     ext3_test_allocatable(goal - 1, bitmap_bh, have_access);
	     i++, goal--);

got:
@@ -429,6 +437,7 @@ got:
			*errp = fatal;
			goto fail;
		}
+		jbd_lock_bh(bitmap_bh);
		have_access = 1;
	}

@@ -444,6 +453,7 @@ got:
	}

	BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for bitmap block");
+	jbd_unlock_bh(bitmap_bh);
	fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
	if (fatal) {
		*errp = fatal;
@@ -454,6 +464,7 @@ got:
fail:
	if (have_access) {
		BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
+		jbd_unlock_bh(bitmap_bh);
		ext3_journal_release_buffer(handle, bitmap_bh);
	}
	return -1;
diff -puNr linux-2.5.69-mm6/include/linux/spinlock.h b_commited_data-race/include/linux/spinlock.h
--- linux-2.5.69-mm6/include/linux/spinlock.h Fri May 16 18:03:21 2003
+++ b_commited_data-race/include/linux/spinlock.h Fri May 23 10:45:58 2003
@@ -540,4 +540,37 @@ do { \
extern int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
#endif

+/*
+ * bit-based spin_lock()
+ */
+static inline void bb_spin_lock(int bitnum, unsigned long *addr)
+{
+	/*
+	 * Assuming the lock is uncontended, this never enters
+	 * the body of the outer loop. If it is contended, then
+	 * within the inner loop a non-atomic test is used to
+	 * busywait with less bus contention for a good time to
+	 * attempt to acquire the lock bit.
+	 */
+	preempt_disable();
+#ifdef CONFIG_SMP
+	while (test_and_set_bit(bitnum, addr)) {
+		while (test_bit(bitnum, addr))
+			cpu_relax();
+	}
+#endif
+}
+
+/*
+ * bit-based spin_unlock()
+ */
+static inline void bb_spin_unlock(int bitnum, unsigned long *addr)
+{
+#ifdef CONFIG_SMP
+	smp_mb__before_clear_bit();
+	clear_bit(bitnum, addr);
+#endif
+	preempt_enable();
+}
+
#endif /* __LINUX_SPINLOCK_H */
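
for anyone who wants to use the new primitive elsewhere:
bb_spin_lock()/bb_spin_unlock() take a bit number and a word, so any
otherwise-unused flag bit can serve as a lock. a minimal illustrative
example (MY_LOCK_BIT and struct my_thing are made-up names, not part
of the patch):

	/* illustrative only -- any spare bit in ->flags works, provided
	 * all other updates to that word are atomic bitops */
	#define MY_LOCK_BIT	20

	struct my_thing {
		unsigned long	flags;
		int		count;	/* protected by MY_LOCK_BIT */
	};

	static void my_thing_inc(struct my_thing *t)
	{
		/* disables preemption, then spins on the bit */
		bb_spin_lock(MY_LOCK_BIT, &t->flags);
		t->count++;
		/* barrier + clear_bit, then preempt_enable */
		bb_spin_unlock(MY_LOCK_BIT, &t->flags);
	}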
