[PATCH] tmpfs 3/9 remove info->sem

Hugh Dickins (hugh@veritas.com)
Tue, 1 Oct 2002 22:47:44 +0100 (BST)


Between inode->i_sem and info->lock comes info->sem; but it doesn't
guard thoroughly against the difficult races (truncate during read),
and it serializes reads from tmpfs, which other filesystems do not.
I'd prefer to work with just i_sem and info->lock, backtracking when
necessary (when another task allocates a block or metablock at the
same time), as sketched below.

(I am not satisfied with the locked setting of next_index at the
start of shmem_getpage_locked: it's one lock hold too many, and it
doesn't really fix the races against truncate any better than
before; another patch in a later batch will resolve that.)
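
The backtracking works as in shmem_swp_alloc() below: a task must not
sleep in get_zeroed_page() while holding info->lock, so it drops the
lock, allocates, retakes the lock and repeats the lookup; if another
task populated the slot meanwhile, the spare page is simply freed.
For illustration only (not part of the patch), a minimal userspace
model of that pattern, with a pthread spinlock and calloc() standing
in for info->lock and get_zeroed_page(), and all names invented:

#include <pthread.h>
#include <stdlib.h>

static pthread_spinlock_t lock;		/* stands in for info->lock */
static void *slot;			/* a block pointer to populate */

/* Return the slot if populated; else consume *page to populate it,
 * or return NULL to ask the caller to allocate one. */
static void *slot_lookup(void **page)
{
	if (!slot) {
		if (!page || !*page)
			return NULL;	/* need a page */
		slot = *page;
		*page = NULL;		/* consumed: caller won't free it */
	}
	return slot;
}

static void *slot_alloc(void)
{
	void *page = NULL;
	void *res;

	pthread_spin_lock(&lock);
	while (!(res = slot_lookup(&page))) {
		/* may not sleep under a spinlock: drop, allocate, retake */
		pthread_spin_unlock(&lock);
		page = calloc(1, 4096);
		pthread_spin_lock(&lock);
		if (!page)
			break;		/* out of memory */
	}
	pthread_spin_unlock(&lock);
	free(page);	/* non-NULL if another task got in first */
	return res;
}

int main(void)
{
	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
	return slot_alloc()? 0: 1;
}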

--- tmpfs2/include/linux/shmem_fs.h Thu Aug 1 23:58:33 2002
+++ tmpfs3/include/linux/shmem_fs.h Tue Oct 1 19:49:04 2002
@@ -11,7 +11,6 @@

struct shmem_inode_info {
spinlock_t lock;
- struct semaphore sem;
unsigned long next_index;
swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */
void **i_indirect; /* indirect blocks */
--- tmpfs2/mm/shmem.c Tue Oct 1 19:48:59 2002
+++ tmpfs3/mm/shmem.c Tue Oct 1 19:49:04 2002
@@ -111,11 +111,9 @@
* @page: optional page to add to the structure. Has to be preset to
* all zeros
*
- * If there is no space allocated yet it will return -ENOMEM when
- * page == 0 else it will use the page for the needed block.
- *
- * returns -EFBIG if the index is too big.
- *
+ * If there is no space allocated yet it will return NULL when
+ * page is 0, else it will use the page for the needed block,
+ * setting it to 0 on return to indicate that it has been used.
*
* The swap vector is organized the following way:
*
@@ -143,70 +141,80 @@
* +-> 48-51
* +-> 52-55
*/
-static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long index, unsigned long page)
+static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, unsigned long *page)
{
unsigned long offset;
void **dir;

+ if (index >= info->next_index)
+ return NULL;
if (index < SHMEM_NR_DIRECT)
return info->i_direct+index;
+ if (!info->i_indirect) {
+ if (page) {
+ info->i_indirect = (void *) *page;
+ *page = 0;
+ }
+ return NULL; /* need another page */
+ }

index -= SHMEM_NR_DIRECT;
offset = index % ENTRIES_PER_PAGE;
index /= ENTRIES_PER_PAGE;
-
- if (!info->i_indirect) {
- info->i_indirect = (void *) page;
- return ERR_PTR(-ENOMEM);
- }
-
dir = info->i_indirect + index;
+
if (index >= ENTRIES_PER_PAGE/2) {
index -= ENTRIES_PER_PAGE/2;
dir = info->i_indirect + ENTRIES_PER_PAGE/2
+ index/ENTRIES_PER_PAGE;
index %= ENTRIES_PER_PAGE;
-
- if(!*dir) {
- *dir = (void *) page;
- /* We return since we will need another page
- in the next step */
- return ERR_PTR(-ENOMEM);
+ if (!*dir) {
+ if (page) {
+ *dir = (void *) *page;
+ *page = 0;
+ }
+ return NULL; /* need another page */
}
dir = ((void **)*dir) + index;
}
+
if (!*dir) {
- if (!page)
- return ERR_PTR(-ENOMEM);
- *dir = (void *)page;
+ if (!page || !*page)
+ return NULL; /* need a page */
+ *dir = (void *) *page;
+ *page = 0;
}
return ((swp_entry_t *)*dir) + offset;
}

/*
- * shmem_alloc_entry - get the position of the swap entry for the
- * page. If it does not exist allocate the entry
+ * shmem_swp_alloc - get the position of the swap entry for the page.
+ * If it does not exist allocate the entry.
*
* @info: info structure for the inode
* @index: index of the page to find
*/
-static inline swp_entry_t * shmem_alloc_entry (struct shmem_inode_info *info, unsigned long index)
+static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index)
{
unsigned long page = 0;
- swp_entry_t * res;
-
- if (index >= SHMEM_MAX_INDEX)
- return ERR_PTR(-EFBIG);
-
- if (info->next_index <= index)
- info->next_index = index + 1;
+ swp_entry_t *entry;

- while ((res = shmem_swp_entry(info,index,page)) == ERR_PTR(-ENOMEM)) {
+ while (!(entry = shmem_swp_entry(info, index, &page))) {
+ if (index >= info->next_index) {
+ entry = ERR_PTR(-EFAULT);
+ break;
+ }
+ spin_unlock(&info->lock);
page = get_zeroed_page(GFP_USER);
+ spin_lock(&info->lock);
if (!page)
- break;
+ return ERR_PTR(-ENOMEM);
+ }
+ if (page) {
+ /* another task gave its page, or truncated the file */
+ free_page(page);
}
- return res;
+ return entry;
}

/*
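
For illustration, outside the patch: the arithmetic at the top of
shmem_swp_entry() implements the layout pictured in the comment
above. Below, a userspace model of just that index mapping; the
small values of SHMEM_NR_DIRECT and ENTRIES_PER_PAGE are chosen to
match the diagram, not the kernel's PAGE_SIZE/sizeof(swp_entry_t):

#include <stdio.h>

#define SHMEM_NR_DIRECT		16
#define ENTRIES_PER_PAGE	4

static void locate(unsigned long index)
{
	unsigned long offset;

	if (index < SHMEM_NR_DIRECT) {
		printf("i_direct[%lu]\n", index);
		return;
	}
	index -= SHMEM_NR_DIRECT;
	offset = index % ENTRIES_PER_PAGE;
	index /= ENTRIES_PER_PAGE;
	if (index < ENTRIES_PER_PAGE/2) {
		/* first half of i_indirect points straight at entry pages */
		printf("i_indirect[%lu], entry %lu\n", index, offset);
	} else {
		/* second half points at pages of pointers to entry pages */
		index -= ENTRIES_PER_PAGE/2;
		printf("i_indirect[%lu] -> page %lu, entry %lu\n",
			ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE,
			index % ENTRIES_PER_PAGE, offset);
	}
}

int main(void)
{
	locate(5);	/* i_direct[5] */
	locate(18);	/* the "16-19" page, entry 2 */
	locate(50);	/* dir3's "48-51" page, entry 2 */
	return 0;
}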
@@ -330,17 +338,15 @@
unsigned long freed = 0;
struct shmem_inode_info * info = SHMEM_I(inode);

- down(&info->sem);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
- spin_lock (&info->lock);
index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ spin_lock (&info->lock);
while (index < info->next_index)
freed += shmem_truncate_indirect(info, index);

info->swapped -= freed;
shmem_recalc_inode(inode);
spin_unlock (&info->lock);
- up(&info->sem);
}

static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
@@ -436,8 +442,8 @@

for (idx = SHMEM_NR_DIRECT; idx < info->next_index;
idx += ENTRIES_PER_PAGE) {
- ptr = shmem_swp_entry(info, idx, 0);
- if (IS_ERR(ptr))
+ ptr = shmem_swp_entry(info, idx, NULL);
+ if (!ptr)
continue;
offset = info->next_index - idx;
if (offset > ENTRIES_PER_PAGE)
@@ -519,10 +525,10 @@
return fail_writepage(page);

spin_lock(&info->lock);
- entry = shmem_swp_entry(info, index, 0);
- if (IS_ERR(entry)) /* this had been allocated on page allocation */
- BUG();
shmem_recalc_inode(inode);
+ entry = shmem_swp_entry(info, index, NULL);
+ if (!entry)
+ BUG();
if (entry->val)
BUG();

@@ -570,61 +576,68 @@
struct shmem_sb_info *sbinfo;
struct page *page;
swp_entry_t *entry;
- int error;
+ swp_entry_t swap;
+ int error = 0;
+
+ if (idx >= SHMEM_MAX_INDEX)
+ return ERR_PTR(-EFBIG);
+
+ /*
+ * When writing, i_sem is held against truncation and other
+ * writing, so next_index will remain as set here; but when
+ * reading, idx must always be checked against next_index
+ * after sleeping, lest truncation occurred meanwhile.
+ */
+ spin_lock(&info->lock);
+ if (info->next_index <= idx)
+ info->next_index = idx + 1;
+ spin_unlock(&info->lock);

repeat:
page = find_lock_page(mapping, idx);
if (page)
return page;

- entry = shmem_alloc_entry (info, idx);
- if (IS_ERR(entry))
+ spin_lock(&info->lock);
+ shmem_recalc_inode(inode);
+ entry = shmem_swp_alloc(info, idx);
+ if (IS_ERR(entry)) {
+ spin_unlock(&info->lock);
return (void *)entry;
-
- spin_lock (&info->lock);
-
- /* The shmem_alloc_entry() call may have blocked, and
- * shmem_writepage may have been moving a page between the page
- * cache and swap cache. We need to recheck the page cache
- * under the protection of the info->lock spinlock. */
-
- page = find_get_page(mapping, idx);
- if (page) {
- if (TestSetPageLocked(page))
- goto wait_retry;
- spin_unlock (&info->lock);
- return page;
}
-
- shmem_recalc_inode(inode);
- if (entry->val) {
+ swap = *entry;
+
+ if (swap.val) {
/* Look it up and read it in.. */
- page = lookup_swap_cache(*entry);
+ page = lookup_swap_cache(swap);
if (!page) {
- swp_entry_t swap = *entry;
- spin_unlock (&info->lock);
- swapin_readahead(*entry);
- page = read_swap_cache_async(*entry);
+ spin_unlock(&info->lock);
+ swapin_readahead(swap);
+ page = read_swap_cache_async(swap);
if (!page) {
- if (entry->val != swap.val)
- goto repeat;
- return ERR_PTR(-ENOMEM);
+ spin_lock(&info->lock);
+ entry = shmem_swp_alloc(info, idx);
+ if (IS_ERR(entry))
+ error = PTR_ERR(entry);
+ else if (entry->val == swap.val)
+ error = -ENOMEM;
+ spin_unlock(&info->lock);
+ if (error)
+ return ERR_PTR(error);
+ goto repeat;
}
wait_on_page_locked(page);
- if (!PageUptodate(page) && entry->val == swap.val) {
- page_cache_release(page);
- return ERR_PTR(-EIO);
- }
-
- /* Too bad we can't trust this page, because we
- * dropped the info->lock spinlock */
page_cache_release(page);
goto repeat;
}

/* We have to do this with page locked to prevent races */
- if (TestSetPageLocked(page))
- goto wait_retry;
+ if (TestSetPageLocked(page)) {
+ spin_unlock(&info->lock);
+ wait_on_page_locked(page);
+ page_cache_release(page);
+ goto repeat;
+ }
if (PageWriteback(page)) {
spin_unlock(&info->lock);
wait_on_page_writeback(page);
@@ -632,42 +645,55 @@
page_cache_release(page);
goto repeat;
}
- error = move_from_swap_cache(page, idx, mapping);
- if (error < 0) {
+
+ error = PageUptodate(page)?
+ move_from_swap_cache(page, idx, mapping): -EIO;
+ if (error) {
spin_unlock(&info->lock);
unlock_page(page);
page_cache_release(page);
return ERR_PTR(error);
}

- swap_free(*entry);
*entry = (swp_entry_t) {0};
info->swapped--;
spin_unlock (&info->lock);
+ swap_free(swap);
} else {
+ spin_unlock(&info->lock);
sbinfo = SHMEM_SB(inode->i_sb);
- spin_unlock (&info->lock);
- spin_lock (&sbinfo->stat_lock);
- if (sbinfo->free_blocks == 0)
- goto no_space;
+ spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->free_blocks == 0) {
+ spin_unlock(&sbinfo->stat_lock);
+ return ERR_PTR(-ENOSPC);
+ }
sbinfo->free_blocks--;
- spin_unlock (&sbinfo->stat_lock);
+ spin_unlock(&sbinfo->stat_lock);

- /* Ok, get a new page. We don't have to worry about the
- * info->lock spinlock here: we cannot race against
- * shm_writepage because we have already verified that
- * there is no page present either in memory or in the
- * swap cache, so we are guaranteed to be populating a
- * new shm entry. The inode semaphore we already hold
- * is enough to make this atomic. */
page = page_cache_alloc(mapping);
- if (!page)
- goto no_mem;
- error = add_to_page_cache_lru(page, mapping, idx);
- if (error < 0) {
+ if (!page) {
+ spin_lock(&sbinfo->stat_lock);
+ sbinfo->free_blocks++;
+ spin_unlock(&sbinfo->stat_lock);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ spin_lock(&info->lock);
+ entry = shmem_swp_alloc(info, idx);
+ if (IS_ERR(entry))
+ error = PTR_ERR(entry);
+ if (error || entry->val ||
+ add_to_page_cache_lru(page, mapping, idx) < 0) {
+ spin_unlock(&info->lock);
page_cache_release(page);
- goto no_mem;
+ spin_lock(&sbinfo->stat_lock);
+ sbinfo->free_blocks++;
+ spin_unlock(&sbinfo->stat_lock);
+ if (error)
+ return ERR_PTR(error);
+ goto repeat;
}
+ spin_unlock(&info->lock);
clear_highpage(page);
inode->i_blocks += BLOCKS_PER_PAGE;
}
@@ -675,22 +701,6 @@
/* We have the page */
SetPageUptodate(page);
return page;
-
-no_mem:
- spin_lock(&sbinfo->stat_lock);
- sbinfo->free_blocks++;
- spin_unlock(&sbinfo->stat_lock);
- return ERR_PTR(-ENOMEM);
-
-no_space:
- spin_unlock (&sbinfo->stat_lock);
- return ERR_PTR(-ENOSPC);
-
-wait_retry:
- spin_unlock(&info->lock);
- wait_on_page_locked(page);
- page_cache_release(page);
- goto repeat;
}

static int shmem_getpage(struct inode * inode, unsigned long idx, struct page **ptr)
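
For illustration, outside the patch: the free_blocks handling above
follows a reserve-then-rollback discipline, debiting the count under
sbinfo->stat_lock before the page allocation and crediting it back
on every failure path, however late it fails. A userspace sketch of
that discipline, with a mutex and calloc() as stand-ins and all
names invented:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long free_blocks = 1024;

static void *alloc_block_page(void)
{
	void *page;

	pthread_mutex_lock(&stat_lock);
	if (free_blocks == 0) {
		pthread_mutex_unlock(&stat_lock);
		return NULL;			/* -ENOSPC */
	}
	free_blocks--;				/* reserve first */
	pthread_mutex_unlock(&stat_lock);

	page = calloc(1, 4096);			/* may sleep or fail */
	if (!page) {
		pthread_mutex_lock(&stat_lock);
		free_blocks++;			/* roll the reservation back */
		pthread_mutex_unlock(&stat_lock);
	}
	return page;
}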
@@ -698,7 +708,6 @@
struct shmem_inode_info *info = SHMEM_I(inode);
int error;

- down (&info->sem);
*ptr = ERR_PTR(-EFAULT);
if (inode->i_size <= (loff_t) idx * PAGE_CACHE_SIZE)
goto failed;
@@ -708,10 +717,8 @@
goto failed;

unlock_page(*ptr);
- up (&info->sem);
return 0;
failed:
- up (&info->sem);
error = PTR_ERR(*ptr);
*ptr = NOPAGE_SIGBUS;
if (error == -ENOMEM)
@@ -734,8 +741,8 @@
spin_lock(&info->lock);
page = find_get_page(inode->i_mapping, idx);
if (!page) {
- entry = shmem_swp_entry(info, idx, 0);
- if (!IS_ERR(entry))
+ entry = shmem_swp_entry(info, idx, NULL);
+ if (entry)
swap = *entry;
}
spin_unlock(&info->lock);
@@ -814,12 +821,8 @@
inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
info = SHMEM_I(inode);
- spin_lock_init (&info->lock);
- sema_init (&info->sem, 1);
- info->next_index = 0;
- memset (info->i_direct, 0, sizeof(info->i_direct));
- info->i_indirect = NULL;
- info->swapped = 0;
+ memset(info, 0, (char *)inode - (char *)info);
+ spin_lock_init(&info->lock);
info->flags = VM_ACCOUNT;
switch (mode & S_IFMT) {
default:
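
For illustration, outside the patch: the memset() above relies on
the tmpfs-private fields of shmem_inode_info lying immediately
before the embedded VFS inode, so the byte span from info up to
inode covers exactly the fields previously cleared one by one. A
minimal userspace model of that trick, with invented field names:

#include <string.h>
#include <assert.h>

struct vfs_inode { unsigned long i_size; };

struct info {
	unsigned long next_index;
	void *i_indirect;
	unsigned long swapped;
	struct vfs_inode vfs_inode;	/* must remain the last member */
};

int main(void)
{
	struct info info = { .next_index = 7, .vfs_inode.i_size = 42 };
	struct vfs_inode *inode = &info.vfs_inode;

	/* zero everything up to, but not including, the embedded inode */
	memset(&info, 0, (char *)inode - (char *)&info);
	assert(info.next_index == 0 && inode->i_size == 42);
	return 0;
}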
@@ -971,9 +974,7 @@
}

info = SHMEM_I(inode);
- down (&info->sem);
page = shmem_getpage_locked(info, inode, index);
- up (&info->sem);

status = PTR_ERR(page);
if (IS_ERR(page))
@@ -1041,17 +1042,33 @@
end_index = inode->i_size >> PAGE_CACHE_SHIFT;
if (index > end_index)
break;
- nr = PAGE_CACHE_SIZE;
if (index == end_index) {
nr = inode->i_size & ~PAGE_CACHE_MASK;
if (nr <= offset)
break;
}

- nr = nr - offset;
-
- if ((desc->error = shmem_getpage(inode, index, &page)))
+ desc->error = shmem_getpage(inode, index, &page);
+ if (desc->error) {
+ if (desc->error == -EFAULT)
+ desc->error = 0;
break;
+ }
+
+ /*
+ * We must evaluate after, since reads (unlike writes)
+ * are called without i_sem protection against truncate
+ */
+ nr = PAGE_CACHE_SIZE;
+ end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ if (index == end_index) {
+ nr = inode->i_size & ~PAGE_CACHE_MASK;
+ if (nr <= offset) {
+ page_cache_release(page);
+ break;
+ }
+ }
+ nr -= offset;

if (!list_empty(&mapping->i_mmap_shared))
flush_dcache_page(page);
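
For illustration, outside the patch: the re-evaluation above matters
because readers hold no i_sem, so i_size may shrink while
shmem_getpage() sleeps; the number of readable bytes must be
recomputed once the page is in hand, and the page released if it now
lies beyond end of file. A userspace model of that calculation,
assuming 4K pages:

#include <stdio.h>

#define PAGE_CACHE_SHIFT	12
#define PAGE_CACHE_SIZE		(1UL << PAGE_CACHE_SHIFT)
#define PAGE_CACHE_MASK		(~(PAGE_CACHE_SIZE - 1))

/* Bytes readable from page 'index' at 'offset' given current i_size;
 * 0 means the page now lies past EOF and must be released. */
static unsigned long readable(unsigned long long i_size,
			      unsigned long index, unsigned long offset)
{
	unsigned long nr = PAGE_CACHE_SIZE;
	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;

	if (index > end_index)
		return 0;
	if (index == end_index) {
		nr = i_size & ~PAGE_CACHE_MASK;
		if (nr <= offset)
			return 0;
	}
	return nr - offset;
}

int main(void)
{
	/* i_size was three pages when the read began... */
	printf("%lu\n", readable(3 * PAGE_CACHE_SIZE, 1, 100));
	/* ...but a concurrent truncate to 100 bytes shrank it */
	printf("%lu\n", readable(100, 1, 100));
	return 0;
}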
@@ -1279,10 +1296,8 @@
iput(inode);
return -ENOMEM;
}
- down(&info->sem);
page = shmem_getpage_locked(info, inode, 0);
if (IS_ERR(page)) {
- up(&info->sem);
vm_unacct_memory(VM_ACCT(1));
iput(inode);
return PTR_ERR(page);
@@ -1297,7 +1312,6 @@
set_page_dirty(page);
unlock_page(page);
page_cache_release(page);
- up(&info->sem);
}
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
