Re: owner field in `struct fs'

Philipp Rumpf (prumpf@uzix.org)
Fri, 30 Jun 2000 18:54:21 -0700


--X1bOJ3K7DJ5YkBrT
Content-Type: text/plain; charset=us-ascii

On Sat, Jun 24, 2000 at 09:13:29PM +0100, Alan Cox wrote:
> Make sure you cover the hard real time task that wont be pre-empted case.

This is the only problem I'm aware of with this patch; I can't see how
the other deadlocks Alan and others described could occur.

Please note that I don't want to revert the changes made to
struct file_operations; the changes don't conflict, and I think there are
other subsystems that'd prefer not using the "increment module refcount
externally" approach.

This is the current version of the grab-all-CPUs patch (against 2.4.0-test2);
it somehow got developed between Rusty Russell, Andrew Morton, and me, though
I suspect my part was to slow everything down by living in an incompatible
timezone. It works by making sure our CPU is the only one executing in
process context before calling cleanup_module.

Note that it currently leaves the [un]lock_kernels in module.c, in addition
to an rw semaphore. This looks somewhat redundant and probably should be
cleaned up.

This means that quite a few module races that existed before now can't
happen:

- MOD_DEC_USE_COUNT is safe now. Before, a CPU might have been stuck after
the MOD_DEC_USE_COUNT that decremented the module use count to 0 and the
return instruction. In theory, the module's code could have been
overwritten by the time the return was actually fetched, resulting in
completely unpredictable behaviour.

- MOD_INC_USE_COUNT is safe. This is equivalent to "try_inc_mod_count never
fails".

- You can sleep in cleanup_module, as long as you verified no code that
would increment the use count can be called.

- The "try to access /dev/hda while ide is unloading" problem doesn't occur.
What will happen is that we don't recognize /dev/hda's major, call modprobe,
and modprobe will sleep until ide has been unloaded.

So, what are module functions allowed to do (assuming you didn't decide to
use struct file_operations-style owner fields) ?

open()-style functions, which leave the module incremented, must do the
MOD_INC_USE_COUNT before they go to sleep. Doing the MOD_INC_USE_COUNT
as the first thing in the function might help readability.

release()-style functions, which decrement the use count, must do the
MOD_DEC_USE_COUNT after they last slept. Doing the MOD_DEC_USE_COUNT
as the last thing in the ffunction might help readability

miscellaneous functions, which don't leave any external references to the
module, must use MOD_(INC|DEC)_USE_COUNT only if they go to sleep, and
only around the sections that sleep. In particular, most of the
miscellaneous functions currently in use don't sleep, and don't have to
use MOD_(INC|DEC)_USE_COUNT.

module_exit() functions can sleep after ensuring no other functions will
try to do MOD_INC_USE_COUNT. This isn't very complicated in practice,
though it requires some thinking for module-internal semaphores (which is
why I'll solve that case for you).

You're still not allowed to touch your module refcount within interrupt
handlers or bhs. That would require two-stage module unloading, which we
might see in 2.5 (it's an extension that shouldn't break anything).

Philipp Rumpf

--X1bOJ3K7DJ5YkBrT
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=freeze-diff-07

diff -urN linux/arch/i386/mm/extable.c linux-prumpf/arch/i386/mm/extable.c
--- linux/arch/i386/mm/extable.c Wed Nov 10 08:31:37 1999
+++ linux-prumpf/arch/i386/mm/extable.c Wed Jun 28 13:43:10 2000
@@ -30,26 +30,30 @@
return 0;
}

+extern struct rw_semaphore module_mutex;
+
unsigned long
search_exception_table(unsigned long addr)
{
- unsigned long ret;
+ unsigned long ret = 0;

#ifndef CONFIG_MODULES
/* There is only the kernel to search. */
ret = search_one_table(__start___ex_table, __stop___ex_table-1, addr);
- if (ret) return ret;
#else
/* The kernel is the last "module" -- no need to treat it special. */
struct module *mp;
+
+ down_read(&module_mutex);
for (mp = module_list; mp != NULL; mp = mp->next) {
if (mp->ex_table_start == NULL)
continue;
ret = search_one_table(mp->ex_table_start,
mp->ex_table_end - 1, addr);
- if (ret) return ret;
+ if (ret) break;
}
+ up_read(&module_mutex);
#endif

- return 0;
+ return ret;
}
diff -urN linux/include/linux/smp.h linux-prumpf/include/linux/smp.h
--- linux/include/linux/smp.h Tue Jun 27 17:43:55 2000
+++ linux-prumpf/include/linux/smp.h Wed Jun 28 13:54:40 2000
@@ -76,6 +76,9 @@
#define MSG_RESCHEDULE 0x0003 /* Reschedule request from master CPU*/
#define MSG_CALL_FUNCTION 0x0004 /* Call function on all other CPUs */

+extern int freeze_other_cpus(void);
+extern void melt_other_cpus(void);
+
#else

/*
@@ -91,6 +94,8 @@
#define cpu_number_map(cpu) 0
#define smp_call_function(func,info,retry,wait) ({ 0; })
#define cpu_online_map 1
+#define freeze_other_cpus() 0
+#define melt_other_cpus() do { } while(0)

#endif
#endif
diff -urN linux/kernel/module.c linux-prumpf/kernel/module.c
--- linux/kernel/module.c Fri Jun 23 20:17:18 2000
+++ linux-prumpf/kernel/module.c Wed Jun 28 13:45:28 2000
@@ -46,6 +46,11 @@
/* Rest are NULL */
};

+/* module_mutex protects module_list - if you can down_read() it, modules
+ * won't be added/removed, but their usecount, flags aso still might change.
+ */
+
+DECLARE_RWSEM(module_mutex);
struct module *module_list = &kernel_module;

static long get_mod_name(const char *user_name, char **buf);
@@ -114,7 +119,6 @@

if (!capable(CAP_SYS_MODULE))
return -EPERM;
- lock_kernel();
if ((namelen = get_mod_name(name_user, &name)) < 0) {
error = namelen;
goto err0;
@@ -123,32 +127,40 @@
error = -EINVAL;
goto err1;
}
+ lock_kernel();
+ down_write(&module_mutex);
if (find_module(name) != NULL) {
error = -EEXIST;
- goto err1;
+ goto err2;
}
if ((mod = (struct module *)module_map(size)) == NULL) {
error = -ENOMEM;
- goto err1;
+ goto err2;
}

memset(mod, 0, sizeof(*mod));
mod->size_of_struct = sizeof(*mod);
- mod->next = module_list;
mod->name = (char *)(mod + 1);
mod->size = size;
memcpy((char*)(mod+1), name, namelen+1);

put_mod_name(name);

+ mod->next = module_list;
module_list = mod; /* link it in */

error = (long) mod;
+
+ up_write(&module_mutex);
+ unlock_kernel();
+
goto err0;
+err2:
+ up_write(&module_mutex);
+ unlock_kernel();
err1:
put_mod_name(name);
err0:
- unlock_kernel();
return error;
}

@@ -168,6 +180,7 @@
if (!capable(CAP_SYS_MODULE))
return -EPERM;
lock_kernel();
+ down_read(&module_mutex);
if ((namelen = get_mod_name(name_user, &name)) < 0) {
error = namelen;
goto err0;
@@ -346,6 +359,7 @@
err1:
put_mod_name(name);
err0:
+ up_read(&module_mutex);
unlock_kernel();
return error;
}
@@ -353,16 +367,10 @@
static spinlock_t unload_lock = SPIN_LOCK_UNLOCKED;
int try_inc_mod_count(struct module *mod)
{
- int res = 1;
- if (mod) {
- spin_lock(&unload_lock);
- if (mod->flags & MOD_DELETED)
- res = 0;
- else
- __MOD_INC_USE_COUNT(mod);
- spin_unlock(&unload_lock);
- }
- return res;
+ if (mod)
+ __MOD_INC_USE_COUNT(mod);
+
+ return 1;
}

asmlinkage long
@@ -376,10 +384,12 @@
if (!capable(CAP_SYS_MODULE))
return -EPERM;

+ if ((error = get_mod_name(name_user, &name)) < 0)
+ return error;
+
lock_kernel();
+ down_write(&module_mutex);
if (name_user) {
- if ((error = get_mod_name(name_user, &name)) < 0)
- goto out;
if (error == 0) {
error = -EINVAL;
put_mod_name(name);
@@ -399,7 +409,11 @@
if (!__MOD_IN_USE(mod)) {
mod->flags |= MOD_DELETED;
spin_unlock(&unload_lock);
- free_module(mod, 0);
+ error = freeze_other_cpus();
+ if(error == 0) {
+ free_module(mod, 0);
+ melt_other_cpus();
+ }
error = 0;
} else {
spin_unlock(&unload_lock);
@@ -426,7 +440,10 @@
} else {
mod->flags |= MOD_DELETED;
spin_unlock(&unload_lock);
- free_module(mod, 1);
+ if(freeze_other_cpus() == 0) {
+ free_module(mod, 1);
+ melt_other_cpus();
+ }
something_changed = 1;
}
} else {
@@ -439,6 +456,7 @@
mod->flags &= ~MOD_JUST_FREED;
error = 0;
out:
+ up_write(&module_mutex);
unlock_kernel();
return error;
}
@@ -661,6 +679,7 @@
int err;

lock_kernel();
+ down_read(&module_mutex);
if (name_user == NULL)
mod = &kernel_module;
else {
@@ -706,6 +725,7 @@
break;
}
out:
+ up_read(&module_mutex);
unlock_kernel();
return err;
}
@@ -726,6 +746,7 @@
struct kernel_sym ksym;

lock_kernel();
+ down_read(&module_mutex);
for (mod = module_list, i = 0; mod; mod = mod->next) {
/* include the count for the module name! */
i += mod->nsyms + 1;
@@ -768,6 +789,7 @@
}
}
out:
+ up_read(&module_mutex);
unlock_kernel();
return i;
}
@@ -848,6 +870,8 @@
char tmpstr[64];
struct module_ref *ref;

+ down_read(&module_mutex);
+
for (mod = module_list; mod != &kernel_module; mod = mod->next) {
long len;
const char *q;
@@ -914,8 +938,9 @@
#undef safe_copy_str
#undef safe_copy_cstr
}
-
fini:
+ up_read(&module_mutex);
+
return PAGE_SIZE - left;
}

@@ -932,6 +957,7 @@
off_t pos = 0;
off_t begin = 0;

+ down_read(&module_mutex);
for (mod = module_list; mod; mod = mod->next) {
unsigned i;
struct module_symbol *sym;
@@ -966,6 +992,7 @@
len -= (offset - begin);
if (len > length)
len = length;
+ up_read(&module_mutex);
return len;
}

@@ -1010,7 +1037,9 @@
void put_module_symbol(unsigned long addr)
{
struct module *mp;
-
+
+ /* XXX: safe to sleep here ? */
+ down_read(&module_mutex);
for (mp = module_list; mp; mp = mp->next) {
if (MOD_CAN_QUERY(mp) &&
addr >= (unsigned long)mp &&
@@ -1019,6 +1048,7 @@
return;
}
}
+ up_read(&module_mutex);
}

#else /* CONFIG_MODULES */
diff -urN linux/lib/Makefile linux-prumpf/lib/Makefile
--- linux/lib/Makefile Fri Jun 23 20:17:18 2000
+++ linux-prumpf/lib/Makefile Wed Jun 28 01:46:34 2000
@@ -10,4 +10,8 @@
L_OBJS := errno.o ctype.o string.o vsprintf.o brlock.o
LX_OBJS := cmdline.o

+ifeq ($(CONFIG_SMP),y)
+L_OBJS += freeze.o
+endif
+
include $(TOPDIR)/Rules.make
diff -urN linux/lib/freeze.c linux-prumpf/lib/freeze.c
--- linux/lib/freeze.c Wed Dec 31 16:00:00 1969
+++ linux-prumpf/lib/freeze.c Wed Jun 28 17:15:42 2000
@@ -0,0 +1,52 @@
+/* Currently, those functions are only used for module unloading. I put them
+ * here as they might be useful for other people and would introduce
+ * unnecessary noise in module.c */
+#include <asm/atomic.h>
+#include <asm/errno.h>
+#include <asm/processor.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+
+static atomic_t freeze_count = ATOMIC_INIT(0);
+static volatile int ice_block;
+static spinlock_t freeze_lock = SPIN_LOCK_UNLOCKED;
+
+static int
+antarctica(void *unused)
+{
+ atomic_inc(&freeze_count);
+ while (ice_block)
+ ;
+ atomic_dec(&freeze_count);
+ return 0;
+}
+
+int freeze_other_cpus(void)
+{
+ int cpu, retval;
+
+ /* If you reenter this code, you're seriously fucked */
+ if (!spin_trylock(&freeze_lock))
+ return -EAGAIN;
+
+ ice_block = 1;
+ for (cpu = 0; cpu < smp_num_cpus - 1; cpu++) {
+ retval = kernel_thread(antarctica, (void *)0, 0);
+ if (retval < 0)
+ goto out_melt;
+ }
+
+ while(atomic_read(&freeze_count) != smp_num_cpus - 1);
+ return 0;
+out_melt:
+ ice_block = 0;
+ spin_unlock(&freeze_lock);
+ return retval;
+}
+
+void melt_other_cpus(void)
+{
+ ice_block = 0;
+ while(atomic_read(&freeze_count) != 0); /* this is paranoia */
+ spin_unlock(&freeze_lock);
+}

--X1bOJ3K7DJ5YkBrT--

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/