Index: linux-2.6.24-rc8-rt1/Makefile =================================================================== --- linux-2.6.24-rc8-rt1.orig/Makefile 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/Makefile 2008-01-16 22:29:32.000000000 -0500 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 24 -EXTRAVERSION = -rc8 +EXTRAVERSION = -rc8-rt1 NAME = Arr Matey! A Hairy Bilge Rat! # *DOCUMENTATION* @@ -509,6 +509,9 @@ endif include $(srctree)/arch/$(SRCARCH)/Makefile +ifdef CONFIG_MCOUNT +KBUILD_CFLAGS += -pg +endif ifdef CONFIG_FRAME_POINTER KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls else Index: linux-2.6.24-rc8-rt1/arch/x86/Kconfig =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/Kconfig 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/Kconfig 2008-01-16 22:29:12.000000000 -0500 @@ -19,6 +19,7 @@ config X86_64 config X86 bool default y + select HAVE_MCOUNT config GENERIC_TIME bool @@ -94,10 +95,19 @@ config DMI default y config RWSEM_GENERIC_SPINLOCK - def_bool !X86_XADD + bool + depends on !X86_XADD || PREEMPT_RT + default y + +config ASM_SEMAPHORES + bool + default y + config RWSEM_XCHGADD_ALGORITHM - def_bool X86_XADD + bool + depends on X86_XADD && !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT + default y config ARCH_HAS_ILOG2_U32 def_bool n @@ -1219,6 +1229,10 @@ config OUT_OF_LINE_PFN_TO_PAGE def_bool X86_64 depends on DISCONTIGMEM +config HARDIRQS_SW_RESEND + bool + default y + menu "Power management options" depends on !X86_VOYAGER Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/entry_32.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/entry_32.S 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/entry_32.S 2008-01-16 22:28:48.000000000 -0500 @@ -75,6 +75,28 @@ DF_MASK = 0x00000400 NT_MASK = 0x00004000 VM_MASK = 0x00020000 +#ifdef CONFIG_MCOUNT +.globl mcount +mcount: + cmpl $0, mcount_enabled + jz out + + /* taken from glibc */ + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %edx + movl 0x4(%ebp), %eax + + call *mcount_trace_function + + popl %edx + popl %ecx + popl %eax +out: + ret +#endif + #ifdef CONFIG_PREEMPT #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else @@ -265,14 +287,18 @@ END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) + cmpl $0, kernel_preemption + jz restore_nocheck cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl - jz restore_all + jz restore_nocheck testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all + jz restore_nocheck + DISABLE_INTERRUPTS(CLBR_ANY) + call preempt_schedule_irq jmp need_resched END(resume_kernel) @@ -330,6 +356,11 @@ sysenter_past_esp: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_EVENT_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -345,6 +376,11 @@ sysenter_past_esp: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work +#ifdef CONFIG_EVENT_TRACE + pushl %eax + call sys_ret + popl %eax +#endif /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx movl PT_OLDESP(%esp), %ecx @@ -368,6 +404,11 @@ ENTRY(system_call) pushl %eax # save orig_eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_EVENT_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -466,20 +507,19 @@ ENDPROC(system_call) ALIGN RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jz work_notifysig work_resched: - call schedule LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + call __schedule # setting need_resched or sigpending # between sampling and the iret - TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jnz work_resched work_notifysig: # deal with pending signals and Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/entry_64.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/entry_64.S 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/entry_64.S 2008-01-16 22:28:33.000000000 -0500 @@ -53,6 +53,39 @@ .code64 +#ifdef CONFIG_MCOUNT + +ENTRY(mcount) + cmpl $0, mcount_enabled + jz out + + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rsi + movq 8(%rbp), %rdi + + call *mcount_trace_function + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp +out: + retq +#endif + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif @@ -234,7 +267,9 @@ ENTRY(system_call) cmpq $__NR_syscall_max,%rax ja badsys movq %r10,%rcx + TRACE_SYS_CALL call *sys_call_table(,%rax,8) # XXX: rip relative + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) /* * Syscall return path ending with SYSRET (fast path) @@ -268,8 +303,8 @@ sysret_check: /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz sysret_signal TRACE_IRQS_ON sti pushq %rdi @@ -292,7 +327,7 @@ sysret_signal: leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ cli @@ -317,7 +352,9 @@ tracesys: cmova %rcx,%rax ja 1f movq %r10,%rcx /* fixup for C */ + TRACE_SYS_CALL call *sys_call_table(,%rax,8) + TRACE_SYS_RET 1: movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ @@ -346,8 +383,8 @@ int_with_check: /* First do a reschedule test. */ /* edx: work, edi: workmask */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz int_very_careful TRACE_IRQS_ON sti pushq %rdi @@ -382,7 +419,7 @@ int_signal: movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 call do_notify_resume -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi int_restore_rest: RESTORE_REST cli @@ -588,8 +625,8 @@ bad_iret: /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz retint_signal TRACE_IRQS_ON sti pushq %rdi @@ -615,7 +652,7 @@ retint_signal: RESTORE_REST cli TRACE_IRQS_OFF - movl $_TIF_NEED_RESCHED,%edi + movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi GET_THREAD_INFO(%rcx) jmp retint_check Index: linux-2.6.24-rc8-rt1/include/linux/linkage.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/linkage.h 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/linkage.h 2008-01-16 22:27:52.000000000 -0500 @@ -3,6 +3,8 @@ #include +#define notrace __attribute__((no_instrument_function)) + #ifdef __cplusplus #define CPP_ASMLINKAGE extern "C" #else Index: linux-2.6.24-rc8-rt1/include/linux/mcount.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/include/linux/mcount.h 2008-01-16 22:28:11.000000000 -0500 @@ -0,0 +1,34 @@ +#ifndef _LINUX_MCOUNT_H +#define _LINUX_MCOUNT_H + +#ifdef CONFIG_MCOUNT +extern int mcount_enabled; + +#include + +typedef void (*mcount_func_t)(unsigned long ip, unsigned long parent_ip); + +struct mcount_ops { + mcount_func_t func; + struct mcount_ops *next; +}; + +/* + * The mcount_ops must be a static and should also + * be read_mostly. These functions do modify read_mostly variables + * so use them sparely. Never free an mcount_op or modify the + * next pointer after it has been registered. Even after unregistering + * it, the next pointer may still be used internally. + */ +int register_mcount_function(struct mcount_ops *ops); +int unregister_mcount_function(struct mcount_ops *ops); +void clear_mcount_function(void); + +extern void mcount(void); + +#else /* !CONFIG_MCOUNT */ +# define register_mcount_function(ops) do { } while (0) +# define unregister_mcount_function(ops) do { } while (0) +# define clear_mcount_function(ops) do { } while (0) +#endif /* CONFIG_MCOUNT */ +#endif /* _LINUX_MCOUNT_H */ Index: linux-2.6.24-rc8-rt1/kernel/sysctl.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/sysctl.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/sysctl.c 2008-01-16 22:28:58.000000000 -0500 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -43,9 +44,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -66,6 +69,7 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern int sysctl_panic_on_oom; extern int sysctl_oom_kill_allocating_task; +extern int futex_performance_hack; extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; @@ -346,6 +350,200 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_FUTEX + { + .ctl_name = CTL_UNNUMBERED, + .procname = "futex_performance_hack", + .data = &futex_performance_hack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "prof_pid", + .data = &prof_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_PREEMPT + { + .ctl_name = CTL_UNNUMBERED, + .procname = "kernel_preemption", + .data = &kernel_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_PREEMPT_VOLUNTARY + { + .ctl_name = CTL_UNNUMBERED, + .procname = "voluntary_preemption", + .data = &voluntary_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_PREEMPT_SOFTIRQS) && !defined(CONFIG_PREEMPT_RT) + { + .ctl_name = CTL_UNNUMBERED, + .procname = "softirq_preemption", + .data = &softirq_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_PREEMPT_HARDIRQS) && !defined(CONFIG_PREEMPT_RT) + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hardirq_preemption", + .data = &hardirq_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_WAKEUP_TIMING + { + .ctl_name = CTL_UNNUMBERED, + .procname = "wakeup_timing", + .data = &wakeup_timing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_LATENCY_TIMING) + { + .ctl_name = CTL_UNNUMBERED, + .procname = "preempt_max_latency", + .data = &preempt_max_latency, + .maxlen = sizeof(preempt_max_latency), + .mode = 0644, + .proc_handler = &proc_preempt_max_latency, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "preempt_thresh", + .data = &preempt_thresh, + .maxlen = sizeof(preempt_thresh), + .mode = 0644, + .proc_handler = &proc_preempt_threshold, + }, +#endif +#ifdef CONFIG_EVENT_TRACE + { + .ctl_name = CTL_UNNUMBERED, + .procname = "preempt_mark_thresh", + .data = &preempt_mark_thresh, + .maxlen = sizeof(preempt_mark_thresh), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_enabled", + .data = &trace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "syscall_tracing", + .data = &syscall_tracing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "stackframe_tracing", + .data = &stackframe_tracing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_user_triggered", + .data = &trace_user_triggered, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_user_trigger_irq", + .data = &trace_user_trigger_irq, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_freerunning", + .data = &trace_freerunning, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_print_on_crash", + .data = &trace_print_on_crash, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_verbose", + .data = &trace_verbose, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_all_cpus", + .data = &trace_all_cpus, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_use_raw_cycles", + .data = &trace_use_raw_cycles, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_all_runnable", + .data = &trace_all_runnable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_GENERIC_HARDIRQS + { + .ctl_name = CTL_UNNUMBERED, + .procname = "debug_direct_keyboard", + .data = &debug_direct_keyboard, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_CORE_USES_PID, .procname = "core_uses_pid", @@ -470,6 +668,16 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_MCOUNT + { + .ctl_name = CTL_UNNUMBERED, + .procname = "mcount_enabled", + .data = &mcount_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #ifdef CONFIG_KMOD { .ctl_name = KERN_MODPROBE, Index: linux-2.6.24-rc8-rt1/lib/Kconfig.debug =================================================================== --- linux-2.6.24-rc8-rt1.orig/lib/Kconfig.debug 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/lib/Kconfig.debug 2008-01-16 22:29:23.000000000 -0500 @@ -189,6 +189,8 @@ config DEBUG_RT_MUTEXES help This allows rt mutex semantics violations and rt mutex related deadlocks (lockups) to be detected and reported automatically. + When realtime preemption is enabled this includes spinlocks, + rwlocks, mutexes and (rw)semaphores config DEBUG_PI_LIST bool @@ -212,7 +214,7 @@ config DEBUG_SPINLOCK config DEBUG_MUTEXES bool "Mutex debugging: basic checks" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && !PREEMPT_RT help This feature allows mutex semantics violations to be detected and reported. @@ -342,6 +344,194 @@ config STACKTRACE depends on DEBUG_KERNEL depends on STACKTRACE_SUPPORT +config PREEMPT_TRACE + bool + default y + depends on DEBUG_PREEMPT + +config EVENT_TRACE + bool "Kernel event tracing" + default n + depends on GENERIC_TIME + select FRAME_POINTER + select STACKTRACE + help + This option enables a kernel tracing mechanism that will track + certain kernel events such as system call entry and return, + IRQ entry, context-switching, etc. + + Run the scripts/trace-it utility on a kernel with this option + enabled for sample output. + +config FUNCTION_TRACE + bool "Kernel function call tracing" + default n + depends on !REORDER && HAVE_MCOUNT + select EVENT_TRACE + select MCOUNT + help + This option enables a kernel tracing mechanism that will track + precise function-call granularity kernel execution. Sample + output: + + pcscd-1772 0D..2 6867us : deactivate_task (-2 1) + pcscd-1772 0D..2 6867us : dequeue_task (deactivate_task) + -0 0D..2 6870us : __switch_to (__schedule) + -0 0D..2 6871us : __schedule (-2 20) + -0 0D..2 6871us : __lock_acquire (lock_acquire) + -0 0D..2 6872us : __spin_unlock_irq (__schedule) + + Run the scripts/trace-it sample utility on a kernel with this + option enabled to capture 1 second worth of events. + + (Note that kernel size and overhead increases noticeably + with this option enabled.) + +config WAKEUP_TIMING + bool "Wakeup latency timing" + depends on GENERIC_TIME + help + This option measures the time spent from a highprio thread being + woken up to it getting scheduled on a CPU, with microsecond + accuracy. + + The default measurement method is a maximum search, which is + disabled by default, if neither WAKEUP_LATENCY_HIST nor + CRITICAL_LATENCY_HIST is selected, and can be runtime (re-)started + via: + + echo 0 > /proc/sys/kernel/preempt_max_latency + +config LATENCY_TRACE + bool "Latency tracing" + default n + depends on LATENCY_TIMING && !REORDER && GENERIC_TIME + select FRAME_POINTER + select FUNCTION_TRACE + help + When this option is enabled then the last maximum latency timing + event's full trace can be found in /proc/latency_trace, in a + human-readable (or rather as some would say, in a + kernel-developer-readable) form. + + (Note that kernel size and overhead increases noticeably + with this option enabled.) + +config CRITICAL_PREEMPT_TIMING + bool "Non-preemptible critical section latency timing" + default n + depends on PREEMPT + depends on GENERIC_TIME + help + This option measures the time spent in preempt-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default, if neither WAKEUP_LATENCY_HIST nor + CRITICAL_LATENCY_HIST is selected, and can be runtime (re-)started + via: + + echo 0 > /proc/sys/kernel/preempt_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the irqs-off timing option can be + used together or separately.) + +config CRITICAL_IRQSOFF_TIMING + bool "Interrupts-off critical section latency timing" + default n + depends on GENERIC_TIME + select TRACE_IRQFLAGS + help + This option measures the time spent in irqs-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default, if neither WAKEUP_LATENCY_HIST nor + CRITICAL_LATENCY_HIST is selected, and can be runtime (re-)started + via: + + echo 0 > /proc/sys/kernel/preempt_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the preempt-off timing option can be + used together or separately.) + +config WAKEUP_LATENCY_HIST + bool "wakeup latency histogram" + default n + depends on WAKEUP_TIMING + help + This option logs all the wakeup latency timing to a big histogram + bucket, in the meanwhile, it also dummies up printk produced by + wakeup latency timing. + + The wakeup latency timing histogram can be viewed via: + + cat /proc/latency_hist/wakeup_latency/CPU* + + (Note: * presents CPU ID.) + +config PREEMPT_OFF_HIST + bool "non-preemptible critical section latency histogram" + default n + depends on CRITICAL_PREEMPT_TIMING + help + This option logs all the non-preemptible critical section latency + timing to a big histogram bucket, in the meanwhile, it also + dummies up printk produced by non-preemptible critical section + latency timing. + + The non-preemptible critical section latency timing histogram can + be viewed via: + + cat /proc/latency_hist/preempt_off_latency/CPU* + + (Note: * presents CPU ID.) + +config INTERRUPT_OFF_HIST + bool "interrupts-off critical section latency histogram" + default n + depends on CRITICAL_IRQSOFF_TIMING + help + This option logs all the interrupts-off critical section latency + timing to a big histogram bucket, in the meanwhile, it also + dummies up printk produced by interrupts-off critical section + latency timing. + + The interrupts-off critical section latency timing histogram can + be viewed via: + + cat /proc/latency_hist/interrupt_off_latency/CPU* + + (Note: * presents CPU ID.) + +config CRITICAL_TIMING + bool + default y + depends on CRITICAL_PREEMPT_TIMING || CRITICAL_IRQSOFF_TIMING || WAKEUP_TIMING + +config DEBUG_TRACE_IRQFLAGS + bool + default y + depends on CRITICAL_IRQSOFF_TIMING + +config LATENCY_TIMING + bool + default y + depends on WAKEUP_TIMING || CRITICAL_TIMING + select SYSCTL + +config CRITICAL_LATENCY_HIST + bool + default y + depends on PREEMPT_OFF_HIST || INTERRUPT_OFF_HIST + +config LATENCY_HIST + bool + default y + depends on WAKEUP_LATENCY_HIST || CRITICAL_LATENCY_HIST + config DEBUG_KOBJECT bool "kobject debugging" depends on DEBUG_KERNEL @@ -517,4 +707,6 @@ config FAULT_INJECTION_STACKTRACE_FILTER help Provide stacktrace filter for fault-injection capabilities +source lib/tracing/Kconfig + source "samples/Kconfig" Index: linux-2.6.24-rc8-rt1/lib/Makefile =================================================================== --- linux-2.6.24-rc8-rt1.orig/lib/Makefile 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/lib/Makefile 2008-01-16 22:28:59.000000000 -0500 @@ -3,7 +3,7 @@ # lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o dump_stack.o \ + rbtree.o radix-tree.o dump_stack.o lock_list.o \ idr.o int_sqrt.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o prio_heap.o @@ -27,7 +27,8 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o +obj-$(CONFIG_PREEMPT_RT) += plist.o +obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o @@ -66,6 +67,8 @@ obj-$(CONFIG_AUDIT_GENERIC) += audit.o obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o +obj-$(CONFIG_MCOUNT) += tracing/ + lib-$(CONFIG_GENERIC_BUG) += bug.o hostprogs-y := gen_crc32table Index: linux-2.6.24-rc8-rt1/lib/tracing/Kconfig =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/lib/tracing/Kconfig 2008-01-16 22:27:52.000000000 -0500 @@ -0,0 +1,10 @@ + +# Archs that enable MCOUNT should select HAVE_MCOUNT +config HAVE_MCOUNT + bool + +# MCOUNT itself is useless, or will just be added overhead. +# It needs something to register a function with it. +config MCOUNT + bool + select FRAME_POINTER Index: linux-2.6.24-rc8-rt1/lib/tracing/Makefile =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/lib/tracing/Makefile 2008-01-16 22:27:52.000000000 -0500 @@ -0,0 +1,3 @@ +obj-$(CONFIG_MCOUNT) += libmcount.o + +libmcount-y := mcount.o Index: linux-2.6.24-rc8-rt1/lib/tracing/mcount.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/lib/tracing/mcount.c 2008-01-16 22:27:52.000000000 -0500 @@ -0,0 +1,141 @@ +/* + * Infrastructure for profiling code inserted by 'gcc -pg'. + * + * Copyright (C) 2007-2008 Steven Rostedt + * + * Originally ported from the -rt patch by: + * Copyright (C) 2007 Arnaldo Carvalho de Melo + * + * Based on code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ + +#include +#include + +/* + * Since we have nothing protecting between the test of + * mcount_trace_function and the call to it, we can't + * set it to NULL without risking a race that will have + * the kernel call the NULL pointer. Instead, we just + * set the function pointer to a dummy function. + */ +notrace void dummy_mcount_tracer(unsigned long ip, + unsigned long parent_ip) +{ + /* do nothing */ +} + +static DEFINE_SPINLOCK(mcount_func_lock); +static struct mcount_ops mcount_list_end __read_mostly = +{ + .func = dummy_mcount_tracer, +}; + +static struct mcount_ops *mcount_list __read_mostly = &mcount_list_end; +mcount_func_t mcount_trace_function __read_mostly = dummy_mcount_tracer; +int mcount_enabled __read_mostly; + +/* mcount is defined per arch in assembly */ +EXPORT_SYMBOL_GPL(mcount); + +notrace void mcount_list_func(unsigned long ip, unsigned long parent_ip) +{ + struct mcount_ops *op = mcount_list; + + while (op != &mcount_list_end) { + op->func(ip, parent_ip); + op = op->next; + }; +} + +/** + * register_mcount_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. + * + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. + */ +int register_mcount_function(struct mcount_ops *ops) +{ + unsigned long flags; + + spin_lock_irqsave(&mcount_func_lock, flags); + ops->next = mcount_list; + /* must have next seen before we update the list pointer */ + smp_wmb(); + mcount_list = ops; + /* + * For one func, simply call it directly. + * For more than one func, call the chain. + */ + if (ops->next == &mcount_list_end) + mcount_trace_function = ops->func; + else + mcount_trace_function = mcount_list_func; + spin_unlock_irqrestore(&mcount_func_lock, flags); + + return 0; +} + +/** + * unregister_mcount_function - unresgister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by mcount profiling. + */ +int unregister_mcount_function(struct mcount_ops *ops) +{ + unsigned long flags; + struct mcount_ops **p; + int ret = 0; + + spin_lock_irqsave(&mcount_func_lock, flags); + + /* + * If we are the only function, then the mcount pointer is + * pointing directly to that function. + */ + if (mcount_list == ops && ops->next == &mcount_list_end) { + mcount_trace_function = dummy_mcount_tracer; + mcount_list = &mcount_list_end; + goto out; + } + + for (p = &mcount_list; *p != &mcount_list_end; p = &(*p)->next) + if (*p == ops) + break; + + if (*p != ops) { + ret = -1; + goto out; + } + + *p = (*p)->next; + + /* If we only have one func left, then call that directly */ + if (mcount_list->next == &mcount_list_end) + mcount_trace_function = mcount_list->func; + + out: + spin_unlock_irqrestore(&mcount_func_lock, flags); + + return 0; +} + +/** + * clear_mcount_function - reset the mcount function + * + * This NULLs the mcount function and in essence stops + * tracing. There may be lag + */ +void clear_mcount_function(void) +{ + mcount_trace_function = dummy_mcount_tracer; +} Index: linux-2.6.24-rc8-rt1/lib/smp_processor_id.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/lib/smp_processor_id.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/lib/smp_processor_id.c 2008-01-16 22:28:49.000000000 -0500 @@ -7,7 +7,7 @@ #include #include -unsigned int debug_smp_processor_id(void) +notrace unsigned int debug_smp_processor_id(void) { unsigned long preempt_count = preempt_count(); int this_cpu = raw_smp_processor_id(); @@ -42,7 +42,7 @@ unsigned int debug_smp_processor_id(void if (!printk_ratelimit()) goto out_enable; - printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count(), current->comm, current->pid); + printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count()-1, current->comm, current->pid); print_symbol("caller is %s\n", (long)__builtin_return_address(0)); dump_stack(); Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/head64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/head64.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/head64.c 2008-01-16 22:28:42.000000000 -0500 @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -24,7 +25,11 @@ static void __init zap_identity_mappings { pgd_t *pgd = pgd_offset_k(0UL); pgd_clear(pgd); - __flush_tlb(); + /* + * preempt_disable/enable does not work this early in the + * bootup yet: + */ + write_cr3(read_cr3()); } /* Don't add a printk in there. printk relies on the PDA which is not initialized @@ -46,7 +51,7 @@ static void __init copy_bootdata(char *r } } -void __init x86_64_start_kernel(char * real_mode_data) +notrace void __init x86_64_start_kernel(char *real_mode_data) { int i; Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/setup64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/setup64.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/setup64.c 2008-01-16 22:29:19.000000000 -0500 @@ -114,7 +114,7 @@ void __init setup_per_cpu_areas(void) } } -void pda_init(int cpu) +notrace void pda_init(int cpu) { struct x8664_pda *pda = cpu_pda(cpu); @@ -197,7 +197,7 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist * 'CPU state barrier', nothing should get across. * A lot of state is already set up in PDA init. */ -void __cpuinit cpu_init (void) +notrace void __cpuinit cpu_init(void) { int cpu = stack_smp_processor_id(); struct tss_struct *t = &per_cpu(init_tss, cpu); @@ -248,7 +248,9 @@ void __cpuinit cpu_init (void) for (v = 0; v < N_EXCEPTION_STACKS; v++) { static const unsigned int order[N_EXCEPTION_STACKS] = { [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, +#if DEBUG_STACK > 0 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER +#endif }; if (cpu) { estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/smpboot_64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/smpboot_64.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/smpboot_64.c 2008-01-16 22:28:57.000000000 -0500 @@ -317,7 +317,7 @@ static inline void set_cpu_sibling_map(i /* * Setup code on secondary processor (after comming out of the trampoline) */ -void __cpuinit start_secondary(void) +notrace __cpuinit void start_secondary(void) { /* * Dont put anything before smp_callin(), SMP @@ -867,7 +867,6 @@ void __init smp_set_apicids(void) */ void __init smp_prepare_cpus(unsigned int max_cpus) { - nmi_watchdog_default(); current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? */ smp_set_apicids(); Index: linux-2.6.24-rc8-rt1/arch/x86/vdso/vclock_gettime.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/vdso/vclock_gettime.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/vdso/vclock_gettime.c 2008-01-16 22:27:52.000000000 -0500 @@ -24,7 +24,7 @@ #define gtod vdso_vsyscall_gtod_data -static long vdso_fallback_gettime(long clock, struct timespec *ts) +static long notrace vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; asm("syscall" : "=a" (ret) : @@ -32,7 +32,7 @@ static long vdso_fallback_gettime(long c return ret; } -static inline long vgetns(void) +static inline long notrace vgetns(void) { long v; cycles_t (*vread)(void); @@ -41,7 +41,7 @@ static inline long vgetns(void) return (v * gtod->clock.mult) >> gtod->clock.shift; } -static noinline int do_realtime(struct timespec *ts) +static noinline int notrace do_realtime(struct timespec *ts) { unsigned long seq, ns; do { @@ -55,7 +55,8 @@ static noinline int do_realtime(struct t } /* Copy of the version in kernel/time.c which we cannot directly access */ -static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) +static void notrace +vset_normalized_timespec(struct timespec *ts, long sec, long nsec) { while (nsec >= NSEC_PER_SEC) { nsec -= NSEC_PER_SEC; @@ -69,7 +70,7 @@ static void vset_normalized_timespec(str ts->tv_nsec = nsec; } -static noinline int do_monotonic(struct timespec *ts) +static noinline int notrace do_monotonic(struct timespec *ts) { unsigned long seq, ns, secs; do { @@ -83,7 +84,7 @@ static noinline int do_monotonic(struct return 0; } -int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +int notrace __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { if (likely(gtod->sysctl_enabled && gtod->clock.vread)) switch (clock) { @@ -97,7 +98,7 @@ int __vdso_clock_gettime(clockid_t clock int clock_gettime(clockid_t, struct timespec *) __attribute__((weak, alias("__vdso_clock_gettime"))); -int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +int notrace __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { long ret; if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { Index: linux-2.6.24-rc8-rt1/arch/x86/vdso/vgetcpu.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/vdso/vgetcpu.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/vdso/vgetcpu.c 2008-01-16 22:27:52.000000000 -0500 @@ -13,7 +13,8 @@ #include #include "vextern.h" -long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +long notrace +__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { unsigned int dummy, p; Index: linux-2.6.24-rc8-rt1/include/asm-x86/vsyscall.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/vsyscall.h 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/vsyscall.h 2008-01-16 22:27:52.000000000 -0500 @@ -24,7 +24,8 @@ enum vsyscall_num { ((unused, __section__ (".vsyscall_gtod_data"),aligned(16))) #define __section_vsyscall_clock __attribute__ \ ((unused, __section__ (".vsyscall_clock"),aligned(16))) -#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn"))) +#define __vsyscall_fn __attribute__ \ + ((unused, __section__(".vsyscall_fn"))) notrace #define VGETCPU_RDTSCP 1 #define VGETCPU_LSL 2 Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/vsyscall_64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/vsyscall_64.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/vsyscall_64.c 2008-01-16 22:29:06.000000000 -0500 @@ -42,7 +42,8 @@ #include #include -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) +#define __vsyscall(nr) \ + __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace #define __syscall_clobber "r11","rcx","memory" #define __pa_vsymbol(x) \ ({unsigned long v; \ @@ -60,7 +61,7 @@ int __vgetcpu_mode __section_vgetcpu_mod struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = { - .lock = SEQLOCK_UNLOCKED, + .lock = __RAW_SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), .sysctl_enabled = 1, }; @@ -85,6 +86,7 @@ void update_vsyscall(struct timespec *wa vsyscall_gtod_data.clock.mask = clock->mask; vsyscall_gtod_data.clock.mult = clock->mult; vsyscall_gtod_data.clock.shift = clock->shift; + vsyscall_gtod_data.clock.cycle_accumulated = clock->cycle_accumulated; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; @@ -120,10 +122,29 @@ static __always_inline long time_syscall static __always_inline void do_vgettimeofday(struct timeval * tv) { - cycle_t now, base, mask, cycle_delta; + cycle_t now, base, accumulated, mask, cycle_delta; unsigned seq; unsigned long mult, shift, nsec; cycle_t (*vread)(void); + + if (likely(__vsyscall_gtod_data.sysctl_enabled == 2)) { + struct timeval tmp; + + do { + barrier(); + tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; + tv->tv_usec = __vsyscall_gtod_data.wall_time_nsec; + barrier(); + tmp.tv_sec = __vsyscall_gtod_data.wall_time_sec; + tmp.tv_usec = __vsyscall_gtod_data.wall_time_nsec; + + } while (tmp.tv_usec != tv->tv_usec || + tmp.tv_sec != tv->tv_sec); + + tv->tv_usec /= NSEC_PER_USEC; + return; + } + do { seq = read_seqbegin(&__vsyscall_gtod_data.lock); @@ -134,6 +155,7 @@ static __always_inline void do_vgettimeo } now = vread(); base = __vsyscall_gtod_data.clock.cycle_last; + accumulated = __vsyscall_gtod_data.clock.cycle_accumulated; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; shift = __vsyscall_gtod_data.clock.shift; @@ -144,6 +166,7 @@ static __always_inline void do_vgettimeo /* calculate interval: */ cycle_delta = (now - base) & mask; + cycle_delta += accumulated; /* convert to nsecs: */ nsec += (cycle_delta * mult) >> shift; Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/nmi_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/nmi_32.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/nmi_32.c 2008-01-16 22:29:25.000000000 -0500 @@ -27,6 +27,8 @@ #include #include +#include + #include "mach_traps.h" int unknown_nmi_panic; @@ -43,7 +45,7 @@ static cpumask_t backtrace_mask = CPU_MA atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; +static unsigned int nmi_hz = 1000; static DEFINE_PER_CPU(short, wd_enabled); @@ -59,7 +61,12 @@ static int endflag __initdata = 0; static __init void nmi_cpu_busy(void *data) { #ifdef CONFIG_SMP + /* + * avoid a warning, on PREEMPT_RT this wont run in hardirq context: + */ +#ifndef CONFIG_PREEMPT_RT local_irq_enable_in_hardirq(); +#endif /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -94,7 +101,7 @@ static int __init check_nmi_watchdog(voi for_each_possible_cpu(cpu) prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; local_irq_enable(); - mdelay((20*1000)/nmi_hz); // wait 20 ticks + mdelay((100*1000)/nmi_hz); /* wait 100 ticks */ for_each_possible_cpu(cpu) { #ifdef CONFIG_SMP @@ -323,7 +330,47 @@ EXPORT_SYMBOL(touch_nmi_watchdog); extern void die_nmi(struct pt_regs *, const char *msg); -__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) +{ + int i; + + if (system_state == SYSTEM_BOOTING) + return; + + printk(KERN_WARNING "nmi_show_all_regs(): start on CPU#%d.\n", + raw_smp_processor_id()); + dump_stack(); + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + + smp_send_nmi_allbutself(); + + for_each_online_cpu(i) { + while (nmi_show_regs[i] == 1) + barrier(); + } +} + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + +notrace void irq_show_regs_callback(int cpu, struct pt_regs *regs) +{ + if (!nmi_show_regs[cpu]) + return; + + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + printk(KERN_WARNING "NMI show regs on CPU#%d:\n", cpu); + printk(KERN_WARNING "apic_timer_irqs: %d\n", + per_cpu(irq_stat, cpu).apic_timer_irqs); + show_regs(regs); + spin_unlock(&nmi_print_lock); +} + +notrace __kprobes int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) { /* @@ -336,6 +383,8 @@ __kprobes int nmi_watchdog_tick(struct p int cpu = smp_processor_id(); int rc=0; + __profile_tick(CPU_PROFILING, regs); + /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { @@ -348,7 +397,7 @@ __kprobes int nmi_watchdog_tick(struct p spin_lock(&lock); printk("NMI backtrace for cpu %d\n", cpu); - dump_stack(); + show_regs(regs); spin_unlock(&lock); cpu_clear(cpu, backtrace_mask); } @@ -360,6 +409,9 @@ __kprobes int nmi_watchdog_tick(struct p sum = per_cpu(irq_stat, cpu).apic_timer_irqs + per_cpu(irq_stat, cpu).irq0_irqs; + irq_show_regs_callback(cpu, regs); + + /* if the apic timer isn't firing, this cpu isn't doing much */ /* if the none of the timers isn't firing, this cpu isn't doing much */ if (!touched && last_irq_sums[cpu] == sum) { /* @@ -367,11 +419,29 @@ __kprobes int nmi_watchdog_tick(struct p * wait a few IRQs (5 seconds) before doing the oops ... */ alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. - */ - die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP"); + if (alert_counter[cpu] && !(alert_counter[cpu] % (5*nmi_hz))) { + int i; + + spin_lock(&nmi_print_lock); + printk(KERN_WARNING "NMI watchdog detected lockup on " + "CPU#%d (%d/%d)\n", cpu, alert_counter[cpu], + 5*nmi_hz); + show_regs(regs); + spin_unlock(&nmi_print_lock); + + for_each_online_cpu(i) { + if (i == cpu) + continue; + nmi_show_regs[i] = 1; + while (nmi_show_regs[i] == 1) + cpu_relax(); + } + printk(KERN_WARNING "NMI watchdog running again ...\n"); + for_each_online_cpu(i) + alert_counter[i] = 0; + + } + } else { last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; @@ -469,5 +539,17 @@ void __trigger_all_cpu_backtrace(void) } } +void smp_send_nmi_allbutself(void) +{ +#ifdef CONFIG_SMP + cpumask_t mask = cpu_online_map; + preempt_disable(); + cpu_clear(safe_smp_processor_id(), mask); + if (!cpus_empty(mask)) + send_IPI_mask(mask, NMI_VECTOR); + preempt_enable(); +#endif +} + EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/traps_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/traps_32.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/traps_32.c 2008-01-16 22:29:06.000000000 -0500 @@ -237,6 +237,7 @@ show_trace_log_lvl(struct task_struct *t { dump_trace(task, regs, stack, &print_trace_ops, log_lvl); printk("%s =======================\n", log_lvl); + print_traces(task); } void show_trace(struct task_struct *task, struct pt_regs *regs, @@ -266,8 +267,15 @@ static void show_stack_log_lvl(struct ta printk("\n%s ", log_lvl); printk("%08lx ", *stack++); } + + pause_on_oops_head(); + printk("\n%sCall Trace:\n", log_lvl); show_trace_log_lvl(task, regs, esp, log_lvl); + + pause_on_oops_tail(); + + debug_show_held_locks(task); } void show_stack(struct task_struct *task, unsigned long *esp) @@ -293,6 +301,12 @@ void dump_stack(void) EXPORT_SYMBOL(dump_stack); +#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_EVENT_TRACE) +extern unsigned long worst_stack_left; +#else +# define worst_stack_left -1L +#endif + void show_registers(struct pt_regs *regs) { int i; @@ -362,7 +376,7 @@ void die(const char * str, struct pt_reg u32 lock_owner; int lock_owner_depth; } die = { - .lock = __RAW_SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; @@ -374,7 +388,7 @@ void die(const char * str, struct pt_reg if (die.lock_owner != raw_smp_processor_id()) { console_verbose(); raw_local_irq_save(flags); - __raw_spin_lock(&die.lock); + spin_lock(&die.lock); die.lock_owner = smp_processor_id(); die.lock_owner_depth = 0; bust_spinlocks(1); @@ -423,7 +437,7 @@ void die(const char * str, struct pt_reg bust_spinlocks(0); die.lock_owner = -1; add_taint(TAINT_DIE); - __raw_spin_unlock(&die.lock); + spin_unlock(&die.lock); raw_local_irq_restore(flags); if (!regs) @@ -463,6 +477,11 @@ static void __kprobes do_trap(int trapnr if (!user_mode(regs)) goto kernel_trap; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + trap_signal: { /* * We want error_code and trap_no set for userspace faults and @@ -719,10 +738,11 @@ void __kprobes die_nmi(struct pt_regs *r crash_kexec(regs); } + nmi_exit(); do_exit(SIGSEGV); } -static __kprobes void default_do_nmi(struct pt_regs * regs) +static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; @@ -762,11 +782,12 @@ static __kprobes void default_do_nmi(str static int ignore_nmis; -fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) +notrace fastcall __kprobes void do_nmi(struct pt_regs *regs, long error_code) { int cpu; nmi_enter(); + nmi_trace((unsigned long)do_nmi, regs->eip, regs->eflags); cpu = smp_processor_id(); Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/nmi_64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/nmi_64.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/nmi_64.c 2008-01-16 22:29:25.000000000 -0500 @@ -20,11 +20,13 @@ #include #include #include +#include #include #include #include #include +#include int unknown_nmi_panic; int nmi_watchdog_enabled; @@ -42,7 +44,7 @@ atomic_t nmi_active = ATOMIC_INIT(0); / int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; +static unsigned int nmi_hz = 1000; static DEFINE_PER_CPU(short, wd_enabled); @@ -50,7 +52,7 @@ static DEFINE_PER_CPU(short, wd_enabled) static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); /* Run after command line and cpu_init init, but before all other checks */ -void nmi_watchdog_default(void) +static inline void nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; @@ -66,7 +68,9 @@ static int endflag __initdata = 0; */ static __init void nmi_cpu_busy(void *data) { +#ifndef CONFIG_PREEMPT_RT local_irq_enable_in_hardirq(); +#endif /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -301,7 +305,7 @@ void touch_nmi_watchdog(void) unsigned cpu; /* - * Tell other CPUs to reset their alert counters. We cannot + * Tell other CPUs to reset their alert counters. We cannot * do it ourselves because the alert count increase is not * atomic. */ @@ -312,15 +316,54 @@ void touch_nmi_watchdog(void) } touch_softlockup_watchdog(); + touch_softlockup_watchdog(); } -int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) +{ + int i; + + if (system_state == SYSTEM_BOOTING) + return; + + smp_send_nmi_allbutself(); + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + + for_each_online_cpu(i) { + while (nmi_show_regs[i] == 1) + barrier(); + } +} + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + +notrace void irq_show_regs_callback(int cpu, struct pt_regs *regs) +{ + if (!nmi_show_regs[cpu]) + return; + + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + printk(KERN_WARNING "NMI show regs on CPU#%d:\n", cpu); + printk(KERN_WARNING "apic_timer_irqs: %d\n", read_pda(apic_timer_irqs)); + show_regs(regs); + spin_unlock(&nmi_print_lock); +} + +notrace __kprobes int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) { int sum; int touched = 0; int cpu = smp_processor_id(); int rc = 0; + irq_show_regs_callback(cpu, regs); + __profile_tick(CPU_PROFILING, regs); + /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { @@ -328,7 +371,6 @@ int __kprobes nmi_watchdog_tick(struct p touched = 1; } - sum = read_pda(apic_timer_irqs) + read_pda(irq0_irqs); if (__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; @@ -344,6 +386,12 @@ int __kprobes nmi_watchdog_tick(struct p cpu_clear(cpu, backtrace_mask); } + /* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ + sum = read_pda(apic_timer_irqs) + kstat_cpu(cpu).irqs[0]; + #ifdef CONFIG_X86_MCE /* Could check oops_in_progress here too, but it's safer not too */ @@ -357,9 +405,20 @@ int __kprobes nmi_watchdog_tick(struct p * wait a few IRQs (5 seconds) before doing the oops ... */ local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) + if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { + int i; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + nmi_show_regs[i] = 1; + while (nmi_show_regs[i] == 1) + cpu_relax(); + } + die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, panic_on_timeout); + } } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); @@ -477,6 +536,15 @@ void __trigger_all_cpu_backtrace(void) } } +void smp_send_nmi_allbutself(void) +{ +#ifdef CONFIG_SMP + preempt_disable(); + send_IPI_allbutself(NMI_VECTOR); + preempt_enable(); +#endif +} + EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); EXPORT_SYMBOL(touch_nmi_watchdog); Index: linux-2.6.24-rc8-rt1/drivers/clocksource/acpi_pm.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/clocksource/acpi_pm.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/clocksource/acpi_pm.c 2008-01-16 22:27:53.000000000 -0500 @@ -30,13 +30,13 @@ */ u32 pmtmr_ioport __read_mostly; -static inline u32 read_pmtmr(void) +static inline notrace u32 read_pmtmr(void) { /* mask the output to 24 bits */ return inl(pmtmr_ioport) & ACPI_PM_MASK; } -u32 acpi_pm_read_verified(void) +notrace u32 acpi_pm_read_verified(void) { u32 v1 = 0, v2 = 0, v3 = 0; @@ -56,12 +56,12 @@ u32 acpi_pm_read_verified(void) return v2; } -static cycle_t acpi_pm_read_slow(void) +static notrace cycle_t acpi_pm_read_slow(void) { return (cycle_t)acpi_pm_read_verified(); } -static cycle_t acpi_pm_read(void) +static notrace cycle_t acpi_pm_read(void) { return (cycle_t)read_pmtmr(); } Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/apic_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/apic_32.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/apic_32.c 2008-01-16 22:28:12.000000000 -0500 @@ -577,10 +577,12 @@ static void local_apic_timer_interrupt(v * interrupt as well. Thus we cannot inline the local irq ... ] */ -void fastcall smp_apic_timer_interrupt(struct pt_regs *regs) +notrace fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); + trace_special(regs->eip, 1, 0); + /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. @@ -1308,6 +1310,7 @@ void smp_error_interrupt(struct pt_regs */ printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); + dump_stack(); irq_exit(); } Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/hpet.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/hpet.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/hpet.c 2008-01-16 22:27:53.000000000 -0500 @@ -295,7 +295,7 @@ static int hpet_legacy_next_event(unsign /* * Clock source related code */ -static cycle_t read_hpet(void) +static notrace cycle_t read_hpet(void) { return (cycle_t)hpet_readl(HPET_COUNTER); } Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/time_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/time_32.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/time_32.c 2008-01-16 22:27:53.000000000 -0500 @@ -122,7 +122,7 @@ static int set_rtc_mmss(unsigned long no int timer_ack; -unsigned long profile_pc(struct pt_regs *regs) +notrace unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/tsc_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/tsc_32.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/tsc_32.c 2008-01-16 22:27:53.000000000 -0500 @@ -269,7 +269,7 @@ core_initcall(cpufreq_tsc); static unsigned long current_tsc_khz = 0; -static cycle_t read_tsc(void) +static notrace cycle_t read_tsc(void) { cycle_t ret; Index: linux-2.6.24-rc8-rt1/arch/x86/lib/delay_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/lib/delay_32.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/lib/delay_32.c 2008-01-16 22:27:53.000000000 -0500 @@ -24,7 +24,7 @@ #endif /* simple loop based delay: */ -static void delay_loop(unsigned long loops) +static notrace void delay_loop(unsigned long loops) { int d0; @@ -39,7 +39,7 @@ static void delay_loop(unsigned long loo } /* TSC based delay: */ -static void delay_tsc(unsigned long loops) +static notrace void delay_tsc(unsigned long loops) { unsigned long bclock, now; @@ -72,7 +72,7 @@ int read_current_timer(unsigned long *ti return -1; } -void __delay(unsigned long loops) +notrace void __delay(unsigned long loops) { delay_fn(loops); } Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/tsc_64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/tsc_64.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/tsc_64.c 2008-01-16 22:27:53.000000000 -0500 @@ -248,13 +248,13 @@ __setup("notsc", notsc_setup); /* clock source code: */ -static cycle_t read_tsc(void) +static notrace cycle_t read_tsc(void) { cycle_t ret = (cycle_t)get_cycles_sync(); return ret; } -static cycle_t __vsyscall_fn vread_tsc(void) +static notrace cycle_t __vsyscall_fn vread_tsc(void) { cycle_t ret = (cycle_t)get_cycles_sync(); return ret; Index: linux-2.6.24-rc8-rt1/kernel/lockdep.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/lockdep.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/lockdep.c 2008-01-16 22:29:14.000000000 -0500 @@ -66,7 +66,7 @@ module_param(lock_stat, int, 0644); * to use a raw spinlock - we really dont want the spinlock * code to recurse back into the lockdep code... */ -static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static __raw_spinlock_t lockdep_lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static int graph_lock(void) { @@ -270,14 +270,14 @@ static struct list_head chainhash_table[ ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ (key2)) -void lockdep_off(void) +notrace void lockdep_off(void) { current->lockdep_recursion++; } EXPORT_SYMBOL(lockdep_off); -void lockdep_on(void) +notrace void lockdep_on(void) { current->lockdep_recursion--; } @@ -508,7 +508,11 @@ static void print_lock(struct held_lock static void lockdep_print_held_locks(struct task_struct *curr) { - int i, depth = curr->lockdep_depth; + int i, depth; + + if (!curr) + curr = current; + depth = curr->lockdep_depth; if (!depth) { printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr)); @@ -573,7 +577,7 @@ static void print_lock_dependencies(stru static void print_kernel_version(void) { - printk("%s %.*s\n", init_utsname()->release, + printk("[ %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); } @@ -809,6 +813,21 @@ out_unlock_set: return class; } +#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_TRACE_IRQFLAGS) + +#define RECURSION_LIMIT 40 + +static int noinline print_infinite_recursion_bug(void) +{ + if (!debug_locks_off_graph_unlock()) + return 0; + + WARN_ON(1); + + return 0; +} +#endif /* CONFIG_PROVE_LOCKING || CONFIG_TRACE_IRQFLAGS */ + #ifdef CONFIG_PROVE_LOCKING /* * Allocate a lockdep entry. (assumes the graph_lock held, returns @@ -939,18 +958,6 @@ static noinline int print_circular_bug_t return 0; } -#define RECURSION_LIMIT 40 - -static int noinline print_infinite_recursion_bug(void) -{ - if (!debug_locks_off_graph_unlock()) - return 0; - - WARN_ON(1); - - return 0; -} - /* * Prove that the dependency graph starting at can not * lead to . Print an error and return 0 if it does. @@ -1036,7 +1043,7 @@ find_usage_forwards(struct lock_class *s * Return 1 otherwise and keep unchanged. * Return 0 on error. */ -static noinline int +static noinline notrace int find_usage_backwards(struct lock_class *source, unsigned int depth) { struct lock_list *entry; @@ -1068,6 +1075,7 @@ find_usage_backwards(struct lock_class * return 1; } +#ifdef CONFIG_PROVE_LOCKING static int print_bad_irq_dependency(struct task_struct *curr, struct held_lock *prev, @@ -1128,6 +1136,7 @@ print_bad_irq_dependency(struct task_str return 0; } +#endif /* CONFIG_PROVE_LOCKING */ static int check_usage(struct task_struct *curr, struct held_lock *prev, @@ -1586,7 +1595,7 @@ static inline int validate_chain(struct * We are building curr_chain_key incrementally, so double-check * it from scratch, to make sure that it's done correctly: */ -static void check_chain_key(struct task_struct *curr) +static notrace void check_chain_key(struct task_struct *curr) { #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; @@ -1962,7 +1971,7 @@ static int mark_lock_irq(struct task_str /* * Mark all held locks with a usage bit: */ -static int +static notrace int mark_held_locks(struct task_struct *curr, int hardirq) { enum lock_usage_bit usage_bit; @@ -2009,7 +2018,7 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void trace_hardirqs_on(void) +notrace void trace_hardirqs_on_caller(unsigned long a0) { struct task_struct *curr = current; unsigned long ip; @@ -2050,6 +2059,13 @@ void trace_hardirqs_on(void) curr->hardirq_enable_ip = ip; curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_on_events); +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + time_hardirqs_on(a0, 0 /* CALLER_ADDR1 */); +#endif +} + +void notrace trace_hardirqs_on(void) { + trace_hardirqs_on_caller(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); @@ -2057,7 +2073,7 @@ EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off(void) +void notrace trace_hardirqs_off_caller(unsigned long a0) { struct task_struct *curr = current; @@ -2075,10 +2091,17 @@ void trace_hardirqs_off(void) curr->hardirq_disable_ip = _RET_IP_; curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_off_events); +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + time_hardirqs_off(a0, 0 /* CALLER_ADDR1 */); +#endif } else debug_atomic_inc(&redundant_hardirqs_off); } +void notrace trace_hardirqs_off(void) { + trace_hardirqs_off_caller(CALLER_ADDR0); +} + EXPORT_SYMBOL(trace_hardirqs_off); /* @@ -2241,8 +2264,8 @@ static inline int separate_irq_context(s /* * Mark a lock with a usage bit, and validate the state transition: */ -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) +static notrace int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) { unsigned int new_mask = 1 << new_bit, ret = 1; @@ -2301,6 +2324,7 @@ static int mark_lock(struct task_struct * We must printk outside of the graph_lock: */ if (ret == 2) { + user_trace_stop(); printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); print_lock(this); print_irqtrace_events(curr); @@ -2518,6 +2542,55 @@ static int check_unlock(struct task_stru return 1; } +static int +__lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class *class; + unsigned int depth; + int i; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return 0; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + return print_unlock_inbalance_bug(curr, lock, ip); + +found_it: + class = register_lock_class(lock, subclass, 0); + hlock->class = class; + + curr->lockdep_depth = i; + curr->curr_chain_key = hlock->prev_chain_key; + + for (; i < depth; i++) { + hlock = curr->held_locks + i; + if (!__lock_acquire(hlock->instance, + hlock->class->subclass, hlock->trylock, + hlock->read, hlock->check, hlock->hardirqs_off, + hlock->acquire_ip)) + return 0; + } + + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + return 0; + return 1; +} + /* * Remove the lock to the list of currently held locks in a * potentially non-nested (out of order) manner. This is a @@ -2648,7 +2721,7 @@ __lock_release(struct lockdep_map *lock, /* * Check whether we follow the irq-flags state precisely: */ -static void check_flags(unsigned long flags) +static notrace void check_flags(unsigned long flags) { #if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS) if (!debug_locks) @@ -2681,12 +2754,35 @@ static void check_flags(unsigned long fl #endif } +void +lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(!lock_stat && !prove_locking)) + return; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; + check_flags(flags); + if (__lock_set_subclass(lock, subclass, ip)) + check_chain_key(current); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} + +EXPORT_SYMBOL_GPL(lock_set_subclass); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: */ -void lock_acquire(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, unsigned long ip) +notrace void lock_acquire(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, unsigned long ip) { unsigned long flags; @@ -2697,9 +2793,9 @@ void lock_acquire(struct lockdep_map *lo return; raw_local_irq_save(flags); + current->lockdep_recursion = 1; check_flags(flags); - current->lockdep_recursion = 1; __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), ip); current->lockdep_recursion = 0; @@ -2708,7 +2804,8 @@ void lock_acquire(struct lockdep_map *lo EXPORT_SYMBOL_GPL(lock_acquire); -void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +notrace void lock_release(struct lockdep_map *lock, int nested, + unsigned long ip) { unsigned long flags; @@ -2719,8 +2816,8 @@ void lock_release(struct lockdep_map *lo return; raw_local_irq_save(flags); - check_flags(flags); current->lockdep_recursion = 1; + check_flags(flags); __lock_release(lock, nested, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); @@ -3032,13 +3129,13 @@ void __init lockdep_info(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); - printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); - printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); - printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); - printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); - printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); - printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); - printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); + printk("... MAX_LOCKDEP_SUBCLASSES: %6lu\n", MAX_LOCKDEP_SUBCLASSES); + printk("... MAX_LOCK_DEPTH: %6lu\n", MAX_LOCK_DEPTH); + printk("... MAX_LOCKDEP_KEYS: %6lu\n", MAX_LOCKDEP_KEYS); + printk("... CLASSHASH_SIZE: %6lu\n", CLASSHASH_SIZE); + printk("... MAX_LOCKDEP_ENTRIES: %6lu\n", MAX_LOCKDEP_ENTRIES); + printk("... MAX_LOCKDEP_CHAINS: %6lu\n", MAX_LOCKDEP_CHAINS); + printk("... CHAINHASH_SIZE: %6lu\n", CHAINHASH_SIZE); printk(" memory used by lock dependency info: %lu kB\n", (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + @@ -3205,7 +3302,8 @@ void debug_show_held_locks(struct task_s printk("INFO: lockdep is turned off.\n"); return; } - lockdep_print_held_locks(task); + if (task == current) + lockdep_print_held_locks(task); } EXPORT_SYMBOL_GPL(debug_show_held_locks); Index: linux-2.6.24-rc8-rt1/kernel/spinlock.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/spinlock.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/spinlock.c 2008-01-16 22:28:31.000000000 -0500 @@ -21,7 +21,7 @@ #include #include -int __lockfunc _spin_trylock(spinlock_t *lock) +int __lockfunc __spin_trylock(raw_spinlock_t *lock) { preempt_disable(); if (_raw_spin_trylock(lock)) { @@ -32,9 +32,46 @@ int __lockfunc _spin_trylock(spinlock_t preempt_enable(); return 0; } -EXPORT_SYMBOL(_spin_trylock); +EXPORT_SYMBOL(__spin_trylock); -int __lockfunc _read_trylock(rwlock_t *lock) +int __lockfunc __spin_trylock_irq(raw_spinlock_t *lock) +{ + local_irq_disable(); + preempt_disable(); + + if (_raw_spin_trylock(lock)) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + return 1; + } + + __preempt_enable_no_resched(); + local_irq_enable(); + preempt_check_resched(); + + return 0; +} +EXPORT_SYMBOL(__spin_trylock_irq); + +int __lockfunc __spin_trylock_irqsave(raw_spinlock_t *lock, + unsigned long *flags) +{ + local_irq_save(*flags); + preempt_disable(); + + if (_raw_spin_trylock(lock)) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + return 1; + } + + __preempt_enable_no_resched(); + local_irq_restore(*flags); + preempt_check_resched(); + + return 0; +} +EXPORT_SYMBOL(__spin_trylock_irqsave); + +int __lockfunc __read_trylock(raw_rwlock_t *lock) { preempt_disable(); if (_raw_read_trylock(lock)) { @@ -45,9 +82,9 @@ int __lockfunc _read_trylock(rwlock_t *l preempt_enable(); return 0; } -EXPORT_SYMBOL(_read_trylock); +EXPORT_SYMBOL(__read_trylock); -int __lockfunc _write_trylock(rwlock_t *lock) +int __lockfunc __write_trylock(raw_rwlock_t *lock) { preempt_disable(); if (_raw_write_trylock(lock)) { @@ -58,7 +95,21 @@ int __lockfunc _write_trylock(rwlock_t * preempt_enable(); return 0; } -EXPORT_SYMBOL(_write_trylock); +EXPORT_SYMBOL(__write_trylock); + +int __lockfunc __write_trylock_irqsave(raw_rwlock_t *lock, unsigned long *flags) +{ + int ret; + + local_irq_save(*flags); + ret = __write_trylock(lock); + if (ret) + return ret; + + local_irq_restore(*flags); + return 0; +} +EXPORT_SYMBOL(__write_trylock_irqsave); /* * If lockdep is enabled then we use the non-preemption spin-ops @@ -66,17 +117,17 @@ EXPORT_SYMBOL(_write_trylock); * not re-enabled during lock-acquire (which the preempt-spin-ops do): */ #if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ - defined(CONFIG_DEBUG_LOCK_ALLOC) + defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_PREEMPT_RT) -void __lockfunc _read_lock(rwlock_t *lock) +void __lockfunc __read_lock(raw_rwlock_t *lock) { preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } -EXPORT_SYMBOL(_read_lock); +EXPORT_SYMBOL(__read_lock); -unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +unsigned long __lockfunc __spin_lock_irqsave(raw_spinlock_t *lock) { unsigned long flags; @@ -95,27 +146,27 @@ unsigned long __lockfunc _spin_lock_irqs #endif return flags; } -EXPORT_SYMBOL(_spin_lock_irqsave); +EXPORT_SYMBOL(__spin_lock_irqsave); -void __lockfunc _spin_lock_irq(spinlock_t *lock) +void __lockfunc __spin_lock_irq(raw_spinlock_t *lock) { local_irq_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } -EXPORT_SYMBOL(_spin_lock_irq); +EXPORT_SYMBOL(__spin_lock_irq); -void __lockfunc _spin_lock_bh(spinlock_t *lock) +void __lockfunc __spin_lock_bh(raw_spinlock_t *lock) { local_bh_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } -EXPORT_SYMBOL(_spin_lock_bh); +EXPORT_SYMBOL(__spin_lock_bh); -unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) +unsigned long __lockfunc __read_lock_irqsave(raw_rwlock_t *lock) { unsigned long flags; @@ -125,27 +176,27 @@ unsigned long __lockfunc _read_lock_irqs LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); return flags; } -EXPORT_SYMBOL(_read_lock_irqsave); +EXPORT_SYMBOL(__read_lock_irqsave); -void __lockfunc _read_lock_irq(rwlock_t *lock) +void __lockfunc __read_lock_irq(raw_rwlock_t *lock) { local_irq_disable(); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } -EXPORT_SYMBOL(_read_lock_irq); +EXPORT_SYMBOL(__read_lock_irq); -void __lockfunc _read_lock_bh(rwlock_t *lock) +void __lockfunc __read_lock_bh(raw_rwlock_t *lock) { local_bh_disable(); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } -EXPORT_SYMBOL(_read_lock_bh); +EXPORT_SYMBOL(__read_lock_bh); -unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) +unsigned long __lockfunc __write_lock_irqsave(raw_rwlock_t *lock) { unsigned long flags; @@ -155,43 +206,43 @@ unsigned long __lockfunc _write_lock_irq LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); return flags; } -EXPORT_SYMBOL(_write_lock_irqsave); +EXPORT_SYMBOL(__write_lock_irqsave); -void __lockfunc _write_lock_irq(rwlock_t *lock) +void __lockfunc __write_lock_irq(raw_rwlock_t *lock) { local_irq_disable(); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } -EXPORT_SYMBOL(_write_lock_irq); +EXPORT_SYMBOL(__write_lock_irq); -void __lockfunc _write_lock_bh(rwlock_t *lock) +void __lockfunc __write_lock_bh(raw_rwlock_t *lock) { local_bh_disable(); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } -EXPORT_SYMBOL(_write_lock_bh); +EXPORT_SYMBOL(__write_lock_bh); -void __lockfunc _spin_lock(spinlock_t *lock) +void __lockfunc __spin_lock(raw_spinlock_t *lock) { preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } -EXPORT_SYMBOL(_spin_lock); +EXPORT_SYMBOL(__spin_lock); -void __lockfunc _write_lock(rwlock_t *lock) +void __lockfunc __write_lock(raw_rwlock_t *lock) { preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } -EXPORT_SYMBOL(_write_lock); +EXPORT_SYMBOL(__write_lock); #else /* CONFIG_PREEMPT: */ @@ -204,7 +255,7 @@ EXPORT_SYMBOL(_write_lock); */ #define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc _##op##_lock(locktype##_t *lock) \ +void __lockfunc __##op##_lock(locktype##_t *lock) \ { \ for (;;) { \ preempt_disable(); \ @@ -214,15 +265,16 @@ void __lockfunc _##op##_lock(locktype##_ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!op##_can_lock(lock) && (lock)->break_lock) \ - _raw_##op##_relax(&lock->raw_lock); \ + while (!__raw_##op##_can_lock(&(lock)->raw_lock) && \ + (lock)->break_lock) \ + __raw_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ } \ \ -EXPORT_SYMBOL(_##op##_lock); \ +EXPORT_SYMBOL(__##op##_lock); \ \ -unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ +unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -236,23 +288,24 @@ unsigned long __lockfunc _##op##_lock_ir \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!op##_can_lock(lock) && (lock)->break_lock) \ - _raw_##op##_relax(&lock->raw_lock); \ + while (!__raw_##op##_can_lock(&(lock)->raw_lock) && \ + (lock)->break_lock) \ + __raw_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ return flags; \ } \ \ -EXPORT_SYMBOL(_##op##_lock_irqsave); \ +EXPORT_SYMBOL(__##op##_lock_irqsave); \ \ -void __lockfunc _##op##_lock_irq(locktype##_t *lock) \ +void __lockfunc __##op##_lock_irq(locktype##_t *lock) \ { \ - _##op##_lock_irqsave(lock); \ + __##op##_lock_irqsave(lock); \ } \ \ -EXPORT_SYMBOL(_##op##_lock_irq); \ +EXPORT_SYMBOL(__##op##_lock_irq); \ \ -void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ +void __lockfunc __##op##_lock_bh(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -261,39 +314,40 @@ void __lockfunc _##op##_lock_bh(locktype /* irq-disabling. We use the generic preemption-aware */ \ /* function: */ \ /**/ \ - flags = _##op##_lock_irqsave(lock); \ + flags = __##op##_lock_irqsave(lock); \ local_bh_disable(); \ local_irq_restore(flags); \ } \ \ -EXPORT_SYMBOL(_##op##_lock_bh) +EXPORT_SYMBOL(__##op##_lock_bh) /* * Build preemption-friendly versions of the following * lock-spinning functions: * - * _[spin|read|write]_lock() - * _[spin|read|write]_lock_irq() - * _[spin|read|write]_lock_irqsave() - * _[spin|read|write]_lock_bh() + * __[spin|read|write]_lock() + * __[spin|read|write]_lock_irq() + * __[spin|read|write]_lock_irqsave() + * __[spin|read|write]_lock_bh() */ -BUILD_LOCK_OPS(spin, spinlock); -BUILD_LOCK_OPS(read, rwlock); -BUILD_LOCK_OPS(write, rwlock); +BUILD_LOCK_OPS(spin, raw_spinlock); +BUILD_LOCK_OPS(read, raw_rwlock); +BUILD_LOCK_OPS(write, raw_rwlock); #endif /* CONFIG_PREEMPT */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) +void __lockfunc __spin_lock_nested(raw_spinlock_t *lock, int subclass) { preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } +EXPORT_SYMBOL(__spin_lock_nested); -EXPORT_SYMBOL(_spin_lock_nested); -unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +unsigned long __lockfunc +__spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) { unsigned long flags; @@ -312,117 +366,130 @@ unsigned long __lockfunc _spin_lock_irqs #endif return flags; } - -EXPORT_SYMBOL(_spin_lock_irqsave_nested); +EXPORT_SYMBOL(__spin_lock_irqsave_nested); #endif -void __lockfunc _spin_unlock(spinlock_t *lock) +void __lockfunc __spin_unlock(raw_spinlock_t *lock) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); preempt_enable(); } -EXPORT_SYMBOL(_spin_unlock); +EXPORT_SYMBOL(__spin_unlock); + +void __lockfunc __spin_unlock_no_resched(raw_spinlock_t *lock) +{ + spin_release(&lock->dep_map, 1, _RET_IP_); + _raw_spin_unlock(lock); + __preempt_enable_no_resched(); +} +/* not exported */ -void __lockfunc _write_unlock(rwlock_t *lock) +void __lockfunc __write_unlock(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); preempt_enable(); } -EXPORT_SYMBOL(_write_unlock); +EXPORT_SYMBOL(__write_unlock); -void __lockfunc _read_unlock(rwlock_t *lock) +void __lockfunc __read_unlock(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); preempt_enable(); } -EXPORT_SYMBOL(_read_unlock); +EXPORT_SYMBOL(__read_unlock); -void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +void __lockfunc __spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); + __preempt_enable_no_resched(); local_irq_restore(flags); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_spin_unlock_irqrestore); +EXPORT_SYMBOL(__spin_unlock_irqrestore); -void __lockfunc _spin_unlock_irq(spinlock_t *lock) +void __lockfunc __spin_unlock_irq(raw_spinlock_t *lock) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); + __preempt_enable_no_resched(); local_irq_enable(); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_spin_unlock_irq); +EXPORT_SYMBOL(__spin_unlock_irq); -void __lockfunc _spin_unlock_bh(spinlock_t *lock) +void __lockfunc __spin_unlock_bh(raw_spinlock_t *lock) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } -EXPORT_SYMBOL(_spin_unlock_bh); +EXPORT_SYMBOL(__spin_unlock_bh); -void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +void __lockfunc __read_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); + __preempt_enable_no_resched(); local_irq_restore(flags); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_read_unlock_irqrestore); +EXPORT_SYMBOL(__read_unlock_irqrestore); -void __lockfunc _read_unlock_irq(rwlock_t *lock) +void __lockfunc __read_unlock_irq(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); + __preempt_enable_no_resched(); local_irq_enable(); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_read_unlock_irq); +EXPORT_SYMBOL(__read_unlock_irq); -void __lockfunc _read_unlock_bh(rwlock_t *lock) +void __lockfunc __read_unlock_bh(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } -EXPORT_SYMBOL(_read_unlock_bh); +EXPORT_SYMBOL(__read_unlock_bh); -void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +void __lockfunc __write_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); + __preempt_enable_no_resched(); local_irq_restore(flags); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_write_unlock_irqrestore); +EXPORT_SYMBOL(__write_unlock_irqrestore); -void __lockfunc _write_unlock_irq(rwlock_t *lock) +void __lockfunc __write_unlock_irq(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); + __preempt_enable_no_resched(); local_irq_enable(); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_write_unlock_irq); +EXPORT_SYMBOL(__write_unlock_irq); -void __lockfunc _write_unlock_bh(rwlock_t *lock) +void __lockfunc __write_unlock_bh(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } -EXPORT_SYMBOL(_write_unlock_bh); +EXPORT_SYMBOL(__write_unlock_bh); -int __lockfunc _spin_trylock_bh(spinlock_t *lock) +int __lockfunc __spin_trylock_bh(raw_spinlock_t *lock) { local_bh_disable(); preempt_disable(); @@ -431,18 +498,29 @@ int __lockfunc _spin_trylock_bh(spinlock return 1; } - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); return 0; } -EXPORT_SYMBOL(_spin_trylock_bh); +EXPORT_SYMBOL(__spin_trylock_bh); -int in_lock_functions(unsigned long addr) +notrace int in_lock_functions(unsigned long addr) { /* Linker adds these: start and end of __lockfunc functions */ extern char __lock_text_start[], __lock_text_end[]; return addr >= (unsigned long)__lock_text_start - && addr < (unsigned long)__lock_text_end; + && addr < (unsigned long)__lock_text_end; } EXPORT_SYMBOL(in_lock_functions); + +void notrace __debug_atomic_dec_and_test(atomic_t *v) +{ + static int warn_once = 1; + + if (!atomic_read(v) && warn_once) { + warn_once = 0; + printk("BUG: atomic counter underflow!\n"); + WARN_ON(1); + } +} Index: linux-2.6.24-rc8-rt1/include/linux/preempt.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/preempt.h 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/preempt.h 2008-01-16 22:28:34.000000000 -0500 @@ -9,23 +9,39 @@ #include #include #include +#include -#ifdef CONFIG_DEBUG_PREEMPT - extern void fastcall add_preempt_count(int val); - extern void fastcall sub_preempt_count(int val); +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING) + extern void notrace add_preempt_count(unsigned int val); + extern void notrace sub_preempt_count(unsigned int val); + extern void notrace mask_preempt_count(unsigned int mask); + extern void notrace unmask_preempt_count(unsigned int mask); #else # define add_preempt_count(val) do { preempt_count() += (val); } while (0) # define sub_preempt_count(val) do { preempt_count() -= (val); } while (0) +# define mask_preempt_count(mask) \ + do { preempt_count() |= (mask); } while (0) +# define unmask_preempt_count(mask) \ + do { preempt_count() &= ~(mask); } while (0) +#endif + +#ifdef CONFIG_CRITICAL_TIMING + extern void touch_critical_timing(void); + extern void stop_critical_timing(void); +#else +# define touch_critical_timing() do { } while (0) +# define stop_critical_timing() do { } while (0) #endif #define inc_preempt_count() add_preempt_count(1) #define dec_preempt_count() sub_preempt_count(1) -#define preempt_count() (current_thread_info()->preempt_count) +#define preempt_count() (current_thread_info()->preempt_count) #ifdef CONFIG_PREEMPT asmlinkage void preempt_schedule(void); +asmlinkage void preempt_schedule_irq(void); #define preempt_disable() \ do { \ @@ -33,21 +49,34 @@ do { \ barrier(); \ } while (0) -#define preempt_enable_no_resched() \ +#define __preempt_enable_no_resched() \ do { \ barrier(); \ dec_preempt_count(); \ } while (0) + +#ifdef CONFIG_DEBUG_PREEMPT +extern void notrace preempt_enable_no_resched(void); +#else +# define preempt_enable_no_resched() __preempt_enable_no_resched() +#endif + #define preempt_check_resched() \ do { \ if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ preempt_schedule(); \ } while (0) +#define preempt_check_resched_delayed() \ +do { \ + if (unlikely(test_thread_flag(TIF_NEED_RESCHED_DELAYED))) \ + preempt_schedule(); \ +} while (0) + #define preempt_enable() \ do { \ - preempt_enable_no_resched(); \ + __preempt_enable_no_resched(); \ barrier(); \ preempt_check_resched(); \ } while (0) @@ -56,8 +85,12 @@ do { \ #define preempt_disable() do { } while (0) #define preempt_enable_no_resched() do { } while (0) +#define __preempt_enable_no_resched() do { } while (0) #define preempt_enable() do { } while (0) #define preempt_check_resched() do { } while (0) +#define preempt_check_resched_delayed() do { } while (0) + +#define preempt_schedule_irq() do { } while (0) #endif Index: linux-2.6.24-rc8-rt1/arch/x86/mm/fault_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/mm/fault_32.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/mm/fault_32.c 2008-01-16 22:29:04.000000000 -0500 @@ -293,8 +293,8 @@ int show_unhandled_signals = 1; * bit 3 == 1 means use of reserved bit detected * bit 4 == 1 means fault was an instruction fetch */ -fastcall void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) +notrace fastcall void __kprobes do_page_fault(struct pt_regs *regs, + unsigned long error_code) { struct task_struct *tsk; struct mm_struct *mm; @@ -310,6 +310,7 @@ fastcall void __kprobes do_page_fault(st /* get the address */ address = read_cr2(); + trace_special(regs->eip, error_code, address); tsk = current; @@ -354,7 +355,7 @@ fastcall void __kprobes do_page_fault(st * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto bad_area_nosemaphore; /* When running in the kernel we expect faults to occur only to @@ -498,6 +499,9 @@ bad_area_nosemaphore: nr = (address - idt_descr.address) >> 3; if (nr == 6) { + stop_trace(); + user_trace_stop(); + zap_rt_locks(); do_invalid_op(regs, 0); return; } Index: linux-2.6.24-rc8-rt1/arch/x86/mm/init_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/mm/init_32.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/mm/init_32.c 2008-01-16 22:28:34.000000000 -0500 @@ -47,7 +47,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; static int noinline do_test_wp_bit(void); @@ -200,7 +200,7 @@ static inline int page_kills_ppro(unsign return 0; } -int page_is_ram(unsigned long pagenr) +notrace int page_is_ram(unsigned long pagenr) { int i; unsigned long addr, end; Index: linux-2.6.24-rc8-rt1/kernel/irq/handle.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/irq/handle.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/irq/handle.c 2008-01-16 22:28:52.000000000 -0500 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -53,12 +54,13 @@ struct irq_desc irq_desc[NR_IRQS] __cach .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), + .lock = RAW_SPIN_LOCK_UNLOCKED(irq_desc), #ifdef CONFIG_SMP .affinity = CPU_MASK_ALL #endif } }; +EXPORT_SYMBOL_GPL(irq_desc); /* * What should we do if we get a hw irq event on an illegal vector? @@ -131,26 +133,88 @@ irqreturn_t handle_IRQ_event(unsigned in irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; +#ifdef __i386__ + if (debug_direct_keyboard && irq == 1) + lockdep_off(); +#endif + handle_dynamic_tick(action); - if (!(action->flags & IRQF_DISABLED)) - local_irq_enable_in_hardirq(); + /* + * Unconditionally enable interrupts for threaded + * IRQ handlers: + */ + if (!hardirq_count() || !(action->flags & IRQF_DISABLED)) + local_irq_enable(); do { + unsigned int preempt_count = preempt_count(); + ret = action->handler(irq, action->dev_id); + if (preempt_count() != preempt_count) { + stop_trace(); + print_symbol("BUG: unbalanced irq-handler preempt count in %s!\n", (unsigned long) action->handler); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + dump_stack(); + preempt_count() = preempt_count; + } if (ret == IRQ_HANDLED) status |= action->flags; retval |= ret; action = action->next; } while (action); - if (status & IRQF_SAMPLE_RANDOM) + if (status & IRQF_SAMPLE_RANDOM) { + local_irq_enable(); add_interrupt_randomness(irq); + } local_irq_disable(); +#ifdef __i386__ + if (debug_direct_keyboard && irq == 1) + lockdep_on(); +#endif return retval; } +/* + * Hack - used for development only. + */ +int __read_mostly debug_direct_keyboard = 0; + +int __init debug_direct_keyboard_setup(char *str) +{ + debug_direct_keyboard = 1; + printk(KERN_INFO "Switching IRQ 1 (keyboard) to to direct!\n"); +#ifdef CONFIG_PREEMPT_RT + printk(KERN_INFO "WARNING: kernel may easily crash this way!\n"); +#endif + return 1; +} + +__setup("debug_direct_keyboard", debug_direct_keyboard_setup); + +int redirect_hardirq(struct irq_desc *desc) +{ + /* + * Direct execution: + */ + if (!hardirq_preemption || (desc->status & IRQ_NODELAY) || + !desc->thread) + return 0; + +#ifdef __i386__ + if (debug_direct_keyboard && (desc - irq_desc == 1)) + return 0; +#endif + + BUG_ON(!irqs_disabled()); + if (desc->thread && desc->thread->state != TASK_RUNNING) + wake_up_process(desc->thread); + + return 1; +} + #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ /** * __do_IRQ - original all in one highlevel IRQ handler @@ -163,7 +227,7 @@ irqreturn_t handle_IRQ_event(unsigned in * This is the original x86 implementation which is used for every * interrupt type. */ -fastcall unsigned int __do_IRQ(unsigned int irq) +notrace fastcall unsigned int __do_IRQ(unsigned int irq) { struct irq_desc *desc = irq_desc + irq; struct irqaction *action; @@ -186,6 +250,13 @@ fastcall unsigned int __do_IRQ(unsigned desc->chip->end(irq); return 1; } + /* + * If the task is currently running in user mode, don't + * detect soft lockups. If CONFIG_DETECT_SOFTLOCKUP is not + * configured, this should be optimized out. + */ + if (user_mode(get_irq_regs())) + touch_softlockup_watchdog(); spin_lock(&desc->lock); if (desc->chip->ack) Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/irq_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/irq_32.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/irq_32.c 2008-01-16 22:28:57.000000000 -0500 @@ -66,8 +66,8 @@ static union irq_ctx *softirq_ctx[NR_CPU * SMP cross-CPU interrupts have their own specific * handlers). */ -fastcall unsigned int do_IRQ(struct pt_regs *regs) -{ +notrace fastcall unsigned int do_IRQ(struct pt_regs *regs) +{ struct pt_regs *old_regs; /* high bit used in ret_from_ code */ int irq = ~regs->orig_eax; @@ -77,6 +77,10 @@ fastcall unsigned int do_IRQ(struct pt_r u32 *isp; #endif +#ifdef CONFIG_X86_LOCAL_APIC + irq_show_regs_callback(smp_processor_id(), regs); +#endif + if (unlikely((unsigned)irq >= NR_IRQS)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", __FUNCTION__, irq); @@ -85,6 +89,11 @@ fastcall unsigned int do_IRQ(struct pt_r old_regs = set_irq_regs(regs); irq_enter(); +#ifdef CONFIG_EVENT_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif + trace_special(regs->eip, irq, 0); #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { @@ -93,7 +102,7 @@ fastcall unsigned int do_IRQ(struct pt_r __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE - 1)); if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { - printk("do_IRQ: stack overflow: %ld\n", + printk("BUG: do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); dump_stack(); } Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/smp_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/smp_32.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/smp_32.c 2008-01-16 22:29:12.000000000 -0500 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -247,7 +248,7 @@ void send_IPI_mask_sequence(cpumask_t ma static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); +static DEFINE_RAW_SPINLOCK(tlbstate_lock); /* * We cannot call mmdrop() because we are in interrupt context, @@ -476,10 +477,28 @@ static void native_smp_send_reschedule(i } /* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +void smp_send_reschedule_allbutself_cpumask(cpumask_t mask) +{ + cpu_clear(smp_processor_id(), mask); + cpus_and(mask, mask, cpu_online_map); + if (!cpus_empty(mask)) + send_IPI_mask(mask, RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); @@ -634,11 +653,10 @@ static void native_smp_send_stop(void) } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. Trigger a reschedule pass so that + * RT-overload balancing can pass tasks around. */ -fastcall void smp_reschedule_interrupt(struct pt_regs *regs) +notrace fastcall void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); __get_cpu_var(irq_stat).irq_resched_count++; Index: linux-2.6.24-rc8-rt1/kernel/rcupdate.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/rcupdate.c 2008-01-16 22:27:51.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/rcupdate.c 2008-01-16 22:29:20.000000000 -0500 @@ -15,7 +15,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * Copyright (C) IBM Corporation, 2001 + * Copyright IBM Corporation, 2001 * * Authors: Dipankar Sarma * Manfred Spraul @@ -35,14 +35,12 @@ #include #include #include -#include #include #include #include #include #include #include -#include #include #include #include @@ -56,144 +54,66 @@ struct lockdep_map rcu_lock_map = EXPORT_SYMBOL_GPL(rcu_lock_map); #endif -/* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_ctrlblk = { - .cur = -300, - .completed = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, -}; -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .cur = -300, - .completed = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, +struct rcu_synchronize { + struct rcu_head head; + struct completion completion; }; -DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; -DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; - -/* Fake initialization required by compiler */ -static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; -static int blimit = 10; -static int qhimark = 10000; -static int qlowmark = 100; - +DEFINE_PER_CPU(int, rcu_data_passed_quiesc); +static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; -#ifdef CONFIG_SMP -static void force_quiescent_state(struct rcu_data *rdp, - struct rcu_ctrlblk *rcp) -{ - int cpu; - cpumask_t cpumask; - set_need_resched(); - if (unlikely(!rcp->signaled)) { - rcp->signaled = 1; - /* - * Don't send IPI to itself. With irqs disabled, - * rdp->cpu is the current cpu. - */ - cpumask = rcp->cpumask; - cpu_clear(rdp->cpu, cpumask); - for_each_cpu_mask(cpu, cpumask) - smp_send_reschedule(cpu); - } -} -#else -static inline void force_quiescent_state(struct rcu_data *rdp, - struct rcu_ctrlblk *rcp) +/* Because of FASTCALL declaration of complete, we use this wrapper */ +static void wakeme_after_rcu(struct rcu_head *head) { - set_need_resched(); + struct rcu_synchronize *rcu; + + rcu = container_of(head, struct rcu_synchronize, head); + complete(&rcu->completion); } -#endif /** - * call_rcu - Queue an RCU callback for invocation after a grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period + * synchronize_rcu - wait until a grace period has elapsed. * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU * read-side critical sections have completed. RCU read-side critical * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ -void fastcall call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) +void synchronize_rcu(void) { - unsigned long flags; - struct rcu_data *rdp; - - head->func = func; - head->next = NULL; - local_irq_save(flags); - rdp = &__get_cpu_var(rcu_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } - local_irq_restore(flags); -} + struct rcu_synchronize rcu; -/** - * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period - * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_bh() assumes - * that the read-side critical sections end on completion of a softirq - * handler. This means that read-side critical sections in process - * context must not be interrupted by softirqs. This interface is to be - * used when most of the read-side critical sections are in softirq context. - * RCU read-side critical sections are delimited by rcu_read_lock() and - * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() - * and rcu_read_unlock_bh(), if in process context. These may be nested. - */ -void fastcall call_rcu_bh(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - struct rcu_data *rdp; + init_completion(&rcu.completion); + /* Will wake me after RCU finished */ + call_rcu(&rcu.head, wakeme_after_rcu); - head->func = func; - head->next = NULL; - local_irq_save(flags); - rdp = &__get_cpu_var(rcu_bh_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_bh_ctrlblk); - } + rcu_boost_readers(); - local_irq_restore(flags); + /* Wait for it */ + wait_for_completion(&rcu.completion); + rcu_unboost_readers(); } +EXPORT_SYMBOL_GPL(synchronize_rcu); -/* - * Return the number of RCU batches processed thus far. Useful - * for debug and statistics. - */ -long rcu_batches_completed(void) -{ - return rcu_ctrlblk.completed; -} +#ifdef CONFIG_PREEMPT_RCU /* - * Return the number of RCU batches processed thus far. Useful - * for debug and statistics. + * Map synchronize_sched() to the classic RCU implementation. */ -long rcu_batches_completed_bh(void) +void __synchronize_sched(void) { - return rcu_bh_ctrlblk.completed; + struct rcu_synchronize rcu; + + init_completion(&rcu.completion); + call_rcu_classic(&rcu.head, wakeme_after_rcu); + wait_for_completion(&rcu.completion); } +EXPORT_SYMBOL_GPL(__synchronize_sched); +#endif /* #ifdef CONFIG_PREEMPT_RCU */ static void rcu_barrier_callback(struct rcu_head *notused) { @@ -207,10 +127,8 @@ static void rcu_barrier_callback(struct static void rcu_barrier_func(void *notused) { int cpu = smp_processor_id(); - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_head *head; + struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); - head = &rdp->barrier; atomic_inc(&rcu_barrier_cpu_count); call_rcu(head, rcu_barrier_callback); } @@ -225,420 +143,18 @@ void rcu_barrier(void) mutex_lock(&rcu_barrier_mutex); init_completion(&rcu_barrier_completion); atomic_set(&rcu_barrier_cpu_count, 0); + /* + * The queueing of callbacks in all CPUs must be atomic with + * respect to RCU, otherwise one CPU may queue a callback, + * wait for a grace period, decrement barrier count and call + * complete(), while other CPUs have not yet queued anything. + * So, we need to make sure that grace periods cannot complete + * until all the callbacks are queued. + */ + rcu_read_lock(); on_each_cpu(rcu_barrier_func, NULL, 0, 1); + rcu_read_unlock(); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); } EXPORT_SYMBOL_GPL(rcu_barrier); - -/* - * Invoke the completed RCU callbacks. They are expected to be in - * a per-cpu list. - */ -static void rcu_do_batch(struct rcu_data *rdp) -{ - struct rcu_head *next, *list; - int count = 0; - - list = rdp->donelist; - while (list) { - next = list->next; - prefetch(next); - list->func(list); - list = next; - if (++count >= rdp->blimit) - break; - } - rdp->donelist = list; - - local_irq_disable(); - rdp->qlen -= count; - local_irq_enable(); - if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) - rdp->blimit = blimit; - - if (!rdp->donelist) - rdp->donetail = &rdp->donelist; - else - tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu)); -} - -/* - * Grace period handling: - * The grace period handling consists out of two steps: - * - A new grace period is started. - * This is done by rcu_start_batch. The start is not broadcasted to - * all cpus, they must pick this up by comparing rcp->cur with - * rdp->quiescbatch. All cpus are recorded in the - * rcu_ctrlblk.cpumask bitmap. - * - All cpus must go through a quiescent state. - * Since the start of the grace period is not broadcasted, at least two - * calls to rcu_check_quiescent_state are required: - * The first call just notices that a new grace period is running. The - * following calls check if there was a quiescent state since the beginning - * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If - * the bitmap is empty, then the grace period is completed. - * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace - * period (if necessary). - */ -/* - * Register a new batch of callbacks, and start it up if there is currently no - * active batch and the batch to be registered has not already occurred. - * Caller must hold rcu_ctrlblk.lock. - */ -static void rcu_start_batch(struct rcu_ctrlblk *rcp) -{ - if (rcp->next_pending && - rcp->completed == rcp->cur) { - rcp->next_pending = 0; - /* - * next_pending == 0 must be visible in - * __rcu_process_callbacks() before it can see new value of cur. - */ - smp_wmb(); - rcp->cur++; - - /* - * Accessing nohz_cpu_mask before incrementing rcp->cur needs a - * Barrier Otherwise it can cause tickless idle CPUs to be - * included in rcp->cpumask, which will extend graceperiods - * unnecessarily. - */ - smp_mb(); - cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); - - rcp->signaled = 0; - } -} - -/* - * cpu went through a quiescent state since the beginning of the grace period. - * Clear it from the cpu mask and complete the grace period if it was the last - * cpu. Start another grace period if someone has further entries pending - */ -static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) -{ - cpu_clear(cpu, rcp->cpumask); - if (cpus_empty(rcp->cpumask)) { - /* batch completed ! */ - rcp->completed = rcp->cur; - rcu_start_batch(rcp); - } -} - -/* - * Check if the cpu has gone through a quiescent state (say context - * switch). If so and if it already hasn't done so in this RCU - * quiescent cycle, then indicate that it has done so. - */ -static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - if (rdp->quiescbatch != rcp->cur) { - /* start new grace period: */ - rdp->qs_pending = 1; - rdp->passed_quiesc = 0; - rdp->quiescbatch = rcp->cur; - return; - } - - /* Grace period already completed for this cpu? - * qs_pending is checked instead of the actual bitmap to avoid - * cacheline trashing. - */ - if (!rdp->qs_pending) - return; - - /* - * Was there a quiescent state since the beginning of the grace - * period? If no, then exit and wait for the next call. - */ - if (!rdp->passed_quiesc) - return; - rdp->qs_pending = 0; - - spin_lock(&rcp->lock); - /* - * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync - * during cpu startup. Ignore the quiescent state. - */ - if (likely(rdp->quiescbatch == rcp->cur)) - cpu_quiet(rdp->cpu, rcp); - - spin_unlock(&rcp->lock); -} - - -#ifdef CONFIG_HOTPLUG_CPU - -/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing - * locking requirements, the list it's pulling from has to belong to a cpu - * which is dead and hence not processing interrupts. - */ -static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail) -{ - local_irq_disable(); - *this_rdp->nxttail = list; - if (list) - this_rdp->nxttail = tail; - local_irq_enable(); -} - -static void __rcu_offline_cpu(struct rcu_data *this_rdp, - struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - /* if the cpu going offline owns the grace period - * we can block indefinitely waiting for it, so flush - * it here - */ - spin_lock_bh(&rcp->lock); - if (rcp->cur != rcp->completed) - cpu_quiet(rdp->cpu, rcp); - spin_unlock_bh(&rcp->lock); - rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); -} - -static void rcu_offline_cpu(int cpu) -{ - struct rcu_data *this_rdp = &get_cpu_var(rcu_data); - struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); - - __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, - &per_cpu(rcu_data, cpu)); - __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, - &per_cpu(rcu_bh_data, cpu)); - put_cpu_var(rcu_data); - put_cpu_var(rcu_bh_data); - tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu); -} - -#else - -static void rcu_offline_cpu(int cpu) -{ -} - -#endif - -/* - * This does the RCU processing work from tasklet context. - */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { - *rdp->donetail = rdp->curlist; - rdp->donetail = rdp->curtail; - rdp->curlist = NULL; - rdp->curtail = &rdp->curlist; - } - - if (rdp->nxtlist && !rdp->curlist) { - local_irq_disable(); - rdp->curlist = rdp->nxtlist; - rdp->curtail = rdp->nxttail; - rdp->nxtlist = NULL; - rdp->nxttail = &rdp->nxtlist; - local_irq_enable(); - - /* - * start the next batch of callbacks - */ - - /* determine batch number */ - rdp->batch = rcp->cur + 1; - /* see the comment and corresponding wmb() in - * the rcu_start_batch() - */ - smp_rmb(); - - if (!rcp->next_pending) { - /* and start it/schedule start if it's a new batch */ - spin_lock(&rcp->lock); - rcp->next_pending = 1; - rcu_start_batch(rcp); - spin_unlock(&rcp->lock); - } - } - - rcu_check_quiescent_state(rcp, rdp); - if (rdp->donelist) - rcu_do_batch(rdp); -} - -static void rcu_process_callbacks(unsigned long unused) -{ - __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); - __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); -} - -static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - /* This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) - return 1; - - /* This cpu has no pending entries, but there are new entries */ - if (!rdp->curlist && rdp->nxtlist) - return 1; - - /* This cpu has finished callbacks to invoke */ - if (rdp->donelist) - return 1; - - /* The rcu core waits for a quiescent state from the cpu */ - if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) - return 1; - - /* nothing to do */ - return 0; -} - -/* - * Check to see if there is any immediate RCU-related work to be done - * by the current CPU, returning 1 if so. This function is part of the - * RCU implementation; it is -not- an exported member of the RCU API. - */ -int rcu_pending(int cpu) -{ - return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || - __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); -} - -/* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. - */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - - return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); -} - -void rcu_check_callbacks(int cpu, int user) -{ - if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { - rcu_qsctr_inc(cpu); - rcu_bh_qsctr_inc(cpu); - } else if (!in_softirq()) - rcu_bh_qsctr_inc(cpu); - tasklet_schedule(&per_cpu(rcu_tasklet, cpu)); -} - -static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - memset(rdp, 0, sizeof(*rdp)); - rdp->curtail = &rdp->curlist; - rdp->nxttail = &rdp->nxtlist; - rdp->donetail = &rdp->donelist; - rdp->quiescbatch = rcp->completed; - rdp->qs_pending = 0; - rdp->cpu = cpu; - rdp->blimit = blimit; -} - -static void __devinit rcu_online_cpu(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); - - rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); - rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); - tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); -} - -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - rcu_online_cpu(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - rcu_offline_cpu(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata rcu_nb = { - .notifier_call = rcu_cpu_notify, -}; - -/* - * Initializes rcu mechanism. Assumed to be called early. - * That is before local timer(SMP) or jiffie timer (uniproc) is setup. - * Note that rcu_qsctr and friends are implicitly - * initialized due to the choice of ``0'' for RCU_CTR_INVALID. - */ -void __init rcu_init(void) -{ - rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - /* Register notifier for non-boot CPUs */ - register_cpu_notifier(&rcu_nb); -} - -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; -}; - -/* Because of FASTCALL declaration of complete, we use this wrapper */ -static void wakeme_after_rcu(struct rcu_head *head) -{ - struct rcu_synchronize *rcu; - - rcu = container_of(head, struct rcu_synchronize, head); - complete(&rcu->completion); -} - -/** - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - * - * If your read-side code is not protected by rcu_read_lock(), do -not- - * use synchronize_rcu(). - */ -void synchronize_rcu(void) -{ - struct rcu_synchronize rcu; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished */ - call_rcu(&rcu.head, wakeme_after_rcu); - - /* Wait for it */ - wait_for_completion(&rcu.completion); -} - -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); -EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); -EXPORT_SYMBOL_GPL(call_rcu); -EXPORT_SYMBOL_GPL(call_rcu_bh); -EXPORT_SYMBOL_GPL(synchronize_rcu); Index: linux-2.6.24-rc8-rt1/arch/powerpc/xmon/xmon.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/xmon/xmon.c 2008-01-16 22:27:50.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/xmon/xmon.c 2008-01-16 22:28:45.000000000 -0500 @@ -340,6 +340,7 @@ static int xmon_core(struct pt_regs *reg unsigned long timeout; #endif + preempt_disable(); local_irq_save(flags); bp = in_breakpoint_table(regs->nip, &offset); @@ -516,6 +517,7 @@ static int xmon_core(struct pt_regs *reg insert_cpu_bpts(); local_irq_restore(flags); + preempt_enable(); return cmd != 'X' && cmd != EOF; } @@ -2129,7 +2131,7 @@ print_address(unsigned long addr) static unsigned long mdest; /* destination address */ static unsigned long msrc; /* source address */ static unsigned long mval; /* byte value to set memory to */ -static unsigned long mcount; /* # bytes to affect */ +static unsigned long xmon_mcount; /* # bytes to affect */ static unsigned long mdiffs; /* max # differences to print */ void @@ -2141,19 +2143,20 @@ memops(int cmd) scanhex((void *)(cmd == 's'? &mval: &msrc)); if( termch != '\n' ) termch = 0; - scanhex((void *)&mcount); + scanhex((void *)&xmon_mcount); switch( cmd ){ case 'm': - memmove((void *)mdest, (void *)msrc, mcount); + memmove((void *)mdest, (void *)msrc, xmon_mcount); break; case 's': - memset((void *)mdest, mval, mcount); + memset((void *)mdest, mval, xmon_mcount); break; case 'd': if( termch != '\n' ) termch = 0; scanhex((void *)&mdiffs); - memdiffs((unsigned char *)mdest, (unsigned char *)msrc, mcount, mdiffs); + memdiffs((unsigned char *)mdest, (unsigned char *)msrc, + xmon_mcount, mdiffs); break; } } Index: linux-2.6.24-rc8-rt1/arch/powerpc/kernel/entry_64.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/kernel/entry_64.S 2008-01-16 22:27:50.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/kernel/entry_64.S 2008-01-16 22:28:43.000000000 -0500 @@ -470,7 +470,8 @@ _GLOBAL(ret_from_except_lite) #ifdef CONFIG_PREEMPT clrrdi r9,r1,THREAD_SHIFT /* current_thread_info() */ - li r0,_TIF_NEED_RESCHED /* bits to check */ + li r0,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) + /* bits to check */ ld r3,_MSR(r1) ld r4,TI_FLAGS(r9) /* Move MSR_PR bit in r3 to _TIF_SIGPENDING position in r0 */ @@ -592,17 +593,22 @@ do_work: rotldi r10,r10,16 mtmsrd r10,1 ld r4,TI_FLAGS(r9) - andi. r0,r4,_TIF_NEED_RESCHED + andi. r0,r4,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne 1b b restore user_work: #endif + /* here we are preempting the current task */ + li r0,1 + stb r0,PACASOFTIRQEN(r13) + stb r0,PACAHARDIRQEN(r13) + /* Enable interrupts */ ori r10,r10,MSR_EE mtmsrd r10,1 - andi. r0,r4,_TIF_NEED_RESCHED + andi. r0,r4,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq 1f bl .schedule b .ret_from_except_lite @@ -846,3 +852,71 @@ _GLOBAL(enter_prom) ld r0,16(r1) mtlr r0 blr + +#ifdef CONFIG_MCOUNT +/* + * code almost taken from entry_32.S + */ +#define MCOUNT_FRAME_SIZE 32 +_GLOBAL(mcount) + stdu r1,-MCOUNT_FRAME_SIZE(r1) + mflr r3 + + LOAD_REG_ADDR(r5,mcount_enabled) + lwz r5,0(r5) + std r3,MCOUNT_FRAME_SIZE+16(r1) + cmpwi r5,0 + beq 1f + + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + ld r4,MCOUNT_FRAME_SIZE(r1) + ld r4,16(r4) + LOAD_REG_ADDR(r5,mcount_trace_function) + ld r5,0(r5) + ld r5,0(r5) + mtctr r5 + bctrl + nop +1: + ld r0,MCOUNT_FRAME_SIZE+16(r1) + mtlr r0 + addi r1,r1,MCOUNT_FRAME_SIZE + blr + +/* + * Based on glibc-2.4/sysdeps/powerpc/powerpc64/ppc-mcount.S + * + * We don't need to save the parameter-passing registers as gcc takes + * care of that for us. Thus this function looks fairly normal. + * In fact, the generic code would work for us. + */ +_GLOBAL(_mcount) + /* return if we're in real mode. */ + mfmsr r3 + andi. r0,r3,MSR_IR|MSR_DR /* see if relocation is on? */ + beqlr /* if not, do nothing. */ + /* we're in translation mode. keep going. */ + mflr r3 + ld r11,0(r1) /* load back chain ptr */ + stdu r1,-STACK_FRAME_OVERHEAD(r1) + std r3,STACK_FRAME_OVERHEAD+16(r1) + ld r4,16(r11) /* LR in back chain */ + LOAD_REG_ADDR(r5,mcount_enabled) + lwz r5,0(r5) + cmpwi r5,0 /* see if mcount_enabled? */ + beq 1f /* if disabled, then skip */ + + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + LOAD_REG_ADDR(r5,mcount_trace_function) + ld r5,0(r5) + ld r5,0(r5) + mtctr r5 + bctrl + nop +1: + ld r0,STACK_FRAME_OVERHEAD+16(r1) /* restore saved LR */ + mtlr r0 + addi r1,r1,STACK_FRAME_OVERHEAD + blr + +#endif /* CONFIG_MCOUNT */ Index: linux-2.6.24-rc8-rt1/arch/powerpc/kernel/setup_64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/kernel/setup_64.c 2008-01-16 22:27:50.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/kernel/setup_64.c 2008-01-16 22:27:55.000000000 -0500 @@ -607,3 +607,21 @@ struct ppc_pci_io ppc_pci_io; EXPORT_SYMBOL(ppc_pci_io); #endif /* CONFIG_PPC_INDIRECT_IO */ +#ifdef CONFIG_STACKTRACE +#include +void notrace save_stack_trace(struct stack_trace *trace) +{ +} +#endif /* CONFIG_STACKTRACE */ + +#ifdef CONFIG_EARLY_PRINTK +void notrace early_printk(const char *fmt, ...) +{ + BUG(); +} +#endif /* CONFIG_EARLY_PRINTK */ + +#ifdef CONFIG_MCOUNT +extern void _mcount(void); +EXPORT_SYMBOL(_mcount); +#endif /* CONFIG_MCOUNT */ Index: linux-2.6.24-rc8-rt1/arch/powerpc/kernel/irq.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/kernel/irq.c 2008-01-16 22:27:50.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/kernel/irq.c 2008-01-16 22:28:44.000000000 -0500 @@ -94,8 +94,6 @@ extern atomic_t ipi_sent; #endif #ifdef CONFIG_PPC64 -EXPORT_SYMBOL(irq_desc); - int distribute_irqs = 1; static inline unsigned long get_hard_enabled(void) @@ -114,7 +112,7 @@ static inline void set_soft_enabled(unsi : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); } -void local_irq_restore(unsigned long en) +void notrace raw_local_irq_restore(unsigned long en) { /* * get_paca()->soft_enabled = en; @@ -405,7 +403,7 @@ void do_softirq(void) #ifdef CONFIG_PPC_MERGE static LIST_HEAD(irq_hosts); -static DEFINE_SPINLOCK(irq_big_lock); +static DEFINE_RAW_SPINLOCK(irq_big_lock); static DEFINE_PER_CPU(unsigned int, irq_radix_reader); static unsigned int irq_radix_writer; struct irq_map_entry irq_map[NR_IRQS]; Index: linux-2.6.24-rc8-rt1/arch/powerpc/kernel/entry_32.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/kernel/entry_32.S 2008-01-16 22:27:50.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/kernel/entry_32.S 2008-01-16 22:28:27.000000000 -0500 @@ -661,7 +661,7 @@ user_exc_return: /* r10 contains MSR_KE /* Check current_thread_info()->flags */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED) + andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne do_work restore_user: @@ -896,7 +896,7 @@ global_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -910,7 +910,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. r0,r9,_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK beq restore_user @@ -1022,3 +1022,91 @@ machine_check_in_rtas: /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_RTAS */ + +#ifdef CONFIG_MCOUNT +/* + * mcount() is not the same as _mcount(). The callers of mcount() have a + * normal context. The callers of _mcount() do not have a stack frame and + * have not saved the "caller saves" registers. + */ +_GLOBAL(mcount) + stwu r1,-16(r1) + mflr r3 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + stw r3,20(r1) + cmpwi r5,0 + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,16(r1) + lwz r4,4(r4) + LOAD_REG_ADDR(r5,mcount_trace_function) + lwz r5,0(r5) + mtctr r5 + bctrl +1: + lwz r0,20(r1) + mtlr r0 + addi r1,r1,16 + blr + +/* + * The -pg flag, which is specified in the case of CONFIG_MCOUNT, causes the + * C compiler to add a call to _mcount() at the start of each function + * preamble, before the stack frame is created. An example of this preamble + * code is: + * + * mflr r0 + * lis r12,-16354 + * stw r0,4(r1) + * addi r0,r12,-19652 + * bl 0xc00034c8 <_mcount> + * mflr r0 + * stwu r1,-16(r1) + */ +_GLOBAL(_mcount) +#define M_STK_SIZE 48 + /* Would not expect to need to save cr, but glibc version of */ + /* _mcount() does, so cautiously saving it here too. */ + stwu r1,-M_STK_SIZE(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 /* will use as first arg to __trace() */ + mfcr r4 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + cmpwi r5,0 + stw r3, 44(r1) /* lr */ + stw r4, 8(r1) /* cr */ + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,M_STK_SIZE+4(r1) + LOAD_REG_ADDR(r5,mcount_trace_function) + lwz r5,0(r5) + mtctr r5 + bctrl +1: + lwz r8, 8(r1) /* cr */ + lwz r9, 44(r1) /* lr */ + lwz r3, 12(r1) + lwz r4, 16(r1) + lwz r5, 20(r1) + mtcrf 0xff,r8 + mtctr r9 + lwz r0, 52(r1) + lwz r6, 24(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1,r1,M_STK_SIZE + mtlr r0 + bctr + +#endif /* CONFIG_MCOUNT */ Index: linux-2.6.24-rc8-rt1/arch/powerpc/Kconfig =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/Kconfig 2008-01-16 22:27:50.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/Kconfig 2008-01-16 22:28:32.000000000 -0500 @@ -46,13 +46,6 @@ config IRQ_PER_CPU bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config ARCH_HAS_ILOG2_U32 bool default y @@ -79,6 +72,7 @@ config ARCH_NO_VIRT_TO_BUS config PPC bool default y + select HAVE_MCOUNT config EARLY_PRINTK bool @@ -176,6 +170,18 @@ config HIGHMEM source kernel/time/Kconfig source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "fs/Kconfig.binfmt" # We optimistically allocate largepages from the VM, so make the limit Index: linux-2.6.24-rc8-rt1/include/linux/clocksource.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/clocksource.h 2008-01-16 22:27:50.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/clocksource.h 2008-01-16 22:29:07.000000000 -0500 @@ -21,6 +21,18 @@ typedef u64 cycle_t; struct clocksource; +extern unsigned long preempt_max_latency; +extern unsigned long preempt_thresh; +extern unsigned long preempt_mark_thresh; + +struct ctl_table; +struct file; +extern int proc_preempt_max_latency(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos); +extern int proc_preempt_threshold(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos); /** * struct clocksource - hardware abstraction for a free running counter * Provides mostly state-free accessors to the underlying hardware. @@ -50,8 +62,13 @@ struct clocksource; * @flags: flags describing special properties * @vread: vsyscall based read * @resume: resume function for the clocksource, if necessary + * @cycle_last: Used internally by timekeeping core, please ignore. + * @cycle_accumulated: Used internally by timekeeping core, please ignore. + * @cycle_montonic: Used internally by timekeeping core, please ignore. * @cycle_interval: Used internally by timekeeping core, please ignore. * @xtime_interval: Used internally by timekeeping core, please ignore. + * @xtime_nsec: Used internally by timekeeping core, please ignore. + * @error: Used internally by timekeeping core, please ignore. */ struct clocksource { /* @@ -75,14 +92,17 @@ struct clocksource { #endif /* timekeeping specific data, ignore */ - cycle_t cycle_interval; u64 xtime_interval; /* * Second part is written at each timer interrupt * Keep it in a different cache line to dirty no * more than one cache line. */ - cycle_t cycle_last ____cacheline_aligned_in_smp; + struct { + cycle_t cycle_last, cycle_accumulated, cycle_monotonic; + cycle_t cycle_interval; + } ____cacheline_aligned_in_smp; + u64 xtime_nsec; s64 error; @@ -168,18 +188,64 @@ static inline cycle_t clocksource_read(s } /** + * clocksource_get_cycles: - Access the clocksource's accumulated cycle value + * @cs: pointer to clocksource being read + * @now: current cycle value + * + * Uses the clocksource to return the current cycle_t value. + * NOTE!!!: This is different from clocksource_read, because it + * returns the accumulated cycle value! Must hold xtime lock! + */ +static inline cycle_t +clocksource_get_cycles(struct clocksource *cs, cycle_t now) +{ + cycle_t offset = (now - cs->cycle_last) & cs->mask; + offset += cs->cycle_accumulated; + return offset; +} + +/** + * clocksource_accumulate: - Accumulates clocksource cycles + * @cs: pointer to clocksource being read + * @now: current cycle value + * + * Used to avoids clocksource hardware overflow by periodically + * accumulating the current cycle delta. Must hold xtime write lock! + */ +static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now) +{ + cycle_t offset = (now - cs->cycle_last) & cs->mask; + cs->cycle_last = now; + cs->cycle_accumulated += offset; + cs->cycle_monotonic += offset; +} + +/** * cyc2ns - converts clocksource cycles to nanoseconds * @cs: Pointer to clocksource * @cycles: Cycles * * Uses the clocksource and ntp ajdustment to convert cycle_ts to nanoseconds. + * Must hold xtime lock! * * XXX - This could use some mult_lxl_ll() asm optimization */ static inline s64 cyc2ns(struct clocksource *cs, cycle_t cycles) { - u64 ret = (u64)cycles; - ret = (ret * cs->mult) >> cs->shift; + return ((u64)cycles * cs->mult) >> cs->shift; +} + +/** + * ns2cyc - converts nanoseconds to clocksource cycles + * @cs: Pointer to clocksource + * @nsecs: Nanoseconds + */ +static inline cycle_t ns2cyc(struct clocksource *cs, u64 nsecs) +{ + cycle_t ret = nsecs << cs->shift; + + do_div(ret, cs->mult + 1); + return ret; } @@ -190,7 +256,7 @@ static inline s64 cyc2ns(struct clocksou * @length_nsec: Desired interval length in nanoseconds. * * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment - * pair and interval request. + * pair and interval request. Must hold xtime_lock! * * Unless you're the timekeeping code, you should not be using this! */ @@ -218,6 +284,12 @@ extern int clocksource_register(struct c extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_resume(void); +extern cycle_t get_monotonic_cycles(void); +extern unsigned long cycles_to_usecs(cycle_t cycles); +extern cycle_t usecs_to_cycles(unsigned long usecs); + +/* used to initialize clock */ +extern struct clocksource clocksource_jiffies; #ifdef CONFIG_GENERIC_TIME_VSYSCALL extern void update_vsyscall(struct timespec *ts, struct clocksource *c); Index: linux-2.6.24-rc8-rt1/kernel/time/timekeeping.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/time/timekeeping.c 2008-01-16 22:27:50.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/time/timekeeping.c 2008-01-16 22:29:29.000000000 -0500 @@ -24,8 +24,9 @@ * This read-write spinlock protects us from races in SMP while * playing with xtime and avenrun. */ -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); +__cacheline_aligned_in_smp DEFINE_RAW_SEQLOCK(xtime_lock); +EXPORT_SYMBOL_GPL(xtime_lock); /* * The current time @@ -45,6 +46,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOC struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static unsigned long total_sleep_time; /* seconds */ +EXPORT_SYMBOL_GPL(xtime); static struct timespec xtime_cache __attribute__ ((aligned (16))); static inline void update_xtime_cache(u64 nsec) @@ -53,8 +55,13 @@ static inline void update_xtime_cache(u6 timespec_add_ns(&xtime_cache, nsec); } -static struct clocksource *clock; /* pointer to current clocksource */ - +/* + * pointer to current clocksource + * Just in case we use clocksource_read before we initialize + * the actual clock source. Instead of calling a NULL read pointer + * we return jiffies. + */ +static struct clocksource *clock = &clocksource_jiffies; #ifdef CONFIG_GENERIC_TIME /** @@ -66,16 +73,10 @@ static struct clocksource *clock; /* poi */ static inline s64 __get_nsec_offset(void) { - cycle_t cycle_now, cycle_delta; + cycle_t cycle_delta; s64 ns_offset; - /* read clocksource: */ - cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ + cycle_delta = clocksource_get_cycles(clock, clocksource_read(clock)); ns_offset = cyc2ns(clock, cycle_delta); return ns_offset; @@ -104,6 +105,54 @@ static inline void __get_realtime_clock_ timespec_add_ns(ts, nsecs); } +cycle_t notrace get_monotonic_cycles(void) +{ + cycle_t cycle_now, cycle_delta, cycle_monotonic, cycle_last; + + do { + /* + * cycle_monotonic and cycle_last can change on + * another CPU and we need the delta calculation + * of cycle_now and cycle_last happen atomic, as well + * as the adding to cycle_monotonic. We don't need to grab + * any locks, we just keep trying until get all the + * calculations together in one state. + * + * In fact, we __cant__ grab any locks. This + * function is called from the latency_tracer which can + * be called anywhere. To grab any locks (including + * seq_locks) we risk putting ourselves into a deadlock. + */ + cycle_monotonic = clock->cycle_monotonic; + cycle_last = clock->cycle_last; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - cycle_last) & clock->mask; + + } while (cycle_monotonic != clock->cycle_monotonic || + cycle_last != clock->cycle_last); + + return cycle_monotonic + cycle_delta; +} + +unsigned long notrace cycles_to_usecs(cycle_t cycles) +{ + u64 ret = cyc2ns(clock, cycles); + + ret += NSEC_PER_USEC/2; /* For rounding in do_div() */ + do_div(ret, NSEC_PER_USEC); + + return ret; +} + +cycle_t notrace usecs_to_cycles(unsigned long usecs) +{ + return ns2cyc(clock, (u64)usecs * 1000); +} + /** * getnstimeofday - Returns the time of day in a timespec * @ts: pointer to the timespec to be set @@ -195,19 +244,27 @@ static void change_clocksource(void) clock = new; clock->cycle_last = now; - + clock->cycle_accumulated = 0; clock->error = 0; clock->xtime_nsec = 0; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); tick_clock_notify(); +#ifndef CONFIG_PREEMPT_RT printk(KERN_INFO "Time: %s clocksource has been installed.\n", clock->name); +#endif +} + +void timekeeping_accumulate(void) +{ + clocksource_accumulate(clock, clocksource_read(clock)); } #else static inline void change_clocksource(void) { } static inline s64 __get_nsec_offset(void) { return 0; } +void timekeeping_accumulate(void) { } #endif /** @@ -302,6 +359,7 @@ static int timekeeping_resume(struct sys timespec_add_ns(&xtime, timekeeping_suspend_nsecs); /* re-base the last cycle value */ clock->cycle_last = clocksource_read(clock); + clock->cycle_accumulated = 0; clock->error = 0; timekeeping_suspended = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -448,27 +506,28 @@ static void clocksource_adjust(s64 offse */ void update_wall_time(void) { - cycle_t offset; + cycle_t cycle_now; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return; #ifdef CONFIG_GENERIC_TIME - offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; + cycle_now = clocksource_read(clock); #else - offset = clock->cycle_interval; + cycle_now = clock->cycle_last + clock->cycle_interval; #endif + clocksource_accumulate(clock, cycle_now); + clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. */ - while (offset >= clock->cycle_interval) { + while (clock->cycle_accumulated >= clock->cycle_interval) { /* accumulate one interval */ clock->xtime_nsec += clock->xtime_interval; - clock->cycle_last += clock->cycle_interval; - offset -= clock->cycle_interval; + clock->cycle_accumulated -= clock->cycle_interval; if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; @@ -482,13 +541,13 @@ void update_wall_time(void) } /* correct the clock when NTP error is too big */ - clocksource_adjust(offset); + clocksource_adjust(clock->cycle_accumulated); /* store full nanoseconds into xtime */ xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; - update_xtime_cache(cyc2ns(clock, offset)); + update_xtime_cache(cyc2ns(clock, clock->cycle_accumulated)); /* check to see if there is a new clocksource to use */ change_clocksource(); Index: linux-2.6.24-rc8-rt1/include/asm-x86/vgtod.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/vgtod.h 2008-01-16 22:27:50.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/vgtod.h 2008-01-16 22:28:42.000000000 -0500 @@ -5,7 +5,7 @@ #include struct vsyscall_gtod_data { - seqlock_t lock; + raw_seqlock_t lock; /* open coded 'struct timespec' */ time_t wall_time_sec; @@ -15,7 +15,7 @@ struct vsyscall_gtod_data { struct timezone sys_tz; struct { /* extract of a clocksource struct */ cycle_t (*vread)(void); - cycle_t cycle_last; + cycle_t cycle_last, cycle_accumulated; cycle_t mask; u32 mult; u32 shift; Index: linux-2.6.24-rc8-rt1/arch/powerpc/kernel/time.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/kernel/time.c 2008-01-16 22:27:50.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/kernel/time.c 2008-01-16 22:28:20.000000000 -0500 @@ -751,7 +751,7 @@ static cycle_t rtc_read(void) return (cycle_t)get_rtc(); } -static cycle_t timebase_read(void) +static cycle_t notrace timebase_read(void) { return (cycle_t)get_tb(); } @@ -773,7 +773,7 @@ void update_vsyscall(struct timespec *wa stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC; do_div(stamp_xsec, 1000000000); stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC; - update_gtod(clock->cycle_last, stamp_xsec, t2x); + update_gtod(clock->cycle_last-clock->cycle_accumulated, stamp_xsec, t2x); } void update_vsyscall_tz(void) Index: linux-2.6.24-rc8-rt1/kernel/sched.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/sched.c 2008-01-16 22:27:50.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/sched.c 2008-01-16 22:29:28.000000000 -0500 @@ -4,6 +4,7 @@ * Kernel scheduler and related syscalls * * Copyright (C) 1991-2002 Linus Torvalds + * Copyright (C) 2004 Red Hat, Inc., Ingo Molnar * * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and * make semaphores SMP safe @@ -16,12 +17,15 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin + * 2004-10-13 Real-Time Preemption support by Ingo Molnar * 2007-04-15 Work begun on replacing all interactivity tuning with a * fair scheduling design by Con Kolivas. * 2007-05-05 Load balancing (smp-nice) and other improvements * by Peter Williams * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri + * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, + * Thomas Gleixner, Mike Kravetz */ #include @@ -57,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -67,6 +72,8 @@ #include #include +#include "sched_cpupri.h" + /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. @@ -86,6 +93,10 @@ unsigned long long __attribute__((weak)) #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) +#define __PRIO(prio) \ + ((prio) <= 99 ? 199 - (prio) : (prio) - 120) + +#define PRIO(p) __PRIO((p)->prio) /* * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, @@ -104,6 +115,20 @@ unsigned long long __attribute__((weak)) #define NICE_0_LOAD SCHED_LOAD_SCALE #define NICE_0_SHIFT SCHED_LOAD_SHIFT +#if (BITS_PER_LONG < 64) +#define JIFFIES_TO_NS64(TIME) \ + ((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ))) + +#define NS64_TO_JIFFIES(TIME) \ + ((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \ + (1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME))) +#else /* BITS_PER_LONG < 64 */ + +#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME) +#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME) + +#endif /* BITS_PER_LONG < 64 */ + /* * These are the 'tuning knobs' of the scheduler: * @@ -133,6 +158,32 @@ static inline void sg_inc_cpu_power(stru } #endif +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +/* + * Tweaks for current + */ + +#ifdef CURRENT_PTR +struct task_struct * const ___current = &init_task; +struct task_struct ** const current_ptr = (struct task_struct ** const)&___current; +struct thread_info * const current_ti = &init_thread_union.thread_info; +struct thread_info ** const current_ti_ptr = (struct thread_info ** const)¤t_ti; + +EXPORT_SYMBOL(___current); +EXPORT_SYMBOL(current_ti); + +/* + * The scheduler itself doesnt want 'current' to be cached + * during context-switches: + */ +# undef current +# define current __current() +# undef current_thread_info +# define current_thread_info() __current_thread_info() +#endif + static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) @@ -186,12 +237,12 @@ static struct cfs_rq *init_cfs_rq_p[NR_C * Every task in system belong to this group at bootup. */ struct task_group init_task_group = { - .se = init_sched_entity_p, + .se = init_sched_entity_p, .cfs_rq = init_cfs_rq_p, }; #ifdef CONFIG_FAIR_USER_SCHED -# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD +# define INIT_TASK_GRP_LOAD (2*NICE_0_LOAD) #else # define INIT_TASK_GRP_LOAD NICE_0_LOAD #endif @@ -266,8 +317,48 @@ struct rt_rq { struct rt_prio_array active; int rt_load_balance_idx; struct list_head *rt_load_balance_head, *rt_load_balance_curr; + unsigned long rt_nr_running; + unsigned long rt_nr_migratory; + unsigned long rt_nr_uninterruptible; + /* highest queued rt task prio */ + int highest_prio; + int overloaded; }; +#ifdef CONFIG_SMP + +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member cpus from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + */ +struct root_domain { + atomic_t refcount; + cpumask_t span; + cpumask_t online; + + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_t rto_mask; + atomic_t rto_count; +#ifdef CONFIG_SMP + struct cpupri cpupri; +#endif +}; + +/* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). + */ +static struct root_domain def_root_domain; + +#endif + /* * This is the main, per-CPU runqueue data structure. * @@ -277,7 +368,7 @@ struct rt_rq { */ struct rq { /* runqueue lock: */ - spinlock_t lock; + raw_spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because @@ -310,6 +401,8 @@ struct rq { */ unsigned long nr_uninterruptible; + unsigned long switch_timestamp; + unsigned long slice_avg; struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; @@ -325,6 +418,7 @@ struct rq { atomic_t nr_iowait; #ifdef CONFIG_SMP + struct root_domain *rd; struct sched_domain *sd; /* For active balancing */ @@ -358,6 +452,13 @@ struct rq { /* BKL stats */ unsigned int bkl_count; + + /* RT-overload stats: */ + unsigned long rto_schedule; + unsigned long rto_schedule_tail; + unsigned long rto_wakeup; + unsigned long rto_pulled; + unsigned long rto_pushed; #endif struct lock_class_key rq_lock_key; }; @@ -368,6 +469,8 @@ static DEFINE_MUTEX(sched_hotcpu_mutex); static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) { rq->curr->sched_class->check_preempt_curr(rq, p); + if (p != rq->curr && p->prio < rq->curr->prio) + __trace_start_sched_wakeup(p); } static inline int cpu_of(struct rq *rq) @@ -501,11 +604,23 @@ unsigned long long cpu_clock(int cpu) } EXPORT_SYMBOL_GPL(cpu_clock); +/* + * We really dont want to do anything complex within switch_to() + * on PREEMPT_RT - this check enforces this. + */ +#ifdef prepare_arch_switch +# ifdef CONFIG_PREEMPT_RT +# error FIXME +# else +# define _finish_arch_switch finish_arch_switch +# endif +#endif + #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif #ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) +# define _finish_arch_switch(prev) do { } while (0) #endif static inline int task_current(struct rq *rq, struct task_struct *p) @@ -536,7 +651,7 @@ static inline void finish_lock_switch(st */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - spin_unlock_irq(&rq->lock); + spin_unlock(&rq->lock); } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -577,8 +692,8 @@ static inline void finish_lock_switch(st smp_wmb(); prev->oncpu = 0; #endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_disable(); #endif } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -871,6 +986,13 @@ static void cpuacct_charge(struct task_s static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #endif +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static unsigned long cpu_avg_load_per_task(int cpu); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); +#endif /* CONFIG_SMP */ + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -975,6 +1097,8 @@ static inline int normal_prio(struct tas prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); + + trace_special_pid(p->pid, PRIO(p), __PRIO(prio)); return prio; } @@ -998,6 +1122,13 @@ static int effective_prio(struct task_st return p->prio; } +static inline void trace_start_sched_wakeup(struct task_struct *p, + struct rq *rq) +{ + if (TASK_PREEMPTS_CURR(p, rq) && (p != rq->curr)) + __trace_start_sched_wakeup(p); +} + /* * activate_task - move a task to the runqueue. */ @@ -1006,6 +1137,8 @@ static void activate_task(struct rq *rq, if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; + trace_special_pid(p->pid, PRIO(p), rq->nr_running); + enqueue_task(rq, p, wakeup); inc_nr_running(p, rq); } @@ -1019,6 +1152,8 @@ static void deactivate_task(struct rq *r rq->nr_uninterruptible++; dequeue_task(rq, p, sleep); + trace_special_pid(p->pid, PRIO(p), rq->nr_running); + dec_nr_running(p, rq); } @@ -1047,16 +1182,29 @@ static inline void __set_task_cpu(struct * per-task data have been completed by this moment. */ smp_wmb(); + trace_change_sched_cpu(p, cpu); task_thread_info(p)->cpu = cpu; #endif } +static inline void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio, int running) +{ + if (prev_class != p->sched_class) { + if (prev_class->switched_from) + prev_class->switched_from(rq, p, running); + p->sched_class->switched_to(rq, p, running); + } else + p->sched_class->prio_changed(rq, p, oldprio, running); +} + #ifdef CONFIG_SMP /* * Is this task likely cache-hot: */ -static inline int +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { s64 delta; @@ -1281,7 +1429,7 @@ static unsigned long target_load(int cpu /* * Return the average load per task on the cpu's run queue */ -static inline unsigned long cpu_avg_load_per_task(int cpu) +static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); @@ -1438,58 +1586,6 @@ static int sched_balance_self(int cpu, i #endif /* CONFIG_SMP */ -/* - * wake_idle() will wake a task on an idle cpu if task->cpu is - * not idle and an idle cpu is available. The span of cpus to - * search starts with cpus closest then further out as needed, - * so we always favor a closer, idle cpu. - * - * Returns the CPU we should wake onto. - */ -#if defined(ARCH_HAS_SCHED_WAKE_IDLE) -static int wake_idle(int cpu, struct task_struct *p) -{ - cpumask_t tmp; - struct sched_domain *sd; - int i; - - /* - * If it is idle, then it is the best cpu to run this task. - * - * This cpu is also the best, if it has more than one task already. - * Siblings must be also busy(in most cases) as they didn't already - * pickup the extra load from this cpu and hence we need not check - * sibling runqueue info. This will avoid the checks and cache miss - * penalities associated with that. - */ - if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) - return cpu; - - for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_IDLE) { - cpus_and(tmp, sd->span, p->cpus_allowed); - for_each_cpu_mask(i, tmp) { - if (idle_cpu(i)) { - if (i != task_cpu(p)) { - schedstat_inc(p, - se.nr_wakeups_idle); - } - return i; - } - } - } else { - break; - } - } - return cpu; -} -#else -static inline int wake_idle(int cpu, struct task_struct *p) -{ - return cpu; -} -#endif - /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -1504,18 +1600,22 @@ static inline int wake_idle(int cpu, str * * returns failure only if the task is already active. */ -static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int sync, int mutex) { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; long old_state; struct rq *rq; -#ifdef CONFIG_SMP - struct sched_domain *sd, *this_sd = NULL; - unsigned long load, this_load; - int new_cpu; -#endif + trace_special_sym(); +#ifdef CONFIG_PREEMPT_RT + /* + * sync wakeups can increase wakeup latencies: + */ + if (rt_task(p)) + sync = 0; +#endif rq = task_rq_lock(p, &flags); old_state = p->state; if (!(old_state & state)) @@ -1532,92 +1632,9 @@ static int try_to_wake_up(struct task_st if (unlikely(task_running(rq, p))) goto out_activate; - new_cpu = cpu; - - schedstat_inc(rq, ttwu_count); - if (cpu == this_cpu) { - schedstat_inc(rq, ttwu_local); - goto out_set_cpu; - } - - for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_wake_remote); - this_sd = sd; - break; - } - } - - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) - goto out_set_cpu; - - /* - * Check for affine wakeup and passive balancing possibilities. - */ - if (this_sd) { - int idx = this_sd->wake_idx; - unsigned int imbalance; - - imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - - load = source_load(cpu, idx); - this_load = target_load(this_cpu, idx); - - new_cpu = this_cpu; /* Wake to this CPU if we can */ - - if (this_sd->flags & SD_WAKE_AFFINE) { - unsigned long tl = this_load; - unsigned long tl_per_task; - - /* - * Attract cache-cold tasks on sync wakeups: - */ - if (sync && !task_hot(p, rq->clock, this_sd)) - goto out_set_cpu; - - schedstat_inc(p, se.nr_wakeups_affine_attempts); - tl_per_task = cpu_avg_load_per_task(this_cpu); - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) - tl -= current->se.load.weight; - - if ((tl <= load && - tl + target_load(cpu, idx) <= tl_per_task) || - 100*(tl + p->se.load.weight) <= imbalance*load) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. - */ - schedstat_inc(this_sd, ttwu_move_affine); - schedstat_inc(p, se.nr_wakeups_affine); - goto out_set_cpu; - } - } - - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - if (this_sd->flags & SD_WAKE_BALANCE) { - if (imbalance*this_load <= 100*load) { - schedstat_inc(this_sd, ttwu_move_balance); - schedstat_inc(p, se.nr_wakeups_passive); - goto out_set_cpu; - } - } - } - - new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ -out_set_cpu: - new_cpu = wake_idle(new_cpu, p); - if (new_cpu != cpu) { - set_task_cpu(p, new_cpu); + cpu = p->sched_class->select_task_rq(p, sync); + if (cpu != orig_cpu) { + set_task_cpu(p, cpu); task_rq_unlock(rq, &flags); /* might preempt at this point */ rq = task_rq_lock(p, &flags); @@ -1631,6 +1648,21 @@ out_set_cpu: cpu = task_cpu(p); } +#ifdef CONFIG_SCHEDSTATS + schedstat_inc(rq, ttwu_count); + if (cpu == this_cpu) + schedstat_inc(rq, ttwu_local); + else { + struct sched_domain *sd; + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_wake_remote); + break; + } + } + } +#endif + out_activate: #endif /* CONFIG_SMP */ schedstat_inc(p, se.nr_wakeups); @@ -1644,11 +1676,24 @@ out_activate: schedstat_inc(p, se.nr_wakeups_remote); update_rq_clock(rq); activate_task(rq, p, 1); + + trace_start_sched_wakeup(p, rq); + check_preempt_curr(rq, p); + if (rq->curr && p && rq && _need_resched()) + trace_special_pid(p->pid, PRIO(p), PRIO(rq->curr)); + success = 1; out_running: - p->state = TASK_RUNNING; + if (mutex) + p->state = TASK_RUNNING_MUTEX; + else + p->state = TASK_RUNNING; +#ifdef CONFIG_SMP + if (p->sched_class->task_wake_up) + p->sched_class->task_wake_up(rq, p); +#endif out: task_rq_unlock(rq, &flags); @@ -1657,14 +1702,47 @@ out: int fastcall wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | - TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); + int ret; + + ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | + TASK_UNINTERRUPTIBLE, 0, 0); + return ret; } EXPORT_SYMBOL(wake_up_process); +int fastcall wake_up_process_sync(struct task_struct * p) +{ + int ret; + + ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | + TASK_UNINTERRUPTIBLE, 1, 0); + return ret; +} +EXPORT_SYMBOL(wake_up_process_sync); + +int fastcall wake_up_process_mutex(struct task_struct * p) +{ + int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | + TASK_UNINTERRUPTIBLE, 0, 1); + return ret; +} +EXPORT_SYMBOL(wake_up_process_mutex); + +int fastcall wake_up_process_mutex_sync(struct task_struct * p) +{ + int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | + TASK_UNINTERRUPTIBLE, 1, 1); + return ret; +} +EXPORT_SYMBOL(wake_up_process_mutex_sync); + int fastcall wake_up_state(struct task_struct *p, unsigned int state) { - return try_to_wake_up(p, state, 0); + return try_to_wake_up(p, state | TASK_RUNNING_MUTEX, 0, 0); } /* @@ -1771,6 +1849,10 @@ void fastcall wake_up_new_task(struct ta inc_nr_running(p, rq); } check_preempt_curr(rq, p); +#ifdef CONFIG_SMP + if (p->sched_class->task_wake_up) + p->sched_class->task_wake_up(rq, p); +#endif task_rq_unlock(rq, &flags); } @@ -1803,8 +1885,17 @@ static void fire_sched_in_preempt_notifi struct preempt_notifier *notifier; struct hlist_node *node; + if (hlist_empty(&curr->preempt_notifiers)) + return; + + /* + * The KVM sched in notifier expects to be called with + * interrupts enabled. + */ + local_irq_enable(); hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) notifier->ops->sched_in(notifier, raw_smp_processor_id()); + local_irq_disable(); } static void @@ -1832,6 +1923,26 @@ fire_sched_out_preempt_notifiers(struct #endif +#ifdef CONFIG_DEBUG_PREEMPT +void notrace preempt_enable_no_resched(void) +{ + static int once = 1; + + barrier(); + dec_preempt_count(); + + if (once && !preempt_count()) { + once = 0; + printk(KERN_ERR "BUG: %s:%d task might have lost a preemption check!\n", + current->comm, current->pid); + dump_stack(); + } +} + +EXPORT_SYMBOL(preempt_enable_no_resched); +#endif + + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -1889,11 +2000,21 @@ static void finish_task_switch(struct rq * Manfred Spraul */ prev_state = prev->state; - finish_arch_switch(prev); + _finish_arch_switch(prev); finish_lock_switch(rq, prev); +#ifdef CONFIG_SMP + if (current->sched_class->post_schedule) + current->sched_class->post_schedule(rq); +#endif + fire_sched_in_preempt_notifiers(current); + trace_stop_sched_switched(current); + /* + * Delay the final freeing of the mm or task, so that we dont have + * to do complex work from within the scheduler: + */ if (mm) - mmdrop(mm); + mmdrop_delayed(mm); if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this @@ -1911,12 +2032,15 @@ static void finish_task_switch(struct rq asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq = this_rq(); - - finish_task_switch(rq, prev); + preempt_disable(); // TODO: move this to fork setup + finish_task_switch(this_rq(), prev); + __preempt_enable_no_resched(); + local_irq_enable(); #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); +#else + preempt_check_resched(); #endif if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); @@ -1963,10 +2087,18 @@ context_switch(struct rq *rq, struct tas spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #endif + trace_cmdline(); + +#ifdef CURRENT_PTR + barrier(); + *current_ptr = next; + *current_ti_ptr = next->thread_info; +#endif /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); barrier(); + trace_special_pid(prev->pid, PRIO(prev), PRIO(current)); /* * this_rq must be evaluated again because prev may have moved * CPUs since it called schedule(), thus the 'rq' on its stack @@ -2009,6 +2141,11 @@ unsigned long nr_uninterruptible(void) return sum; } +unsigned long nr_uninterruptible_cpu(int cpu) +{ + return cpu_rq(cpu)->nr_uninterruptible; +} + unsigned long long nr_context_switches(void) { int i; @@ -2027,6 +2164,13 @@ unsigned long nr_iowait(void) for_each_possible_cpu(i) sum += atomic_read(&cpu_rq(i)->nr_iowait); + /* + * Since we read the counters lockless, it might be slightly + * inaccurate. Do not allow it to go below zero though: + */ + if (unlikely((long)sum < 0)) + sum = 0; + return sum; } @@ -2124,11 +2268,13 @@ static void double_rq_unlock(struct rq * /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. */ -static void double_lock_balance(struct rq *this_rq, struct rq *busiest) +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) __releases(this_rq->lock) __acquires(busiest->lock) __acquires(this_rq->lock) { + int ret = 0; + if (unlikely(!irqs_disabled())) { /* printk() doesn't work good under rq->lock */ spin_unlock(&this_rq->lock); @@ -2139,9 +2285,11 @@ static void double_lock_balance(struct r spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); spin_lock(&this_rq->lock); + ret = 1; } else spin_lock(&busiest->lock); } + return ret; } /* @@ -3213,7 +3361,7 @@ out: */ static void run_rebalance_domains(struct softirq_action *h) { - int this_cpu = smp_processor_id(); + int this_cpu = raw_smp_processor_id(); struct rq *this_rq = cpu_rq(this_cpu); enum cpu_idle_type idle = this_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE; @@ -3365,7 +3513,9 @@ void account_user_time(struct task_struc /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) + if (rt_task(p)) + cpustat->user_rt = cputime64_add(cpustat->user_rt, tmp); + else if (TASK_NICE(p) > 0) cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); @@ -3420,10 +3570,12 @@ void account_system_time(struct task_str /* Add system time to cpustat. */ tmp = cputime_to_cputime64(cputime); - if (hardirq_count() - hardirq_offset) + if (hardirq_count() - hardirq_offset || (p->flags & PF_HARDIRQ)) cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) + else if (softirq_count() || (p->flags & PF_SOFTIRQ)) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + else if (rt_task(p)) + cpustat->system_rt = cputime64_add(cpustat->system_rt, tmp); else if (p != rq->idle) cpustat->system = cputime64_add(cpustat->system, tmp); else if (atomic_read(&rq->nr_iowait) > 0) @@ -3480,6 +3632,8 @@ void scheduler_tick(void) struct task_struct *curr = rq->curr; u64 next_tick = rq->tick_timestamp + TICK_NSEC; + BUG_ON(!irqs_disabled()); + spin_lock(&rq->lock); __update_rq_clock(rq); /* @@ -3499,44 +3653,6 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) - -void fastcall add_preempt_count(int val) -{ - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) - return; - preempt_count() += val; - /* - * Spinlock count overflowing soon? - */ - DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= - PREEMPT_MASK - 10); -} -EXPORT_SYMBOL(add_preempt_count); - -void fastcall sub_preempt_count(int val) -{ - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) - return; - /* - * Is the spinlock portion underflowing? - */ - if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && - !(preempt_count() & PREEMPT_MASK))) - return; - - preempt_count() -= val; -} -EXPORT_SYMBOL(sub_preempt_count); - -#endif - /* * Print scheduling while atomic bug: */ @@ -3544,8 +3660,10 @@ static noinline void __schedule_bug(stru { struct pt_regs *regs = get_irq_regs(); - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); + stop_trace(); + + printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d, CPU#%d\n", + prev->comm, preempt_count(), prev->pid, smp_processor_id()); debug_show_held_locks(prev); if (irqs_disabled()) @@ -3562,6 +3680,8 @@ static noinline void __schedule_bug(stru */ static inline void schedule_debug(struct task_struct *prev) { + WARN_ON(system_state == SYSTEM_BOOTING); + /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. @@ -3572,6 +3692,8 @@ static inline void schedule_debug(struct profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + trace_special_sym(); + schedstat_inc(this_rq(), sched_count); #ifdef CONFIG_SCHEDSTATS if (unlikely(prev->lock_depth >= 0)) { @@ -3616,14 +3738,15 @@ pick_next_task(struct rq *rq, struct tas /* * schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void) { struct task_struct *prev, *next; long *switch_count; struct rq *rq; int cpu; -need_resched: + rcu_preempt_boost(); + preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3632,7 +3755,6 @@ need_resched: switch_count = &prev->nivcsw; release_kernel_lock(prev); -need_resched_nonpreemptible: schedule_debug(prev); @@ -3642,18 +3764,30 @@ need_resched_nonpreemptible: local_irq_disable(); __update_rq_clock(rq); spin_lock(&rq->lock); + cpu = smp_processor_id(); clear_tsk_need_resched(prev); + clear_tsk_need_resched_delayed(prev); - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + if ((prev->state & ~TASK_RUNNING_MUTEX) && + !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && unlikely(signal_pending(prev)))) { prev->state = TASK_RUNNING; } else { + touch_softlockup_watchdog(); deactivate_task(rq, prev, 1); } switch_count = &prev->nvcsw; } + if (preempt_count() & PREEMPT_ACTIVE) + sub_preempt_count(PREEMPT_ACTIVE); + +#ifdef CONFIG_SMP + if (prev->sched_class->pre_schedule) + prev->sched_class->pre_schedule(rq, prev); +#endif + if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); @@ -3668,21 +3802,92 @@ need_resched_nonpreemptible: ++*switch_count; context_switch(rq, prev, next); /* unlocks the rq */ - } else - spin_unlock_irq(&rq->lock); + __preempt_enable_no_resched(); + } else { + __preempt_enable_no_resched(); + spin_unlock(&rq->lock); + trace_stop_sched_switched(next); + } - if (unlikely(reacquire_kernel_lock(current) < 0)) { - cpu = smp_processor_id(); - rq = cpu_rq(cpu); - goto need_resched_nonpreemptible; + reacquire_kernel_lock(current); + if (!irqs_disabled()) { + static int once = 1; + if (once) { + once = 0; + print_irqtrace_events(current); + WARN_ON(1); + } } - preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; +} + +/* + * schedule() is the main scheduler function. + */ +asmlinkage void __sched schedule(void) +{ + WARN_ON(system_state == SYSTEM_BOOTING); + /* + * Test if we have interrupts disabled. + */ + if (unlikely(irqs_disabled())) { + stop_trace(); + printk(KERN_ERR "BUG: scheduling with irqs disabled: " + "%s/0x%08x/%d\n", current->comm, preempt_count(), + current->pid); + print_symbol("caller is %s\n", + (long)__builtin_return_address(0)); + dump_stack(); + } + + if (unlikely(current->flags & PF_NOSCHED)) { + current->flags &= ~PF_NOSCHED; + printk(KERN_ERR "%s:%d userspace BUG: scheduling in " + "user-atomic context!\n", current->comm, current->pid); + dump_stack(); + send_sig(SIGUSR2, current, 1); + } + + local_irq_disable(); + + do { + __schedule(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED) || + test_thread_flag(TIF_NEED_RESCHED_DELAYED))); + + local_irq_enable(); } EXPORT_SYMBOL(schedule); #ifdef CONFIG_PREEMPT + +/* + * Global flag to turn preemption off on a CONFIG_PREEMPT kernel: + */ +int kernel_preemption = 1; + +static int __init preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) { + if (kernel_preemption) { + printk(KERN_INFO "turning off kernel preemption!\n"); + kernel_preemption = 0; + } + return 1; + } + if (!strncmp(str, "on", 2)) { + if (!kernel_preemption) { + printk(KERN_INFO "turning on kernel preemption!\n"); + kernel_preemption = 1; + } + return 1; + } + get_option(&str, &kernel_preemption); + + return 1; +} + +__setup("preempt=", preempt_setup); + /* * this is the entry point to schedule() from in-kernel preemption * off of preempt_enable. Kernel preemptions off return from interrupt @@ -3695,6 +3900,8 @@ asmlinkage void __sched preempt_schedule struct task_struct *task = current; int saved_lock_depth; #endif + if (!kernel_preemption) + return; /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. @@ -3703,6 +3910,7 @@ asmlinkage void __sched preempt_schedule return; do { + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); /* @@ -3714,11 +3922,11 @@ asmlinkage void __sched preempt_schedule saved_lock_depth = task->lock_depth; task->lock_depth = -1; #endif - schedule(); + __schedule(); #ifdef CONFIG_PREEMPT_BKL task->lock_depth = saved_lock_depth; #endif - sub_preempt_count(PREEMPT_ACTIVE); + local_irq_enable(); /* * Check again in case we missed a preemption opportunity @@ -3730,10 +3938,10 @@ asmlinkage void __sched preempt_schedule EXPORT_SYMBOL(preempt_schedule); /* - * this is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. + * this is is the entry point for the IRQ return path. Called with + * interrupts disabled. To avoid infinite irq-entry recursion problems + * with fast-paced IRQ sources we do all of this carefully to never + * enable interrupts again. */ asmlinkage void __sched preempt_schedule_irq(void) { @@ -3742,10 +3950,18 @@ asmlinkage void __sched preempt_schedule struct task_struct *task = current; int saved_lock_depth; #endif - /* Catch callers which need to be fixed */ - BUG_ON(ti->preempt_count || !irqs_disabled()); + + if (!kernel_preemption) + return; + /* + * If there is a non-zero preempt_count then just return. + * (interrupts are disabled) + */ + if (unlikely(ti->preempt_count)) + return; do { + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); /* @@ -3757,13 +3973,12 @@ asmlinkage void __sched preempt_schedule saved_lock_depth = task->lock_depth; task->lock_depth = -1; #endif - local_irq_enable(); - schedule(); + __schedule(); + local_irq_disable(); #ifdef CONFIG_PREEMPT_BKL task->lock_depth = saved_lock_depth; #endif - sub_preempt_count(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -3778,7 +3993,8 @@ asmlinkage void __sched preempt_schedule int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) { - return try_to_wake_up(curr->private, mode, sync); + return try_to_wake_up(curr->private, mode | TASK_RUNNING_MUTEX, + sync, 0); } EXPORT_SYMBOL(default_wake_function); @@ -3818,8 +4034,9 @@ void fastcall __wake_up(wait_queue_head_ unsigned long flags; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); + __wake_up_common(q, mode, nr_exclusive, 1, key); spin_unlock_irqrestore(&q->lock, flags); + preempt_check_resched_delayed(); } EXPORT_SYMBOL(__wake_up); @@ -3869,8 +4086,9 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done++; __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 1, 0, NULL); + 1, 1, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); + preempt_check_resched_delayed(); } EXPORT_SYMBOL(complete); @@ -3881,11 +4099,18 @@ void complete_all(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 0, 0, NULL); + 0, 1, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); + preempt_check_resched_delayed(); } EXPORT_SYMBOL(complete_all); +unsigned int fastcall completion_done(struct completion *x) +{ + return x->done; +} +EXPORT_SYMBOL(completion_done); + static inline long __sched do_wait_for_common(struct completion *x, long timeout, int state) { @@ -4002,10 +4227,8 @@ long __sched sleep_on_timeout(wait_queue } EXPORT_SYMBOL(sleep_on_timeout); -#ifdef CONFIG_RT_MUTEXES - /* - * rt_mutex_setprio - set the current priority of a task + * task_setprio - set the current priority of a task * @p: task * @prio: prio value (kernel-internal form) * @@ -4014,15 +4237,35 @@ EXPORT_SYMBOL(sleep_on_timeout); * * Used by the rt_mutex code to implement priority inheritance logic. */ -void rt_mutex_setprio(struct task_struct *p, int prio) +void task_setprio(struct task_struct *p, int prio) { unsigned long flags; - int oldprio, on_rq, running; + int oldprio, prev_resched, on_rq, running; struct rq *rq; + const struct sched_class *prev_class = p->sched_class; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); + + /* + * Idle task boosting is a nono in general. There is one + * exception, when NOHZ is active: + * + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. + */ + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + update_rq_clock(rq); oldprio = p->prio; @@ -4041,27 +4284,23 @@ void rt_mutex_setprio(struct task_struct p->prio = prio; + trace_special_pid(p->pid, __PRIO(oldprio), PRIO(p)); + prev_resched = _need_resched(); + if (on_rq) { if (running) p->sched_class->set_curr_task(rq); + enqueue_task(rq, p, 0); - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (running) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else { - check_preempt_curr(rq, p); - } + + check_class_changed(rq, p, prev_class, oldprio, running); } + trace_special(prev_resched, _need_resched(), 0); + +out_unlock: task_rq_unlock(rq, &flags); } -#endif - void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, on_rq; @@ -4258,6 +4497,7 @@ int sched_setscheduler(struct task_struc { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; + const struct sched_class *prev_class = p->sched_class; struct rq *rq; /* may grab non-irq protected spin_locks */ @@ -4351,18 +4591,10 @@ recheck: if (on_rq) { if (running) p->sched_class->set_curr_task(rq); + activate_task(rq, p, 0); - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (running) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else { - check_preempt_curr(rq, p); - } + + check_class_changed(rq, p, prev_class, oldprio, running); } __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); @@ -4656,19 +4888,19 @@ asmlinkage long sys_sched_yield(void) * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(rq->lock); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - _raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); + spin_unlock_no_resched(&rq->lock); - schedule(); + __schedule(); + + local_irq_enable(); + preempt_check_resched(); return 0; } static void __cond_resched(void) { -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) __might_sleep(__FILE__, __LINE__); #endif /* @@ -4677,10 +4909,11 @@ static void __cond_resched(void) * cond_resched() call. */ do { + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + __schedule(); } while (need_resched()); + local_irq_enable(); } int __sched cond_resched(void) @@ -4702,32 +4935,53 @@ EXPORT_SYMBOL(cond_resched); * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ -int cond_resched_lock(spinlock_t *lock) +int __cond_resched_raw_spinlock(raw_spinlock_t *lock) { int ret = 0; - if (need_lockbreak(lock)) { + if (need_lockbreak_raw(lock)) { spin_unlock(lock); cpu_relax(); ret = 1; spin_lock(lock); } if (need_resched() && system_state == SYSTEM_RUNNING) { - spin_release(&lock->dep_map, 1, _THIS_IP_); - _raw_spin_unlock(lock); - preempt_enable_no_resched(); + spin_unlock_no_resched(lock); __cond_resched(); ret = 1; spin_lock(lock); } return ret; } -EXPORT_SYMBOL(cond_resched_lock); +EXPORT_SYMBOL(__cond_resched_raw_spinlock); -int __sched cond_resched_softirq(void) +#ifdef CONFIG_PREEMPT_RT + +int __cond_resched_spinlock(spinlock_t *lock) { - BUG_ON(!in_softirq()); +#if (defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)) || defined(CONFIG_PREEMPT_RT) + if (lock->break_lock) { + lock->break_lock = 0; + spin_unlock_no_resched(lock); + __cond_resched(); + spin_lock(lock); + return 1; + } +#endif + return 0; +} +EXPORT_SYMBOL(__cond_resched_spinlock); + +#endif +/* + * Voluntarily preempt a process context that has softirqs disabled: + */ +int __sched cond_resched_softirq(void) +{ +#ifndef CONFIG_PREEMPT_SOFTIRQS + WARN_ON_ONCE(!in_softirq()); +#endif if (need_resched() && system_state == SYSTEM_RUNNING) { local_bh_enable(); __cond_resched(); @@ -4738,17 +4992,102 @@ int __sched cond_resched_softirq(void) } EXPORT_SYMBOL(cond_resched_softirq); +/* + * Voluntarily preempt a softirq context (possible with softirq threading): + */ +int __sched cond_resched_softirq_context(void) +{ + WARN_ON_ONCE(!in_softirq()); + + if (softirq_need_resched() && system_state == SYSTEM_RUNNING) { + raw_local_irq_disable(); + _local_bh_enable(); + raw_local_irq_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(cond_resched_softirq_context); + +/* + * Preempt a hardirq context if necessary (possible with hardirq threading): + */ +int cond_resched_hardirq_context(void) +{ + WARN_ON_ONCE(!in_irq()); + WARN_ON_ONCE(!irqs_disabled()); + + if (hardirq_need_resched()) { +#ifndef CONFIG_PREEMPT_RT + irq_exit(); +#endif + local_irq_enable(); + __cond_resched(); +#ifndef CONFIG_PREEMPT_RT + local_irq_disable(); + __irq_enter(); +#endif + + return 1; + } + return 0; +} +EXPORT_SYMBOL(cond_resched_hardirq_context); + +#ifdef CONFIG_PREEMPT_VOLUNTARY + +int voluntary_preemption = 1; + +EXPORT_SYMBOL(voluntary_preemption); + +static int __init voluntary_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + voluntary_preemption = 0; + else + get_option(&str, &voluntary_preemption); + if (!voluntary_preemption) + printk("turning off voluntary preemption!\n"); + + return 1; +} + +__setup("voluntary-preempt=", voluntary_preempt_setup); + +#endif + /** * yield - yield the current processor to other threads. * * This is a shortcut for kernel-space yielding - it marks the * thread runnable and calls sys_sched_yield(). */ -void __sched yield(void) +void __sched __yield(void) { set_current_state(TASK_RUNNING); sys_sched_yield(); } + +void __sched yield(void) +{ + static int once = 1; + + /* + * it's a bug to rely on yield() with RT priorities. We print + * the first occurance after bootup ... this will still give + * us an idea about the scope of the problem, without spamming + * the syslog: + */ + if (once && rt_task(current)) { + once = 0; + printk(KERN_ERR "BUG: %s:%d RT task yield()-ing!\n", + current->comm, current->pid); + dump_stack(); + } + __yield(); +} EXPORT_SYMBOL(yield); /* @@ -4888,7 +5227,7 @@ out_unlock: return retval; } -static const char stat_nam[] = "RSDTtZX"; +static const char stat_nam[] = "RMSDTtZX"; static void show_task(struct task_struct *p) { @@ -4896,19 +5235,23 @@ static void show_task(struct task_struct unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; - printk(KERN_INFO "%-13.13s %c", p->comm, - state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); + printk("%-13.13s %c [%p]", p->comm, + state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?', p); #if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) + if (0 && (state == TASK_RUNNING)) printk(KERN_CONT " running "); else printk(KERN_CONT " %08lx ", thread_saved_pc(p)); #else - if (state == TASK_RUNNING) + if (0 && (state == TASK_RUNNING)) printk(KERN_CONT " running task "); else printk(KERN_CONT " %016lx ", thread_saved_pc(p)); #endif + if (task_curr(p)) + printk("[curr] "); + else if (p->se.on_rq) + printk("[on rq #%d] ", task_cpu(p)); #ifdef CONFIG_DEBUG_STACK_USAGE { unsigned long *n = end_of_stack(p); @@ -4927,6 +5270,7 @@ static void show_task(struct task_struct void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; + int do_unlock = 1; #if BITS_PER_LONG == 32 printk(KERN_INFO @@ -4935,7 +5279,16 @@ void show_state_filter(unsigned long sta printk(KERN_INFO " task PC stack pid father\n"); #endif +#ifdef CONFIG_PREEMPT_RT + if (!read_trylock(&tasklist_lock)) { + printk("hm, tasklist_lock write-locked.\n"); + printk("ignoring ...\n"); + do_unlock = 0; + } +#else read_lock(&tasklist_lock); +#endif + do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow @@ -4951,7 +5304,8 @@ void show_state_filter(unsigned long sta #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); #endif - read_unlock(&tasklist_lock); + if (do_unlock) + read_unlock(&tasklist_lock); /* * Only show locks if all tasks are dumped: */ @@ -4992,7 +5346,9 @@ void __cpuinit init_idle(struct task_str spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ -#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) +#if defined(CONFIG_PREEMPT) && \ + !defined(CONFIG_PREEMPT_BKL) && \ + !defined(CONFIG_PREEMPT_RT) task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); #else task_thread_info(idle)->preempt_count = 0; @@ -5077,7 +5433,13 @@ int set_cpus_allowed(struct task_struct goto out; } - p->cpus_allowed = new_mask; + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, &new_mask); + else { + p->cpus_allowed = new_mask; + p->nr_cpus_allowed = cpus_weight(new_mask); + } + /* Can the task run on the task's current CPU? If so, we're done */ if (cpu_isset(task_cpu(p), new_mask)) goto out; @@ -5111,11 +5473,18 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; + unsigned long flags; int ret = 0, on_rq; if (unlikely(cpu_is_offline(dest_cpu))) return ret; + /* + * PREEMPT_RT: this relies on write_lock_irq(&tasklist_lock) + * disabling interrupts - which on PREEMPT_RT does not do: + */ + local_irq_save(flags); + rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); @@ -5139,6 +5508,8 @@ static int __migrate_task(struct task_st ret = 1; out: double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); + return ret; } @@ -5590,6 +5961,15 @@ migration_call(struct notifier_block *nf case CPU_ONLINE_FROZEN: /* Strictly unnecessary, as first user will wake it. */ wake_up_process(cpu_rq(cpu)->migration_thread); + + /* Update our root-domain */ + rq = cpu_rq(cpu); + spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpu_isset(cpu, rq->rd->span)); + cpu_set(cpu, rq->rd->online); + } + spin_unlock_irqrestore(&rq->lock, flags); break; #ifdef CONFIG_HOTPLUG_CPU @@ -5640,6 +6020,17 @@ migration_call(struct notifier_block *nf } spin_unlock_irq(&rq->lock); break; + + case CPU_DOWN_PREPARE: + /* Update our root-domain */ + rq = cpu_rq(cpu); + spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpu_isset(cpu, rq->rd->span)); + cpu_clear(cpu, rq->rd->online); + } + spin_unlock_irqrestore(&rq->lock, flags); + break; #endif case CPU_LOCK_RELEASE: mutex_unlock(&sched_hotcpu_mutex); @@ -5831,11 +6222,79 @@ sd_parent_degenerate(struct sched_domain return 1; } +static void rq_attach_root(struct rq *rq, struct root_domain *rd) +{ + unsigned long flags; + const struct sched_class *class; + + spin_lock_irqsave(&rq->lock, flags); + + if (rq->rd) { + struct root_domain *old_rd = rq->rd; + + for (class = sched_class_highest; class; class = class->next) { + if (class->leave_domain) + class->leave_domain(rq); + } + + cpu_clear(rq->cpu, old_rd->span); + cpu_clear(rq->cpu, old_rd->online); + + if (atomic_dec_and_test(&old_rd->refcount)) + kfree(old_rd); + } + + atomic_inc(&rd->refcount); + rq->rd = rd; + + cpu_set(rq->cpu, rd->span); + if (cpu_isset(rq->cpu, cpu_online_map)) + cpu_set(rq->cpu, rd->online); + + for (class = sched_class_highest; class; class = class->next) { + if (class->join_domain) + class->join_domain(rq); + } + + spin_unlock_irqrestore(&rq->lock, flags); +} + +static void init_rootdomain(struct root_domain *rd) +{ + memset(rd, 0, sizeof(*rd)); + + cpus_clear(rd->span); + cpus_clear(rd->online); + + cpupri_init(&rd->cpupri); + +} + +static void init_defrootdomain(void) +{ + init_rootdomain(&def_root_domain); + atomic_set(&def_root_domain.refcount, 1); +} + +static struct root_domain *alloc_rootdomain(void) +{ + struct root_domain *rd; + + rd = kmalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return NULL; + + init_rootdomain(rd); + + return rd; +} + /* - * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */ -static void cpu_attach_domain(struct sched_domain *sd, int cpu) +static void +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) { struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; @@ -5860,6 +6319,7 @@ static void cpu_attach_domain(struct sch sched_domain_debug(sd, cpu); + rq_attach_root(rq, rd); rcu_assign_pointer(rq->sd, sd); } @@ -6228,6 +6688,7 @@ static void init_sched_groups_power(int static int build_sched_domains(const cpumask_t *cpu_map) { int i; + struct root_domain *rd; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; @@ -6244,6 +6705,12 @@ static int build_sched_domains(const cpu sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif + rd = alloc_rootdomain(); + if (!rd) { + printk(KERN_WARNING "Cannot alloc root domain\n"); + return -ENOMEM; + } + /* * Set up domains for cpus specified by the cpu_map. */ @@ -6460,7 +6927,7 @@ static int build_sched_domains(const cpu #else sd = &per_cpu(phys_domains, i); #endif - cpu_attach_domain(sd, i); + cpu_attach_domain(sd, rd, i); } return 0; @@ -6518,8 +6985,7 @@ static void detach_destroy_domains(const unregister_sched_domain_sysctl(); for_each_cpu_mask(i, *cpu_map) - cpu_attach_domain(NULL, i); - synchronize_sched(); + cpu_attach_domain(NULL, &def_root_domain, i); arch_destroy_sched_domains(cpu_map); } @@ -6751,6 +7217,10 @@ void __init sched_init(void) int highest_cpu = 0; int i, j; +#ifdef CONFIG_SMP + init_defrootdomain(); +#endif + for_each_possible_cpu(i) { struct rt_prio_array *array; struct rq *rq; @@ -6790,12 +7260,16 @@ void __init sched_init(void) rq->cpu_load[j] = 0; #ifdef CONFIG_SMP rq->sd = NULL; + rq->rd = NULL; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); + rq->rt.highest_prio = MAX_RT_PRIO; + rq->rt.overloaded = 0; + rq_attach_root(rq, &def_root_domain); #endif atomic_set(&rq->nr_iowait, 0); @@ -6830,6 +7304,9 @@ void __init sched_init(void) atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current); +#ifdef CONFIG_PREEMPT_RT + printk("Real-Time Preemption Support (C) 2004-2007 Ingo Molnar\n"); +#endif /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, @@ -6843,7 +7320,7 @@ void __init sched_init(void) current->sched_class = &fair_sched_class; } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) void __might_sleep(char *file, int line) { #ifdef in_atomic @@ -6851,13 +7328,17 @@ void __might_sleep(char *file, int line) if ((in_atomic() || irqs_disabled()) && system_state == SYSTEM_RUNNING && !oops_in_progress) { + if (debug_direct_keyboard && hardirq_count()) + return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; + stop_trace(); printk(KERN_ERR "BUG: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); + " context %s(%d) at %s:%d\n", + current->comm, current->pid, file, line); + printk("in_atomic():%d [%08x], irqs_disabled():%d\n", + in_atomic(), preempt_count(), irqs_disabled()); debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); Index: linux-2.6.24-rc8-rt1/kernel/sched_rt.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/sched_rt.c 2008-01-16 22:27:50.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/sched_rt.c 2008-01-16 22:29:30.000000000 -0500 @@ -3,6 +3,51 @@ * policies) */ +#ifdef CONFIG_SMP + +static inline int rt_overloaded(struct rq *rq) +{ + return atomic_read(&rq->rd->rto_count); +} + +static inline void rt_set_overload(struct rq *rq) +{ + cpu_set(rq->cpu, rq->rd->rto_mask); + /* + * Make sure the mask is visible before we set + * the overload count. That is checked to determine + * if we should look at the mask. It would be a shame + * if we looked at the mask, but the mask was not + * updated yet. + */ + wmb(); + atomic_inc(&rq->rd->rto_count); +} + +static inline void rt_clear_overload(struct rq *rq) +{ + /* the order here really doesn't matter */ + atomic_dec(&rq->rd->rto_count); + cpu_clear(rq->cpu, rq->rd->rto_mask); +} + +static void update_rt_migration(struct rq *rq) +{ + if (unlikely(num_online_cpus() == 1)) + return; + + if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { + if (!rq->rt.overloaded) { + rt_set_overload(rq); + rq->rt.overloaded = 1; + } + } else if (rq->rt.overloaded) { + rt_clear_overload(rq); + rq->rt.overloaded = 0; + } +} +#endif /* CONFIG_SMP */ + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -26,12 +71,114 @@ static void update_curr_rt(struct rq *rq cpuacct_charge(curr, delta_exec); } +static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq) +{ + WARN_ON(!rt_task(p)); + rq->rt.rt_nr_running++; +#ifdef CONFIG_SMP + if (p->prio < rq->rt.highest_prio) { + rq->rt.highest_prio = p->prio; + cpupri_set(&rq->rd->cpupri, rq->cpu, p->prio); + } + if (p->nr_cpus_allowed > 1) + rq->rt.rt_nr_migratory++; + + update_rt_migration(rq); +#endif /* CONFIG_SMP */ +} + +static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq) +{ +#ifdef CONFIG_SMP + int highest_prio = rq->rt.highest_prio; +#endif + WARN_ON(!rt_task(p)); + WARN_ON(!rq->rt.rt_nr_running); + rq->rt.rt_nr_running--; +#ifdef CONFIG_SMP + if (rq->rt.rt_nr_running) { + struct rt_prio_array *array; + + WARN_ON(p->prio < rq->rt.highest_prio); + if (p->prio == rq->rt.highest_prio) { + /* recalculate */ + array = &rq->rt.active; + rq->rt.highest_prio = + sched_find_first_bit(array->bitmap); + } /* otherwise leave rq->highest prio alone */ + } else + rq->rt.highest_prio = MAX_RT_PRIO; + if (p->nr_cpus_allowed > 1) { + BUG_ON(!rq->rt.rt_nr_migratory); + rq->rt.rt_nr_migratory--; + } + + if (rq->rt.highest_prio != highest_prio) + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); + + update_rt_migration(rq); +#endif /* CONFIG_SMP */ +} + +static inline void incr_rt_nr_uninterruptible(struct task_struct *p, + struct rq *rq) +{ + rq->rt.rt_nr_uninterruptible++; +} + +static inline void decr_rt_nr_uninterruptible(struct task_struct *p, + struct rq *rq) +{ + rq->rt.rt_nr_uninterruptible--; +} + +unsigned long rt_nr_running(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->rt.rt_nr_running; + + return sum; +} + +unsigned long rt_nr_running_cpu(int cpu) +{ + return cpu_rq(cpu)->rt.rt_nr_running; +} + +unsigned long rt_nr_uninterruptible(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->rt.rt_nr_uninterruptible; + + /* + * Since we read the counters lockless, it might be slightly + * inaccurate. Do not allow it to go below zero though: + */ + if (unlikely((long)sum < 0)) + sum = 0; + + return sum; +} + +unsigned long rt_nr_uninterruptible_cpu(int cpu) +{ + return cpu_rq(cpu)->rt.rt_nr_uninterruptible; +} + static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) { struct rt_prio_array *array = &rq->rt.active; list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); + inc_rt_tasks(p, rq); + + if (p->state == TASK_UNINTERRUPTIBLE) + decr_rt_nr_uninterruptible(p, rq); } /* @@ -43,9 +190,13 @@ static void dequeue_task_rt(struct rq *r update_curr_rt(rq); + if (p->state == TASK_UNINTERRUPTIBLE) + incr_rt_nr_uninterruptible(p, rq); + list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); + dec_rt_tasks(p, rq); } /* @@ -65,6 +216,45 @@ yield_task_rt(struct rq *rq) requeue_task_rt(rq, rq->curr); } +#ifdef CONFIG_SMP +static int find_lowest_rq(struct task_struct *task); + +static int select_task_rq_rt(struct task_struct *p, int sync) +{ + struct rq *rq = task_rq(p); + + /* + * If the current task is an RT task, then + * try to see if we can wake this RT task up on another + * runqueue. Otherwise simply start this RT task + * on its current runqueue. + * + * We want to avoid overloading runqueues. Even if + * the RT task is of higher priority than the current RT task. + * RT tasks behave differently than other tasks. If + * one gets preempted, we try to push it off to another queue. + * So trying to keep a preempting RT task on the same + * cache hot CPU will force the running RT task to + * a cold CPU. So we waste all the cache for the lower + * RT task in hopes of saving some of a RT task + * that is just being woken and probably will have + * cold cache anyway. + */ + if (unlikely(rt_task(rq->curr)) && + (p->nr_cpus_allowed > 1)) { + int cpu = find_lowest_rq(p); + + return (cpu == -1) ? task_cpu(p) : cpu; + } + + /* + * Otherwise, just let it ride on the affined RQ and the + * post-schedule router will push the preempted task away + */ + return task_cpu(p); +} +#endif /* CONFIG_SMP */ + /* * Preempt the current task with a newly woken task if needed: */ @@ -100,76 +290,428 @@ static void put_prev_task_rt(struct rq * } #ifdef CONFIG_SMP -/* - * Load-balancing iterator. Note: while the runqueue stays locked - * during the whole iteration, the current task might be - * dequeued so the iterator has to be dequeue-safe. Here we - * achieve that by always pre-iterating before returning - * the current task: - */ -static struct task_struct *load_balance_start_rt(void *arg) +/* Only try algorithms three times */ +#define RT_MAX_TRIES 3 + +static int double_lock_balance(struct rq *this_rq, struct rq *busiest); +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); + +static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_running(rq, p) && + (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && + (p->nr_cpus_allowed > 1)) + return 1; + return 0; +} + +/* Return the second highest RT task, NULL otherwise */ +static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) { - struct rq *rq = arg; struct rt_prio_array *array = &rq->rt.active; - struct list_head *head, *curr; - struct task_struct *p; + struct task_struct *next; + struct list_head *queue; int idx; + if (likely(rq->rt.rt_nr_running < 2)) + return NULL; + idx = sched_find_first_bit(array->bitmap); - if (idx >= MAX_RT_PRIO) + if (unlikely(idx >= MAX_RT_PRIO)) { + WARN_ON(1); /* rt_nr_running is bad */ return NULL; + } + + queue = array->queue + idx; + BUG_ON(list_empty(queue)); + + next = list_entry(queue->next, struct task_struct, run_list); + if (unlikely(pick_rt_task(rq, next, cpu))) + goto out; + + if (queue->next->next != queue) { + /* same prio task */ + next = list_entry(queue->next->next, struct task_struct, + run_list); + if (pick_rt_task(rq, next, cpu)) + goto out; + } - head = array->queue + idx; - curr = head->prev; + retry: + /* slower, but more flexible */ + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); + if (unlikely(idx >= MAX_RT_PRIO)) + return NULL; - p = list_entry(curr, struct task_struct, run_list); + queue = array->queue + idx; + BUG_ON(list_empty(queue)); - curr = curr->prev; + list_for_each_entry(next, queue, run_list) { + if (pick_rt_task(rq, next, cpu)) + goto out; + } - rq->rt.rt_load_balance_idx = idx; - rq->rt.rt_load_balance_head = head; - rq->rt.rt_load_balance_curr = curr; + goto retry; - return p; + out: + return next; } -static struct task_struct *load_balance_next_rt(void *arg) +static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); + +static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) { - struct rq *rq = arg; - struct rt_prio_array *array = &rq->rt.active; - struct list_head *head, *curr; - struct task_struct *p; - int idx; + int count; - idx = rq->rt.rt_load_balance_idx; - head = rq->rt.rt_load_balance_head; - curr = rq->rt.rt_load_balance_curr; + count = cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask); /* - * If we arrived back to the head again then - * iterate to the next queue (if any): + * cpupri cannot efficiently tell us how many bits are set, so it only + * returns a boolean. However, the caller of this function will + * special case the value "1", so we want to return a positive integer + * other than one if there are bits to look at */ - if (unlikely(head == curr)) { - int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); + return count ? 2 : 0; +} + +static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) +{ + int first; + + /* "this_cpu" is cheaper to preempt than a remote processor */ + if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) + return this_cpu; - if (next_idx >= MAX_RT_PRIO) - return NULL; + first = first_cpu(*mask); + if (first != NR_CPUS) + return first; - idx = next_idx; - head = array->queue + idx; - curr = head->prev; + return -1; +} - rq->rt.rt_load_balance_idx = idx; - rq->rt.rt_load_balance_head = head; +static int find_lowest_rq(struct task_struct *task) +{ + struct sched_domain *sd; + cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); + int this_cpu = smp_processor_id(); + int cpu = task_cpu(task); + int count; + + if (task->nr_cpus_allowed == 1) + return -1; /* No other targets possible */ + + count = find_lowest_cpus(task, lowest_mask); + if (!count) + return -1; /* No targets found */ + + /* + * There is no sense in performing an optimal search if only one + * target is found. + */ + if (count == 1) + return first_cpu(*lowest_mask); + + /* + * At this point we have built a mask of cpus representing the + * lowest priority tasks in the system. Now we want to elect + * the best one based on our affinity and topology. + * + * We prioritize the last cpu that the task executed on since + * it is most likely cache-hot in that location. + */ + if (cpu_isset(cpu, *lowest_mask)) + return cpu; + + /* + * Otherwise, we consult the sched_domains span maps to figure + * out which cpu is logically closest to our hot cache data. + */ + if (this_cpu == cpu) + this_cpu = -1; /* Skip this_cpu opt if the same */ + + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + cpumask_t domain_mask; + int best_cpu; + + cpus_and(domain_mask, sd->span, *lowest_mask); + + best_cpu = pick_optimal_cpu(this_cpu, + &domain_mask); + if (best_cpu != -1) + return best_cpu; + } } - p = list_entry(curr, struct task_struct, run_list); + /* + * And finally, if there were no matches within the domains + * just give the caller *something* to work with from the compatible + * locations. + */ + return pick_optimal_cpu(this_cpu, lowest_mask); +} - curr = curr->prev; +/* Will lock the rq it finds */ +static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) +{ + struct rq *lowest_rq = NULL; + int tries; + int cpu; + + for (tries = 0; tries < RT_MAX_TRIES; tries++) { + cpu = find_lowest_rq(task); + + if ((cpu == -1) || (cpu == rq->cpu)) + break; + + lowest_rq = cpu_rq(cpu); + + /* if the prio of this runqueue changed, try again */ + if (double_lock_balance(rq, lowest_rq)) { + /* + * We had to unlock the run queue. In + * the mean time, task could have + * migrated already or had its affinity changed. + * Also make sure that it wasn't scheduled on its rq. + */ + if (unlikely(task_rq(task) != rq || + !cpu_isset(lowest_rq->cpu, + task->cpus_allowed) || + task_running(rq, task) || + !task->se.on_rq)) { + + spin_unlock(&lowest_rq->lock); + lowest_rq = NULL; + break; + } + } + + /* If this rq is still suitable use it. */ + if (lowest_rq->rt.highest_prio > task->prio) + break; + + /* try again */ + spin_unlock(&lowest_rq->lock); + lowest_rq = NULL; + } - rq->rt.rt_load_balance_curr = curr; + return lowest_rq; +} - return p; +/* + * If the current CPU has more than one RT task, see if the non + * running task can migrate over to a CPU that is running a task + * of lesser priority. + */ +static int push_rt_task(struct rq *rq) +{ + struct task_struct *next_task; + struct rq *lowest_rq; + int ret = 0; + int paranoid = RT_MAX_TRIES; + + if (!rq->rt.overloaded) + return 0; + + next_task = pick_next_highest_task_rt(rq, -1); + if (!next_task) + return 0; + + retry: + if (unlikely(next_task == rq->curr)) { + WARN_ON(1); + return 0; + } + + /* + * It's possible that the next_task slipped in of + * higher priority than current. If that's the case + * just reschedule current. + */ + if (unlikely(next_task->prio < rq->curr->prio)) { + resched_task(rq->curr); + return 0; + } + + /* We might release rq lock */ + get_task_struct(next_task); + + /* find_lock_lowest_rq locks the rq if found */ + lowest_rq = find_lock_lowest_rq(next_task, rq); + if (!lowest_rq) { + struct task_struct *task; + /* + * find lock_lowest_rq releases rq->lock + * so it is possible that next_task has changed. + * If it has, then try again. + */ + task = pick_next_highest_task_rt(rq, -1); + if (unlikely(task != next_task) && task && paranoid--) { + put_task_struct(next_task); + next_task = task; + goto retry; + } + goto out; + } + + deactivate_task(rq, next_task, 0); + set_task_cpu(next_task, lowest_rq->cpu); + activate_task(lowest_rq, next_task, 0); + + resched_task(lowest_rq->curr); + + schedstat_inc(rq, rto_pushed); + + spin_unlock(&lowest_rq->lock); + + ret = 1; +out: + put_task_struct(next_task); + + return ret; +} + +/* + * TODO: Currently we just use the second highest prio task on + * the queue, and stop when it can't migrate (or there's + * no more RT tasks). There may be a case where a lower + * priority RT task has a different affinity than the + * higher RT task. In this case the lower RT task could + * possibly be able to migrate where as the higher priority + * RT task could not. We currently ignore this issue. + * Enhancements are welcome! + */ +static void push_rt_tasks(struct rq *rq) +{ + /* push_rt_task will return true if it moved an RT */ + while (push_rt_task(rq)) + ; +} + +static int pull_rt_task(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu, ret = 0, cpu; + struct task_struct *p, *next; + struct rq *src_rq; + + if (likely(!rt_overloaded(this_rq))) + return 0; + + next = pick_next_task_rt(this_rq); + + for_each_cpu_mask(cpu, this_rq->rd->rto_mask) { + if (this_cpu == cpu) + continue; + + src_rq = cpu_rq(cpu); + /* + * We can potentially drop this_rq's lock in + * double_lock_balance, and another CPU could + * steal our next task - hence we must cause + * the caller to recalculate the next task + * in that case: + */ + if (double_lock_balance(this_rq, src_rq)) { + struct task_struct *old_next = next; + + next = pick_next_task_rt(this_rq); + if (next != old_next) + ret = 1; + } + + /* + * Are there still pullable RT tasks? + */ + if (src_rq->rt.rt_nr_running <= 1) { + spin_unlock(&src_rq->lock); + continue; + } + + p = pick_next_highest_task_rt(src_rq, this_cpu); + + /* + * Do we have an RT task that preempts + * the to-be-scheduled task? + */ + if (p && (!next || (p->prio < next->prio))) { + WARN_ON(p == src_rq->curr); + WARN_ON(!p->se.on_rq); + + /* + * There's a chance that p is higher in priority + * than what's currently running on its cpu. + * This is just that p is wakeing up and hasn't + * had a chance to schedule. We only pull + * p if it is lower in priority than the + * current task on the run queue or + * this_rq next task is lower in prio than + * the current task on that rq. + */ + if (p->prio < src_rq->curr->prio || + (next && next->prio < src_rq->curr->prio)) + goto out; + + ret = 1; + + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + /* + * We continue with the search, just in + * case there's an even higher prio task + * in another runqueue. (low likelyhood + * but possible) + * + * Update next so that we won't pick a task + * on another cpu with a priority lower (or equal) + * than the one we just picked. + */ + next = p; + + schedstat_inc(src_rq, rto_pulled); + } + out: + spin_unlock(&src_rq->lock); + } + + return ret; +} + +static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) +{ + /* Try to pull RT tasks here if we lower this rq's prio */ + if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) { + pull_rt_task(rq); + schedstat_inc(rq, rto_schedule); + } +} + +static void post_schedule_rt(struct rq *rq) +{ + /* + * If we have more than one rt_task queued, then + * see if we can push the other rt_tasks off to other CPUS. + * Note we may release the rq lock, and since + * the lock was owned by prev, we need to release it + * first via finish_lock_switch and then reaquire it here. + */ + if (unlikely(rq->rt.overloaded)) { + spin_lock(&rq->lock); + push_rt_tasks(rq); + schedstat_inc(rq, rto_schedule_tail); + spin_unlock(&rq->lock); + } +} + + +static void task_wake_up_rt(struct rq *rq, struct task_struct *p) +{ + if (!task_running(rq, p) && + (p->prio >= rq->rt.highest_prio) && + rq->rt.overloaded) { + push_rt_tasks(rq); + schedstat_inc(rq, rto_wakeup); + } } static unsigned long @@ -178,33 +720,146 @@ load_balance_rt(struct rq *this_rq, int struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { - struct rq_iterator rt_rq_iterator; - - rt_rq_iterator.start = load_balance_start_rt; - rt_rq_iterator.next = load_balance_next_rt; - /* pass 'busiest' rq argument into - * load_balance_[start|next]_rt iterators - */ - rt_rq_iterator.arg = busiest; - - return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd, - idle, all_pinned, this_best_prio, &rt_rq_iterator); + /* don't touch RT tasks */ + return 0; } static int move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle) { - struct rq_iterator rt_rq_iterator; + /* don't touch RT tasks */ + return 0; +} +static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) +{ + int weight = cpus_weight(*new_mask); + + BUG_ON(!rt_task(p)); + + /* + * Update the migration status of the RQ if we have an RT task + * which is running AND changing its weight value. + */ + if (p->se.on_rq && (weight != p->nr_cpus_allowed)) { + struct rq *rq = task_rq(p); + + if ((p->nr_cpus_allowed <= 1) && (weight > 1)) + rq->rt.rt_nr_migratory++; + else if((p->nr_cpus_allowed > 1) && (weight <= 1)) { + BUG_ON(!rq->rt.rt_nr_migratory); + rq->rt.rt_nr_migratory--; + } + + update_rt_migration(rq); + } + + p->cpus_allowed = *new_mask; + p->nr_cpus_allowed = weight; +} +/* Assumes rq->lock is held */ +static void join_domain_rt(struct rq *rq) +{ + if (rq->rt.overloaded) + rt_set_overload(rq); + + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); +} - rt_rq_iterator.start = load_balance_start_rt; - rt_rq_iterator.next = load_balance_next_rt; - rt_rq_iterator.arg = busiest; +/* Assumes rq->lock is held */ +static void leave_domain_rt(struct rq *rq) +{ + if (rq->rt.overloaded) + rt_clear_overload(rq); - return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, - &rt_rq_iterator); + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); +} + +/* + * When switch from the rt queue, we bring ourselves to a position + * that we might want to pull RT tasks from other runqueues. + */ +static void switched_from_rt(struct rq *rq, struct task_struct *p, + int running) +{ + /* + * If there are other RT tasks then we will reschedule + * and the scheduling of the other RT tasks will handle + * the balancing. But if we are the last RT task + * we may need to handle the pulling of RT tasks + * now. + */ + if (!rq->rt.rt_nr_running) + pull_rt_task(rq); +} +#endif /* CONFIG_SMP */ + +/* + * When switching a task to RT, we may overload the runqueue + * with RT tasks. In this case we try to push them off to + * other runqueues. + */ +static void switched_to_rt(struct rq *rq, struct task_struct *p, + int running) +{ + int check_resched = 1; + + /* + * If we are already running, then there's nothing + * that needs to be done. But if we are not running + * we may need to preempt the current running task. + * If that current running task is also an RT task + * then see if we can move to another run queue. + */ + if (!running) { +#ifdef CONFIG_SMP + if (rq->rt.overloaded && push_rt_task(rq) && + /* Don't resched if we changed runqueues */ + rq != task_rq(p)) + check_resched = 0; +#endif /* CONFIG_SMP */ + if (check_resched && p->prio < rq->curr->prio) + resched_task(rq->curr); + } } -#endif + +/* + * Priority of the task has changed. This may cause + * us to initiate a push or pull. + */ +static void prio_changed_rt(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ + if (running) { +#ifdef CONFIG_SMP + /* + * If our priority decreases while running, we + * may need to pull tasks to this runqueue. + */ + if (oldprio < p->prio) + pull_rt_task(rq); + /* + * If there's a higher priority task waiting to run + * then reschedule. + */ + if (p->prio > rq->rt.highest_prio) + resched_task(p); +#else + /* For UP simply resched on drop of prio */ + if (oldprio < p->prio) + resched_task(p); +#endif /* CONFIG_SMP */ + } else { + /* + * This task is not running, but if it is + * greater than the current running task + * then reschedule. + */ + if (p->prio < rq->curr->prio) + resched_task(rq->curr); + } +} + static void task_tick_rt(struct rq *rq, struct task_struct *p) { @@ -244,6 +899,9 @@ const struct sched_class rt_sched_class .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_rt, +#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_curr_rt, @@ -253,8 +911,18 @@ const struct sched_class rt_sched_class #ifdef CONFIG_SMP .load_balance = load_balance_rt, .move_one_task = move_one_task_rt, + .set_cpus_allowed = set_cpus_allowed_rt, + .join_domain = join_domain_rt, + .leave_domain = leave_domain_rt, + .pre_schedule = pre_schedule_rt, + .post_schedule = post_schedule_rt, + .task_wake_up = task_wake_up_rt, + .switched_from = switched_from_rt, #endif .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, + + .prio_changed = prio_changed_rt, + .switched_to = switched_to_rt, }; Index: linux-2.6.24-rc8-rt1/include/linux/init_task.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/init_task.h 2008-01-16 22:27:49.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/init_task.h 2008-01-16 22:29:20.000000000 -0500 @@ -10,6 +10,7 @@ #include #include #include +#include #define INIT_FDTABLE \ { \ @@ -87,6 +88,17 @@ extern struct nsproxy init_nsproxy; .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(sighand.signalfd_wqh), \ } +#ifdef CONFIG_PREEMPT_RCU_BOOST +#define INIT_RCU_BOOST_PRIO .rcu_prio = MAX_PRIO, +#define INIT_PREEMPT_RCU_BOOST(tsk) \ + .rcub_rbdp = NULL, \ + .rcub_state = RCU_BOOST_IDLE, \ + .rcub_entry = LIST_HEAD_INIT(tsk.rcub_entry), +#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */ +#define INIT_RCU_BOOST_PRIO +#define INIT_PREEMPT_RCU_BOOST(tsk) +#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */ + extern struct group_info init_groups; #define INIT_STRUCT_PID { \ @@ -129,7 +141,9 @@ extern struct group_info init_groups; .static_prio = MAX_PRIO-20, \ .normal_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ + INIT_RCU_BOOST_PRIO \ .cpus_allowed = CPU_MASK_ALL, \ + .nr_cpus_allowed = NR_CPUS, \ .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ @@ -164,7 +178,8 @@ extern struct group_info init_groups; .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ - .pi_lock = __SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ + .posix_timer_list = NULL, \ + .pi_lock = RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .pids = { \ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ @@ -173,6 +188,7 @@ extern struct group_info init_groups; .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ + INIT_PREEMPT_RCU_BOOST(tsk) \ } Index: linux-2.6.24-rc8-rt1/include/linux/sched.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/sched.h 2008-01-16 22:27:49.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/sched.h 2008-01-16 22:29:23.000000000 -0500 @@ -91,6 +91,27 @@ struct sched_param { #include +#ifdef CONFIG_PREEMPT +extern int kernel_preemption; +#else +# define kernel_preemption 0 +#endif +#ifdef CONFIG_PREEMPT_VOLUNTARY +extern int voluntary_preemption; +#else +# define voluntary_preemption 0 +#endif +#ifdef CONFIG_PREEMPT_SOFTIRQS +extern int softirq_preemption; +#else +# define softirq_preemption 0 +#endif +#ifdef CONFIG_PREEMPT_HARDIRQS +extern int hardirq_preemption; +#else +# define hardirq_preemption 0 +#endif + struct exec_domain; struct futex_pi_state; struct bio; @@ -168,21 +189,45 @@ print_cfs_rq(struct seq_file *m, int cpu * mistake. */ #define TASK_RUNNING 0 -#define TASK_INTERRUPTIBLE 1 -#define TASK_UNINTERRUPTIBLE 2 -#define TASK_STOPPED 4 -#define TASK_TRACED 8 +#define TASK_RUNNING_MUTEX 1 +#define TASK_INTERRUPTIBLE 2 +#define TASK_UNINTERRUPTIBLE 4 +#define TASK_STOPPED 8 +#define TASK_TRACED 16 /* in tsk->exit_state */ -#define EXIT_ZOMBIE 16 -#define EXIT_DEAD 32 +#define EXIT_ZOMBIE 32 +#define EXIT_DEAD 64 /* in tsk->state again */ -#define TASK_DEAD 64 +#define TASK_NONINTERACTIVE 128 +#define TASK_DEAD 256 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) #define set_task_state(tsk, state_value) \ set_mb((tsk)->state, (state_value)) +// #define PREEMPT_DIRECT + +#ifdef CONFIG_X86_LOCAL_APIC +extern void nmi_show_all_regs(void); +#else +# define nmi_show_all_regs() do { } while (0) +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct exec_domain; + /* * set_current_state() includes a barrier so that the write of current->state * is correctly serialised wrt the caller's subsequent test of whether to @@ -241,6 +286,7 @@ static inline void show_state(void) } extern void show_regs(struct pt_regs *); +extern void irq_show_regs_callback(int cpu, struct pt_regs *regs); /* * TASK is a pointer to the task whose backtrace we want to see (or NULL for current @@ -258,6 +304,12 @@ extern void account_process_tick(struct extern void update_process_times(int user); extern void scheduler_tick(void); +#ifdef CONFIG_GENERIC_HARDIRQS +extern int debug_direct_keyboard; +#else +# define debug_direct_keyboard 0 +#endif + #ifdef CONFIG_DETECT_SOFTLOCKUP extern void softlockup_tick(void); extern void spawn_softlockup_task(void); @@ -279,6 +331,108 @@ static inline void touch_all_softlockup_ } #endif +#ifdef CONFIG_PREEMPT_BKL +extern struct semaphore kernel_sem; +#endif + +#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_EVENT_TRACE) + extern void print_traces(struct task_struct *task); +#else +# define print_traces(task) do { } while (0) +#endif + +#ifdef CONFIG_FRAME_POINTER +# ifndef CONFIG_ARM +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) +# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) +# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) +# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) +# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) +# else + extern unsigned long arm_return_addr(int level); +# define CALLER_ADDR0 arm_return_addr(0) +# define CALLER_ADDR1 arm_return_addr(1) +# define CALLER_ADDR2 arm_return_addr(2) +# define CALLER_ADDR3 arm_return_addr(3) +# define CALLER_ADDR4 arm_return_addr(4) +# define CALLER_ADDR5 arm_return_addr(5) +#endif +#else +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 0UL +# define CALLER_ADDR2 0UL +# define CALLER_ADDR3 0UL +# define CALLER_ADDR4 0UL +# define CALLER_ADDR5 0UL +#endif + +#ifdef CONFIG_EVENT_TRACE + extern int trace_enabled, trace_user_triggered, + trace_user_trigger_irq, trace_freerunning, trace_verbose, + trace_print_on_crash, trace_all_cpus, print_functions, + syscall_tracing, stackframe_tracing, trace_use_raw_cycles, + trace_all_runnable; + extern void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3); + extern void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2); + extern void notrace trace_special_u64(unsigned long long v1, unsigned long v2); + extern void notrace trace_special_sym(void); + extern void stop_trace(void); +# define start_trace() do { trace_enabled = 1; } while (0) + extern void print_last_trace(void); + extern void nmi_trace(unsigned long eip, unsigned long parent_eip, + unsigned long flags); + extern long user_trace_start(void); + extern long user_trace_stop(void); + extern void trace_cmdline(void); + extern void init_tracer(void); +#else +# define trace_enabled 0 +# define syscall_tracing 0 +# define stackframe_tracing 0 +# define trace_user_triggered 0 +# define trace_freerunning 0 +# define trace_all_cpus 0 +# define trace_verbose 0 +# define trace_special(v1,v2,v3) do { } while (0) +# define trace_special_pid(pid,v1,v2) do { } while (0) +# define trace_special_u64(v1,v2) do { } while (0) +# define trace_special_sym() do { } while (0) +# define stop_trace() do { } while (0) +# define start_trace() do { } while (0) +# define print_last_trace() do { } while (0) +# define nmi_trace(eip, parent_eip, flags) do { } while (0) +# define user_trace_start() do { } while (0) +# define user_trace_stop() do { } while (0) +# define trace_cmdline() do { } while (0) +# define init_tracer() do { } while (0) +#endif + +#ifdef CONFIG_MCOUNT + extern void call_mcount(void); +#else +# define call_mcount() do { } while (0) +#endif + +#ifdef CONFIG_WAKEUP_TIMING + extern int wakeup_timing; + extern void __trace_start_sched_wakeup(struct task_struct *p); + extern void trace_stop_sched_switched(struct task_struct *p); + extern void trace_change_sched_cpu(struct task_struct *p, int new_cpu); +#else +# define wakeup_timing 0 +# define __trace_start_sched_wakeup(p) do { } while (0) +# define trace_stop_sched_switched(p) do { } while (0) +# define trace_change_sched_cpu(p, cpu) do { } while (0) +#endif + +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + extern void notrace time_hardirqs_on(unsigned long a0, unsigned long a1); + extern void notrace time_hardirqs_off(unsigned long a0, unsigned long a1); +#else +# define time_hardirqs_on(a0, a1) do { } while (0) +# define time_hardirqs_off(a0, a1) do { } while (0) +#endif /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) @@ -294,6 +448,11 @@ extern signed long FASTCALL(schedule_tim extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); +/* + * This one can be called with interrupts disabled, only + * to be used by lowlevel arch code! + */ +asmlinkage void __sched __schedule(void); struct nsproxy; struct user_namespace; @@ -522,6 +681,19 @@ struct signal_struct { #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ +#ifdef CONFIG_PREEMPT_RCU_BOOST +#define set_rcu_prio(p, prio) /* cpp to avoid #include hell */ \ + do { \ + (p)->rcu_prio = (prio); \ + } while (0) +#define get_rcu_prio(p) (p)->rcu_prio /* cpp to avoid #include hell */ +#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */ +static inline void set_rcu_prio(struct task_struct *p, int prio) +{ +} +#define get_rcu_prio(p) (MAX_PRIO) /* cpp to use MAX_PRIO before it's defined */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */ + /* * Some day this will be a full-fledged user tracking system.. */ @@ -827,6 +999,7 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); void (*yield_task) (struct rq *rq); + int (*select_task_rq)(struct task_struct *p, int sync); void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); @@ -842,11 +1015,25 @@ struct sched_class { int (*move_one_task) (struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle); + void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); + void (*post_schedule) (struct rq *this_rq); + void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); #endif void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p); void (*task_new) (struct rq *rq, struct task_struct *p); + void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask); + + void (*join_domain)(struct rq *rq); + void (*leave_domain)(struct rq *rq); + + void (*switched_from) (struct rq *this_rq, struct task_struct *task, + int running); + void (*switched_to) (struct rq *this_rq, struct task_struct *task, + int running); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + int oldprio, int running); }; struct load_weight { @@ -930,6 +1117,9 @@ struct task_struct { #endif int prio, static_prio, normal_prio; +#ifdef CONFIG_PREEMPT_RCU_BOOST + int rcu_prio; +#endif struct list_head run_list; const struct sched_class *sched_class; struct sched_entity se; @@ -956,11 +1146,23 @@ struct task_struct { unsigned int policy; cpumask_t cpus_allowed; + int nr_cpus_allowed; unsigned int time_slice; +#ifdef CONFIG_PREEMPT_RCU + int rcu_read_lock_nesting; + int rcu_flipctr_idx; +#endif /* #ifdef CONFIG_PREEMPT_RCU */ + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; #endif +#ifdef CONFIG_PREEMPT_RCU_BOOST + struct rcu_boost_dat *rcub_rbdp; + enum rcu_boost_state rcub_state; + struct list_head rcub_entry; + unsigned long rcu_preempt_counter; +#endif struct list_head tasks; /* @@ -1024,6 +1226,8 @@ struct task_struct { unsigned long long it_sched_expires; struct list_head cpu_timers[3]; + struct task_struct* posix_timer_list; + /* process credentials */ uid_t uid,euid,suid,fsuid; gid_t gid,egid,sgid,fsgid; @@ -1080,7 +1284,7 @@ struct task_struct { spinlock_t alloc_lock; /* Protection of the PI data structures: */ - spinlock_t pi_lock; + raw_spinlock_t pi_lock; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task */ @@ -1093,6 +1297,7 @@ struct task_struct { /* mutex deadlock detection */ struct mutex_waiter *blocked_on; #endif + int pagefault_disabled; #ifdef CONFIG_TRACE_IRQFLAGS unsigned int irq_events; int hardirqs_enabled; @@ -1116,6 +1321,32 @@ struct task_struct { unsigned int lockdep_recursion; #endif +#define MAX_PREEMPT_TRACE 25 + +#ifdef CONFIG_PREEMPT_TRACE + unsigned long preempt_trace_eip[MAX_PREEMPT_TRACE]; + unsigned long preempt_trace_parent_eip[MAX_PREEMPT_TRACE]; +#endif + +#define MAX_LOCK_STACK MAX_PREEMPT_TRACE +#ifdef CONFIG_DEBUG_PREEMPT + int lock_count; +# ifdef CONFIG_PREEMPT_RT + struct rt_mutex *owned_lock[MAX_LOCK_STACK]; +# endif +#endif +#ifdef CONFIG_DETECT_SOFTLOCKUP + unsigned long softlockup_count; /* Count to keep track how long the + * thread is in the kernel without + * sleeping. + */ +#endif + /* realtime bits */ + +#ifdef CONFIG_DEBUG_RT_MUTEXES + void *last_kernel_lock; +#endif + /* journalling filesystem info */ void *journal_info; @@ -1178,26 +1409,22 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; +#ifdef CONFIG_PREEMPT_RT + /* + * Temporary hack, until we find a solution to + * handle printk in atomic operations. + */ + int in_printk; +#endif }; -/* - * Priority of a process goes from 0..MAX_PRIO-1, valid RT - * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH - * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority - * values are inverted: lower p->prio value means higher priority. - * - * The MAX_USER_RT_PRIO value allows the actual maximum - * RT priority to be separate from the value exported to - * user-space. This allows kernel threads to set their - * priority to a value higher than any user task. Note: - * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. - */ - -#define MAX_USER_RT_PRIO 100 -#define MAX_RT_PRIO MAX_USER_RT_PRIO +#ifdef CONFIG_PREEMPT_RT +# define set_printk_might_sleep(x) do { current->in_printk = x; } while(0) +#else +# define set_printk_might_sleep(x) do { } while(0) +#endif -#define MAX_PRIO (MAX_RT_PRIO + 40) -#define DEFAULT_PRIO (MAX_RT_PRIO + 20) +#include static inline int rt_prio(int prio) { @@ -1345,6 +1572,15 @@ extern struct pid *cad_pid; extern void free_task(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +#ifdef CONFIG_PREEMPT_RT +extern void __put_task_struct_cb(struct rcu_head *rhp); + +static inline void put_task_struct(struct task_struct *t) +{ + if (atomic_dec_and_test(&t->usage)) + call_rcu(&t->rcu, __put_task_struct_cb); +} +#else extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) @@ -1352,6 +1588,7 @@ static inline void put_task_struct(struc if (atomic_dec_and_test(&t->usage)) __put_task_struct(t); } +#endif /* * Per process flags @@ -1362,6 +1599,7 @@ static inline void put_task_struct(struc #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_NOSCHED 0x00000020 /* Userspace does not expect scheduling */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ @@ -1369,6 +1607,7 @@ static inline void put_task_struct(struc #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ +#define PF_KMAP 0x00004000 /* this context has a kmap */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ @@ -1380,6 +1619,8 @@ static inline void put_task_struct(struc #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ +#define PF_SOFTIRQ 0x04000000 /* softirq context */ +#define PF_HARDIRQ 0x08000000 /* hardirq context */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ @@ -1466,9 +1707,14 @@ int sched_nr_latency_handler(struct ctl_ extern unsigned int sysctl_sched_compat_yield; +extern void task_setprio(struct task_struct *p, int prio); + #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); -extern void rt_mutex_setprio(struct task_struct *p, int prio); +static inline void rt_mutex_setprio(struct task_struct *p, int prio) +{ + task_setprio(p, prio); +} extern void rt_mutex_adjust_pi(struct task_struct *p); #else static inline int rt_mutex_getprio(struct task_struct *p) @@ -1490,6 +1736,7 @@ extern struct task_struct *curr_task(int extern void set_curr_task(int cpu, struct task_struct *p); void yield(void); +void __yield(void); /* * The default (Linux) execution domain. @@ -1561,6 +1808,9 @@ extern void do_timer(unsigned long ticks extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state)); extern int FASTCALL(wake_up_process(struct task_struct * tsk)); +extern int FASTCALL(wake_up_process_mutex(struct task_struct * tsk)); +extern int FASTCALL(wake_up_process_sync(struct task_struct * tsk)); +extern int FASTCALL(wake_up_process_mutex_sync(struct task_struct * tsk)); extern void FASTCALL(wake_up_new_task(struct task_struct * tsk, unsigned long clone_flags)); #ifdef CONFIG_SMP @@ -1655,12 +1905,20 @@ extern struct mm_struct * mm_alloc(void) /* mmdrop drops the mm and the page tables */ extern void FASTCALL(__mmdrop(struct mm_struct *)); +extern void FASTCALL(__mmdrop_delayed(struct mm_struct *)); + static inline void mmdrop(struct mm_struct * mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } +static inline void mmdrop_delayed(struct mm_struct * mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + __mmdrop_delayed(mm); +} + /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); /* Grab a reference to a task's mm, if it is not already going away */ @@ -1838,11 +2096,32 @@ static inline int signal_pending(struct return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } -static inline int need_resched(void) +static inline int _need_resched(void) { return unlikely(test_thread_flag(TIF_NEED_RESCHED)); } +static inline int need_resched(void) +{ + touch_critical_timing(); + return _need_resched(); +} + +static inline void set_tsk_need_resched_delayed(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_DELAYED); +} + +static inline void clear_tsk_need_resched_delayed(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_DELAYED); +} + +static inline int need_resched_delayed(void) +{ + return unlikely(test_thread_flag(TIF_NEED_RESCHED_DELAYED)); +} + /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return @@ -1851,27 +2130,51 @@ static inline int need_resched(void) * cond_resched_softirq() will enable bhs before scheduling. */ extern int cond_resched(void); -extern int cond_resched_lock(spinlock_t * lock); +extern int __cond_resched_raw_spinlock(raw_spinlock_t *lock); +extern int __cond_resched_spinlock(spinlock_t *spinlock); + +#define cond_resched_lock(lock) \ + PICK_SPIN_OP_RET(__cond_resched_raw_spinlock, __cond_resched_spinlock,\ + lock) + extern int cond_resched_softirq(void); +extern int cond_resched_softirq_context(void); +extern int cond_resched_hardirq_context(void); /* * Does a critical section need to be broken due to another * task waiting?: */ -#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) -# define need_lockbreak(lock) ((lock)->break_lock) +#if (defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)) || defined(CONFIG_PREEMPT_RT) +# define need_lockbreak(lock) ({ int __need = ((lock)->break_lock); if (__need) (lock)->break_lock = 0; __need; }) #else # define need_lockbreak(lock) 0 #endif +#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) +# define need_lockbreak_raw(lock) ({ int __need = ((lock)->break_lock); if (__need) (lock)->break_lock = 0; __need; }) +#else +# define need_lockbreak_raw(lock) 0 +#endif + /* * Does a critical section need to be broken due to another * task waiting or preemption being signalled: */ -static inline int lock_need_resched(spinlock_t *lock) +#define lock_need_resched(lock) \ + unlikely(need_lockbreak(lock) || need_resched()) + +static inline int softirq_need_resched(void) { - if (need_lockbreak(lock) || need_resched()) - return 1; + if (softirq_preemption && (current->flags & PF_SOFTIRQ)) + return need_resched(); + return 0; +} + +static inline int hardirq_need_resched(void) +{ + if (hardirq_preemption && (current->flags & PF_HARDIRQ)) + return need_resched(); return 0; } Index: linux-2.6.24-rc8-rt1/kernel/fork.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/fork.c 2008-01-16 22:27:49.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/fork.c 2008-01-16 22:29:20.000000000 -0500 @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,8 @@ #include #include #include +#include +#include #include #include #include @@ -71,6 +74,15 @@ DEFINE_PER_CPU(unsigned long, process_co __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +/* + * Delayed mmdrop. In the PREEMPT_RT case we + * dont want to do this from the scheduling + * context. + */ +static DEFINE_PER_CPU(struct task_struct *, desched_task); + +static DEFINE_PER_CPU(struct list_head, delayed_drop_list); + int nr_processes(void) { int cpu; @@ -115,10 +127,13 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); -void __put_task_struct(struct task_struct *tsk) +#ifdef CONFIG_PREEMPT_RT +void __put_task_struct_cb(struct rcu_head *rhp) { + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + BUG_ON(atomic_read(&tsk->usage)); WARN_ON(!tsk->exit_state); - WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); security_task_free(tsk); @@ -130,8 +145,27 @@ void __put_task_struct(struct task_struc free_task(tsk); } +#else + +void __put_task_struct(struct task_struct *tsk) +{ + WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); + BUG_ON(atomic_read(&tsk->usage)); + WARN_ON(tsk == current); + + security_task_free(tsk); + free_uid(tsk->user); + put_group_info(tsk->group_info); + + if (!profile_handoff_task(tsk)) + free_task(tsk); +} +#endif + void __init fork_init(unsigned long mempages) { + int i; + #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES @@ -159,6 +193,9 @@ void __init fork_init(unsigned long memp init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; + + for (i = 0; i < NR_CPUS; i++) + INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i)); } static struct task_struct *dup_task_struct(struct task_struct *orig) @@ -354,6 +391,7 @@ static struct mm_struct * mm_init(struct spin_lock_init(&mm->page_table_lock); rwlock_init(&mm->ioctx_list_lock); mm->ioctx_list = NULL; + INIT_LIST_HEAD(&mm->delayed_drop); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; @@ -959,6 +997,9 @@ static void rt_mutex_init_task(struct ta #ifdef CONFIG_RT_MUTEXES plist_head_init(&p->pi_waiters, &p->pi_lock); p->pi_blocked_on = NULL; +# ifdef CONFIG_DEBUG_RT_MUTEXES + p->last_kernel_lock = NULL; +# endif #endif } @@ -1010,7 +1051,7 @@ static struct task_struct *copy_process( rt_mutex_init_task(p); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP) DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif @@ -1045,6 +1086,16 @@ static struct task_struct *copy_process( copy_flags(clone_flags, p); INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); +#ifdef CONFIG_PREEMPT_RCU + p->rcu_read_lock_nesting = 0; + p->rcu_flipctr_idx = 0; +#ifdef CONFIG_PREEMPT_RCU_BOOST + p->rcu_prio = MAX_PRIO; + p->rcub_rbdp = NULL; + p->rcub_state = RCU_BOOST_IDLE; + INIT_LIST_HEAD(&p->rcub_entry); +#endif +#endif /* CONFIG_PREEMPT_RCU */ p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); @@ -1074,7 +1125,7 @@ static struct task_struct *copy_process( INIT_LIST_HEAD(&p->cpu_timers[0]); INIT_LIST_HEAD(&p->cpu_timers[1]); INIT_LIST_HEAD(&p->cpu_timers[2]); - + p->posix_timer_list = NULL; p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; @@ -1113,6 +1164,7 @@ static struct task_struct *copy_process( p->hardirq_context = 0; p->softirq_context = 0; #endif + p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; @@ -1150,6 +1202,9 @@ static struct task_struct *copy_process( retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_namespaces; +#ifdef CONFIG_DEBUG_PREEMPT + p->lock_count = 0; +#endif if (pid != &init_struct_pid) { retval = -ENOMEM; @@ -1236,10 +1291,13 @@ static struct task_struct *copy_process( * to ensure it is on a valid CPU (and if not, just force it back to * parent's CPU). This avoids alot of nasty races. */ + preempt_disable(); p->cpus_allowed = current->cpus_allowed; + p->nr_cpus_allowed = current->nr_cpus_allowed; if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || !cpu_online(task_cpu(p)))) set_task_cpu(p, smp_processor_id()); + preempt_enable(); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) @@ -1301,7 +1359,9 @@ static struct task_struct *copy_process( attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail_rcu(&p->tasks, &init_task.tasks); + preempt_disable(); __get_cpu_var(process_counts)++; + preempt_enable(); } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; @@ -1732,3 +1792,124 @@ bad_unshare_cleanup_thread: bad_unshare_out: return err; } + +static int mmdrop_complete(void) +{ + struct list_head *head; + int ret = 0; + + head = &get_cpu_var(delayed_drop_list); + while (!list_empty(head)) { + struct mm_struct *mm = list_entry(head->next, + struct mm_struct, delayed_drop); + list_del(&mm->delayed_drop); + put_cpu_var(delayed_drop_list); + + __mmdrop(mm); + ret = 1; + + head = &get_cpu_var(delayed_drop_list); + } + put_cpu_var(delayed_drop_list); + + return ret; +} + +/* + * We dont want to do complex work from the scheduler, thus + * we delay the work to a per-CPU worker thread: + */ +void fastcall __mmdrop_delayed(struct mm_struct *mm) +{ + struct task_struct *desched_task; + struct list_head *head; + + head = &get_cpu_var(delayed_drop_list); + list_add_tail(&mm->delayed_drop, head); + desched_task = __get_cpu_var(desched_task); + if (desched_task) + wake_up_process(desched_task); + put_cpu_var(delayed_drop_list); +} + +static int desched_thread(void * __bind_cpu) +{ + set_user_nice(current, -10); + current->flags |= PF_NOFREEZE | PF_SOFTIRQ; + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + if (mmdrop_complete()) + continue; + schedule(); + + /* + * This must be called from time to time on ia64, and is a + * no-op on other archs. Used to be in cpu_idle(), but with + * the new -rt semantics it can't stay there. + */ + check_pgt_cache(); + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + + BUG_ON(per_cpu(desched_task, hotcpu)); + INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu)); + p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu); + if (IS_ERR(p)) { + printk("desched_thread for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + per_cpu(desched_task, hotcpu) = p; + kthread_bind(p, hotcpu); + break; + case CPU_ONLINE: + + wake_up_process(per_cpu(desched_task, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + + /* Unbind so it can run. Fall thru. */ + kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id()); + case CPU_DEAD: + + p = per_cpu(desched_task, hotcpu); + per_cpu(desched_task, hotcpu) = NULL; + kthread_stop(p); + takeover_tasklets(hotcpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +__init int spawn_desched_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + return 0; +} + Index: linux-2.6.24-rc8-rt1/kernel/sched_fair.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/sched_fair.c 2008-01-16 22:27:49.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/sched_fair.c 2008-01-16 22:28:04.000000000 -0500 @@ -836,6 +836,154 @@ static void yield_task_fair(struct rq *r } /* + * wake_idle() will wake a task on an idle cpu if task->cpu is + * not idle and an idle cpu is available. The span of cpus to + * search starts with cpus closest then further out as needed, + * so we always favor a closer, idle cpu. + * + * Returns the CPU we should wake onto. + */ +#if defined(ARCH_HAS_SCHED_WAKE_IDLE) +static int wake_idle(int cpu, struct task_struct *p) +{ + cpumask_t tmp; + struct sched_domain *sd; + int i; + + /* + * If it is idle, then it is the best cpu to run this task. + * + * This cpu is also the best, if it has more than one task already. + * Siblings must be also busy(in most cases) as they didn't already + * pickup the extra load from this cpu and hence we need not check + * sibling runqueue info. This will avoid the checks and cache miss + * penalities associated with that. + */ + if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) + return cpu; + + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_IDLE) { + cpus_and(tmp, sd->span, p->cpus_allowed); + for_each_cpu_mask(i, tmp) { + if (idle_cpu(i)) { + if (i != task_cpu(p)) { + schedstat_inc(p, + se.nr_wakeups_idle); + } + return i; + } + } + } else { + break; + } + } + return cpu; +} +#else +static inline int wake_idle(int cpu, struct task_struct *p) +{ + return cpu; +} +#endif + +#ifdef CONFIG_SMP +static int select_task_rq_fair(struct task_struct *p, int sync) +{ + int cpu, this_cpu; + struct rq *rq; + struct sched_domain *sd, *this_sd = NULL; + int new_cpu; + + cpu = task_cpu(p); + rq = task_rq(p); + this_cpu = smp_processor_id(); + new_cpu = cpu; + + if (cpu == this_cpu) + goto out_set_cpu; + + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + this_sd = sd; + break; + } + } + + if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + goto out_set_cpu; + + /* + * Check for affine wakeup and passive balancing possibilities. + */ + if (this_sd) { + int idx = this_sd->wake_idx; + unsigned int imbalance; + unsigned long load, this_load; + + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; + + load = source_load(cpu, idx); + this_load = target_load(this_cpu, idx); + + new_cpu = this_cpu; /* Wake to this CPU if we can */ + + if (this_sd->flags & SD_WAKE_AFFINE) { + unsigned long tl = this_load; + unsigned long tl_per_task; + + /* + * Attract cache-cold tasks on sync wakeups: + */ + if (sync && !task_hot(p, rq->clock, this_sd)) + goto out_set_cpu; + + schedstat_inc(p, se.nr_wakeups_affine_attempts); + tl_per_task = cpu_avg_load_per_task(this_cpu); + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) + tl -= current->se.load.weight; + + if ((tl <= load && + tl + target_load(cpu, idx) <= tl_per_task) || + 100*(tl + p->se.load.weight) <= imbalance*load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. + */ + schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(p, se.nr_wakeups_affine); + goto out_set_cpu; + } + } + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + if (this_sd->flags & SD_WAKE_BALANCE) { + if (imbalance*this_load <= 100*load) { + schedstat_inc(this_sd, ttwu_move_balance); + schedstat_inc(p, se.nr_wakeups_passive); + goto out_set_cpu; + } + } + } + + new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ +out_set_cpu: + return wake_idle(new_cpu, p); +} +#endif /* CONFIG_SMP */ + + +/* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) @@ -1087,6 +1235,42 @@ static void task_new_fair(struct rq *rq, resched_task(rq->curr); } +/* + * Priority of the task has changed. Check to see if we preempt + * the current task. + */ +static void prio_changed_fair(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (running) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else + check_preempt_curr(rq, p); +} + +/* + * We switched to the sched_fair class. + */ +static void switched_to_fair(struct rq *rq, struct task_struct *p, + int running) +{ + /* + * We were most likely switched from sched_rt, so + * kick off the schedule if running, otherwise just see + * if we can still preempt the current task. + */ + if (running) + resched_task(rq->curr); + else + check_preempt_curr(rq, p); +} + /* Account for a task changing its policy or group. * * This routine is mostly called to set cfs_rq->curr field when a task @@ -1108,6 +1292,9 @@ static const struct sched_class fair_sch .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_fair, +#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_wakeup, @@ -1122,6 +1309,9 @@ static const struct sched_class fair_sch .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_new = task_new_fair, + + .prio_changed = prio_changed_fair, + .switched_to = switched_to_fair, }; #ifdef CONFIG_SCHED_DEBUG Index: linux-2.6.24-rc8-rt1/kernel/sched_idletask.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/sched_idletask.c 2008-01-16 22:27:49.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/sched_idletask.c 2008-01-16 22:28:04.000000000 -0500 @@ -5,6 +5,12 @@ * handled in sched_fair.c) */ +#ifdef CONFIG_SMP +static int select_task_rq_idle(struct task_struct *p, int sync) +{ + return task_cpu(p); /* IDLE tasks as never migrated */ +} +#endif /* CONFIG_SMP */ /* * Idle tasks are unconditionally rescheduled: */ @@ -63,6 +69,33 @@ static void set_curr_task_idle(struct rq { } +static void switched_to_idle(struct rq *rq, struct task_struct *p, + int running) +{ + /* Can this actually happen?? */ + if (running) + resched_task(rq->curr); + else + check_preempt_curr(rq, p); +} + +static void prio_changed_idle(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ + /* This can happen for hot plug CPUS */ + + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (running) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else + check_preempt_curr(rq, p); +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -72,6 +105,9 @@ const struct sched_class idle_sched_clas /* dequeue is not valid, we print a debug message there: */ .dequeue_task = dequeue_task_idle, +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_idle, +#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_curr_idle, @@ -85,5 +121,9 @@ const struct sched_class idle_sched_clas .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, + + .prio_changed = prio_changed_idle, + .switched_to = switched_to_idle, + /* no .task_new for idle tasks */ }; Index: linux-2.6.24-rc8-rt1/include/linux/topology.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/topology.h 2008-01-16 22:27:48.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/topology.h 2008-01-16 22:28:03.000000000 -0500 @@ -5,7 +5,7 @@ * * Copyright (C) 2002, IBM Corp. * - * All rights reserved. + * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by Index: linux-2.6.24-rc8-rt1/kernel/Makefile =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/Makefile 2008-01-16 22:27:47.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/Makefile 2008-01-16 22:29:20.000000000 -0500 @@ -7,14 +7,17 @@ obj-y = sched.o fork.o exec_domain.o sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o \ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \ utsname.o notifier.o obj-$(CONFIG_SYSCTL) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ +ifneq ($(CONFIG_PREEMPT_RT),y) +obj-y += mutex.o obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +endif obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o @@ -26,6 +29,7 @@ endif obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o +obj-$(CONFIG_PREEMPT_RT) += rt.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o @@ -43,6 +47,11 @@ obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o +obj-$(CONFIG_DEBUG_PREEMPT) += latency_trace.o +obj-$(CONFIG_WAKEUP_TIMING) += latency_trace.o +obj-$(CONFIG_EVENT_TRACE) += latency_trace.o +obj-$(CONFIG_CRITICAL_TIMING) += latency_trace.o +obj-$(CONFIG_LATENCY_HIST) += latency_hist.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o @@ -52,11 +61,18 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o +obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o +obj-$(CONFIG_PREEMPT_RCU) += rcuclassic.o rcupreempt.o +obj-$(CONFIG_PREEMPT_RCU_BOOST) += rcupreempt-boost.o +ifeq ($(CONFIG_PREEMPT_RCU),y) +obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o +endif obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o +obj-$(CONFIG_SMP) += sched_cpupri.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is Index: linux-2.6.24-rc8-rt1/kernel/sched_cpupri.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/kernel/sched_cpupri.c 2008-01-16 22:28:05.000000000 -0500 @@ -0,0 +1,174 @@ +/* + * kernel/sched_cpupri.c + * + * CPU priority management + * + * Copyright (C) 2007 Novell + * + * Author: Gregory Haskins + * + * This code tracks the priority of each CPU so that global migration + * decisions are easy to calculate. Each CPU can be in a state as follows: + * + * (INVALID), IDLE, NORMAL, RT1, ... RT99 + * + * going from the lowest priority to the highest. CPUs in the INVALID state + * are not eligible for routing. The system maintains this state with + * a 2 dimensional bitmap (the first for priority class, the second for cpus + * in that class). Therefore a typical application without affinity + * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit + * searches). For tasks with affinity restrictions, the algorithm has a + * worst case complexity of O(min(102, nr_domcpus)), though the scenario that + * yields the worst case search is fairly contrived. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include "sched_cpupri.h" + +/* Convert between a 140 based task->prio, and our 102 based cpupri */ +static int convert_prio(int prio) +{ + int cpupri; + + if (prio == CPUPRI_INVALID) + cpupri = CPUPRI_INVALID; + else if (prio == MAX_PRIO) + cpupri = CPUPRI_IDLE; + else if (prio >= MAX_RT_PRIO) + cpupri = CPUPRI_NORMAL; + else + cpupri = MAX_RT_PRIO - prio + 1; + + return cpupri; +} + +#define for_each_cpupri_active(array, idx) \ + for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ + idx < CPUPRI_NR_PRIORITIES; \ + idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1)) + +/** + * cpupri_find - find the best (lowest-pri) CPU in the system + * @cp: The cpupri context + * @p: The task + * @lowest_mask: A mask to fill in with selected CPUs + * + * Note: This function returns the recommended CPUs as calculated during the + * current invokation. By the time the call returns, the CPUs may have in + * fact changed priorities any number of times. While not ideal, it is not + * an issue of correctness since the normal rebalancer logic will correct + * any discrepancies created by racing against the uncertainty of the current + * priority configuration. + * + * Returns: (int)bool - CPUs were found + */ +int cpupri_find(struct cpupri *cp, struct task_struct *p, + cpumask_t *lowest_mask) +{ + int idx = 0; + int task_pri = convert_prio(p->prio); + + for_each_cpupri_active(cp->pri_active, idx) { + struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + cpumask_t mask; + + if (idx >= task_pri) + break; + + cpus_and(mask, p->cpus_allowed, vec->mask); + + if (cpus_empty(mask)) + continue; + + *lowest_mask = mask; + return 1; + } + + return 0; +} + +/** + * cpupri_set - update the cpu priority setting + * @cp: The cpupri context + * @cpu: The target cpu + * @pri: The priority (INVALID-RT99) to assign to this CPU + * + * Note: Assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpupri_set(struct cpupri *cp, int cpu, int newpri) +{ + int *currpri = &cp->cpu_to_pri[cpu]; + int oldpri = *currpri; + unsigned long flags; + + newpri = convert_prio(newpri); + + BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); + + if (newpri == oldpri) + return; + + /* + * If the cpu was currently mapped to a different value, we + * first need to unmap the old value + */ + if (likely(oldpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; + + spin_lock_irqsave(&vec->lock, flags); + + vec->count--; + if (!vec->count) + clear_bit(oldpri, cp->pri_active); + cpu_clear(cpu, vec->mask); + + spin_unlock_irqrestore(&vec->lock, flags); + } + + if (likely(newpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; + + spin_lock_irqsave(&vec->lock, flags); + + cpu_set(cpu, vec->mask); + vec->count++; + if (vec->count == 1) + set_bit(newpri, cp->pri_active); + + spin_unlock_irqrestore(&vec->lock, flags); + } + + *currpri = newpri; +} + +/** + * cpupri_init - initialize the cpupri structure + * @cp: The cpupri context + * + * Returns: (void) + */ +void cpupri_init(struct cpupri *cp) +{ + int i; + + memset(cp, 0, sizeof(*cp)); + + for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { + struct cpupri_vec *vec = &cp->pri_to_cpu[i]; + + spin_lock_init(&vec->lock); + vec->count = 0; + cpus_clear(vec->mask); + } + + for_each_possible_cpu(i) + cp->cpu_to_pri[i] = CPUPRI_INVALID; +} + + Index: linux-2.6.24-rc8-rt1/kernel/sched_cpupri.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/kernel/sched_cpupri.h 2008-01-16 22:28:49.000000000 -0500 @@ -0,0 +1,36 @@ +#ifndef _LINUX_CPUPRI_H +#define _LINUX_CPUPRI_H + +#include + +#define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO +#define CPUPRI_NR_PRI_WORDS CPUPRI_NR_PRIORITIES/BITS_PER_LONG + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { + raw_spinlock_t lock; + int count; + cpumask_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + long pri_active[CPUPRI_NR_PRI_WORDS]; + int cpu_to_pri[NR_CPUS]; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, + struct task_struct *p, cpumask_t *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +void cpupri_init(struct cpupri *cp); +#else +#define cpupri_set(cp, cpu, pri) do { } while (0) +#define cpupri_init() do { } while (0) +#endif + +#endif /* _LINUX_CPUPRI_H */ Index: linux-2.6.24-rc8-rt1/drivers/kvm/kvm_main.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/kvm/kvm_main.c 2008-01-16 22:27:47.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/kvm/kvm_main.c 2008-01-16 22:28:06.000000000 -0500 @@ -1987,8 +1987,8 @@ static int __vcpu_run(struct kvm_vcpu *v int r; if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { - printk("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->sipi_vector); + vcpu_printf(vcpu, "vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->sipi_vector); kvm_lapic_reset(vcpu); kvm_x86_ops->vcpu_reset(vcpu); vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; @@ -2003,6 +2003,11 @@ again: if (unlikely(r)) goto out; + if (vcpu->migrate_apic_timer) { + vcpu->migrate_apic_timer = false; + __kvm_migrate_apic_timer(vcpu); + } + preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); @@ -2010,6 +2015,13 @@ again: local_irq_disable(); + if (need_resched() || need_resched_delayed()) { + local_irq_enable(); + preempt_enable(); + r = 1; + goto out; + } + if (signal_pending(current)) { local_irq_enable(); preempt_enable(); Index: linux-2.6.24-rc8-rt1/drivers/kvm/irq.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/kvm/irq.h 2008-01-16 22:27:47.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/kvm/irq.h 2008-01-16 22:28:05.000000000 -0500 @@ -160,6 +160,6 @@ void kvm_apic_timer_intr_post(struct kvm void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); +void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); #endif Index: linux-2.6.24-rc8-rt1/drivers/kvm/kvm.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/kvm/kvm.h 2008-01-16 22:27:47.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/kvm/kvm.h 2008-01-16 22:28:06.000000000 -0500 @@ -325,6 +325,7 @@ struct kvm_vcpu { u64 pdptrs[4]; /* pae */ u64 shadow_efer; u64 apic_base; + bool migrate_apic_timer; struct kvm_lapic *apic; /* kernel irqchip context */ #define VCPU_MP_STATE_RUNNABLE 0 #define VCPU_MP_STATE_UNINITIALIZED 1 @@ -508,6 +509,7 @@ struct kvm_x86_ops { extern struct kvm_x86_ops *kvm_x86_ops; +#ifdef KVM_DEBUG /* The guest did something we don't support. */ #define pr_unimpl(vcpu, fmt, ...) \ do { \ @@ -517,6 +519,11 @@ extern struct kvm_x86_ops *kvm_x86_ops; } while(0) #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) +#else +#define pr_unimpl(vcpu, fmt ...) do { } while(0) +#define kvm_printf(kvm, fmt ...) do { } while(0) +#endif + #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); @@ -775,6 +782,11 @@ static inline u32 get_rdx_init_val(void) return 0x600; /* P6 family */ } +static inline void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) +{ + vcpu->migrate_apic_timer = true; +} + #define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" #define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" #define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" Index: linux-2.6.24-rc8-rt1/drivers/kvm/lapic.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/kvm/lapic.c 2008-01-16 22:27:47.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/kvm/lapic.c 2008-01-16 22:28:06.000000000 -0500 @@ -347,35 +347,35 @@ static int __apic_accept_irq(struct kvm_ break; case APIC_DM_REMRD: - printk(KERN_DEBUG "Ignoring delivery mode 3\n"); + vcpu_printf(vcpu "Ignoring delivery mode 3\n"); break; case APIC_DM_SMI: - printk(KERN_DEBUG "Ignoring guest SMI\n"); + vcpu_printf(vcpu, "Ignoring guest SMI\n"); break; case APIC_DM_NMI: - printk(KERN_DEBUG "Ignoring guest NMI\n"); + vcpu_printf(vcpu, "Ignoring guest NMI\n"); break; case APIC_DM_INIT: if (level) { if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) - printk(KERN_DEBUG - "INIT on a runnable vcpu %d\n", - vcpu->vcpu_id); + vcpu_printf(vcpu, + "INIT on a runnable vcpu %d\n", + vcpu->vcpu_id); vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED; kvm_vcpu_kick(vcpu); } else { - printk(KERN_DEBUG - "Ignoring de-assert INIT to vcpu %d\n", - vcpu->vcpu_id); + vcpu_printf(vcpu, + "Ignoring de-assert INIT to vcpu %d\n", + vcpu->vcpu_id); } break; case APIC_DM_STARTUP: - printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", - vcpu->vcpu_id, vector); + vcpu_printf(vcpu, "SIPI to vcpu %d vector 0x%02x\n", + vcpu->vcpu_id, vector); if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) { vcpu->sipi_vector = vector; vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED; @@ -1065,7 +1065,7 @@ void kvm_apic_post_state_restore(struct start_apic_timer(apic); } -void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) +void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->apic; struct hrtimer *timer; Index: linux-2.6.24-rc8-rt1/arch/arm/mach-ep93xx/core.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/mach-ep93xx/core.c 2008-01-16 22:27:47.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/mach-ep93xx/core.c 2008-01-16 22:28:07.000000000 -0500 @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include @@ -50,7 +52,6 @@ #include - /************************************************************************* * Static I/O mappings that are needed for all EP93xx platforms *************************************************************************/ @@ -93,59 +94,125 @@ void __init ep93xx_map_io(void) * to use this timer for something else. We also use timer 4 for keeping * track of lost jiffies. */ -static unsigned int last_jiffy_time; - -#define TIMER4_TICKS_PER_JIFFY ((CLOCK_TICK_RATE + (HZ/2)) / HZ) +static struct clock_event_device clockevent_ep93xx; static int ep93xx_timer_interrupt(int irq, void *dev_id) { - write_seqlock(&xtime_lock); + __raw_writel(EP93XX_TC_CLEAR, EP93XX_TIMER1_CLEAR); - __raw_writel(1, EP93XX_TIMER1_CLEAR); - while ((signed long) - (__raw_readl(EP93XX_TIMER4_VALUE_LOW) - last_jiffy_time) - >= TIMER4_TICKS_PER_JIFFY) { - last_jiffy_time += TIMER4_TICKS_PER_JIFFY; - timer_tick(); - } - - write_sequnlock(&xtime_lock); + clockevent_ep93xx.event_handler(&clockevent_ep93xx); return IRQ_HANDLED; } +static int ep93xx_set_next_event(unsigned long evt, + struct clock_event_device *unused) +{ + u32 tmode = __raw_readl(EP93XX_TIMER1_CONTROL); + + /* stop timer */ + __raw_writel(tmode & ~EP93XX_TC123_ENABLE, EP93XX_TIMER1_CONTROL); + /* program timer */ + __raw_writel(evt, EP93XX_TIMER1_LOAD); + /* start timer */ + __raw_writel(tmode | EP93XX_TC123_ENABLE, EP93XX_TIMER1_CONTROL); + + return 0; +} + +static void ep93xx_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + u32 tmode = EP93XX_TC123_SEL_508KHZ; + + /* Disable timer */ + __raw_writel(tmode, EP93XX_TIMER1_CONTROL); + + switch(mode) { + case CLOCK_EVT_MODE_PERIODIC: + /* Set timer period */ + __raw_writel((508469 / HZ) - 1, EP93XX_TIMER1_LOAD); + tmode |= EP93XX_TC123_PERIODIC; + + case CLOCK_EVT_MODE_ONESHOT: + tmode |= EP93XX_TC123_ENABLE; + __raw_writel(tmode, EP93XX_TIMER1_CONTROL); + break; + + case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_RESUME: + return; + } +} + +static struct clock_event_device clockevent_ep93xx = { + .name = "ep93xx-timer1", + .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC, + .shift = 32, + .set_mode = ep93xx_set_mode, + .set_next_event = ep93xx_set_next_event, +}; + + static struct irqaction ep93xx_timer_irq = { .name = "ep93xx timer", .flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL, .handler = ep93xx_timer_interrupt, }; -static void __init ep93xx_timer_init(void) +static void __init ep93xx_clockevent_init(void) { - /* Enable periodic HZ timer. */ - __raw_writel(0x48, EP93XX_TIMER1_CONTROL); - __raw_writel((508469 / HZ) - 1, EP93XX_TIMER1_LOAD); - __raw_writel(0xc8, EP93XX_TIMER1_CONTROL); + setup_irq(IRQ_EP93XX_TIMER1, &ep93xx_timer_irq); - /* Enable lost jiffy timer. */ - __raw_writel(0x100, EP93XX_TIMER4_VALUE_HIGH); + clockevent_ep93xx.mult = div_sc(508469, NSEC_PER_SEC, + clockevent_ep93xx.shift); + clockevent_ep93xx.max_delta_ns = + clockevent_delta2ns(0xfffffffe, &clockevent_ep93xx); + clockevent_ep93xx.min_delta_ns = + clockevent_delta2ns(0xf, &clockevent_ep93xx); + clockevent_ep93xx.cpumask = cpumask_of_cpu(0); + clockevents_register_device(&clockevent_ep93xx); +} - setup_irq(IRQ_EP93XX_TIMER1, &ep93xx_timer_irq); +/* + * timer4 is a 40 Bit timer, separated in a 32bit and a 8 bit + * register, EP93XX_TIMER4_VALUE_LOW stores 32 bit word. The + * controlregister is in EP93XX_TIMER4_VALUE_HIGH + */ + +cycle_t ep93xx_get_cycles(void) +{ + return __raw_readl(EP93XX_TIMER4_VALUE_LOW); } -static unsigned long ep93xx_gettimeoffset(void) +static struct clocksource clocksource_ep93xx = { + .name = "ep93xx_timer4", + .rating = 200, + .read = ep93xx_get_cycles, + .mask = 0xFFFFFFFF, + .shift = 20, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static void __init ep93xx_clocksource_init(void) { - int offset; + /* Reset time-stamp counter */ + __raw_writel(0x100, EP93XX_TIMER4_VALUE_HIGH); - offset = __raw_readl(EP93XX_TIMER4_VALUE_LOW) - last_jiffy_time; + clocksource_ep93xx.mult = + clocksource_hz2mult(983040, clocksource_ep93xx.shift); + clocksource_register(&clocksource_ep93xx); +} - /* Calculate (1000000 / 983040) * offset. */ - return offset + (53 * offset / 3072); +static void __init ep93xx_timer_init(void) +{ + ep93xx_clocksource_init(); + ep93xx_clockevent_init(); } struct sys_timer ep93xx_timer = { - .init = ep93xx_timer_init, - .offset = ep93xx_gettimeoffset, + .init = ep93xx_timer_init, }; @@ -497,7 +564,6 @@ static struct platform_device ep93xx_ohc .resource = ep93xx_ohci_resources, }; - void __init ep93xx_init_devices(void) { unsigned int v; Index: linux-2.6.24-rc8-rt1/include/asm-arm/arch-ep93xx/ep93xx-regs.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-arm/arch-ep93xx/ep93xx-regs.h 2008-01-16 22:27:47.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-arm/arch-ep93xx/ep93xx-regs.h 2008-01-16 22:28:06.000000000 -0500 @@ -67,6 +67,12 @@ #define EP93XX_TIMER3_CONTROL EP93XX_TIMER_REG(0x88) #define EP93XX_TIMER3_CLEAR EP93XX_TIMER_REG(0x8c) +#define EP93XX_TC_CLEAR 0x00000001 +#define EP93XX_TC123_ENABLE 0x00000080 +#define EP93XX_TC123_PERIODIC 0x00000040 +#define EP93XX_TC123_SEL_508KHZ 0x00000008 +#define EP93XX_TC4_ENABLE 0x00000100 + #define EP93XX_I2S_BASE (EP93XX_APB_VIRT_BASE + 0x00020000) #define EP93XX_SECURITY_BASE (EP93XX_APB_VIRT_BASE + 0x00030000) Index: linux-2.6.24-rc8-rt1/arch/arm/kernel/time.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/kernel/time.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/kernel/time.c 2008-01-16 22:28:07.000000000 -0500 @@ -236,6 +236,13 @@ static inline void do_leds(void) #define do_leds() #endif +void arch_tick_leds(void) +{ +#ifdef CONFIG_LEDS_TIMER + do_leds(); +#endif +} + #ifndef CONFIG_GENERIC_TIME void do_gettimeofday(struct timeval *tv) { Index: linux-2.6.24-rc8-rt1/drivers/net/sungem.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/net/sungem.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/net/sungem.c 2008-01-16 22:28:07.000000000 -0500 @@ -1031,10 +1031,8 @@ static int gem_start_xmit(struct sk_buff (csum_stuff_off << 21)); } - local_irq_save(flags); - if (!spin_trylock(&gp->tx_lock)) { + if (!spin_trylock_irqsave(&gp->tx_lock, flags)) { /* Tell upper layer to requeue */ - local_irq_restore(flags); return NETDEV_TX_LOCKED; } /* We raced with gem_do_stop() */ Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/tsc_sync.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/tsc_sync.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/tsc_sync.c 2008-01-16 22:28:33.000000000 -0500 @@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count * we want to have the fastest, inlined, non-debug version * of a critical section, to be able to prove TSC time-warps: */ -static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; +static __cpuinitdata __raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; static __cpuinitdata int nr_warps; @@ -97,6 +97,7 @@ static __cpuinit void check_tsc_warp(voi */ void __cpuinit check_tsc_sync_source(int cpu) { + unsigned long flags; int cpus = 2; /* @@ -117,8 +118,11 @@ void __cpuinit check_tsc_sync_source(int /* * Wait for the target to arrive: */ + local_save_flags(flags); + local_irq_enable(); while (atomic_read(&start_count) != cpus-1) cpu_relax(); + local_irq_restore(flags); /* * Trigger the target to continue into the measurement too: */ Index: linux-2.6.24-rc8-rt1/drivers/input/keyboard/atkbd.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/input/keyboard/atkbd.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/input/keyboard/atkbd.c 2008-01-16 22:28:07.000000000 -0500 @@ -1401,9 +1401,23 @@ static ssize_t atkbd_show_err_count(stru return sprintf(buf, "%lu\n", atkbd->err_count); } +static int __read_mostly noatkbd; + +static int __init noatkbd_setup(char *str) +{ + noatkbd = 1; + printk(KERN_INFO "debug: not setting up AT keyboard.\n"); + + return 1; +} + +__setup("noatkbd", noatkbd_setup); static int __init atkbd_init(void) { + if (noatkbd) + return 0; + return serio_register_driver(&atkbd_drv); } Index: linux-2.6.24-rc8-rt1/drivers/input/mouse/psmouse-base.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/input/mouse/psmouse-base.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/input/mouse/psmouse-base.c 2008-01-16 22:28:07.000000000 -0500 @@ -1596,10 +1596,25 @@ static int psmouse_get_maxproto(char *bu return sprintf(buffer, "%s\n", psmouse_protocol_by_type(type)->name); } +static int __read_mostly nopsmouse; + +static int __init nopsmouse_setup(char *str) +{ + nopsmouse = 1; + printk(KERN_INFO "debug: not setting up psmouse.\n"); + + return 1; +} + +__setup("nopsmouse", nopsmouse_setup); + static int __init psmouse_init(void) { int err; + if (nopsmouse) + return 0; + kpsmoused_wq = create_singlethread_workqueue("kpsmoused"); if (!kpsmoused_wq) { printk(KERN_ERR "psmouse: failed to create kpsmoused workqueue\n"); Index: linux-2.6.24-rc8-rt1/kernel/rtmutex-debug.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/rtmutex-debug.h 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/rtmutex-debug.h 2008-01-16 22:28:08.000000000 -0500 @@ -17,17 +17,17 @@ extern void debug_rt_mutex_free_waiter(s extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); extern void debug_rt_mutex_lock(struct rt_mutex *lock); extern void debug_rt_mutex_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, - struct task_struct *powner); +extern void +debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner); extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, struct rt_mutex *lock); extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); -# define debug_rt_mutex_reset_waiter(w) \ +# define debug_rt_mutex_reset_waiter(w) \ do { (w)->deadlock_lock = NULL; } while (0) -static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, - int detect) +static inline int +debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, int detect) { - return (waiter != NULL); + return waiter != NULL; } Index: linux-2.6.24-rc8-rt1/drivers/net/8139too.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/net/8139too.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/net/8139too.c 2008-01-16 22:28:08.000000000 -0500 @@ -2199,7 +2199,11 @@ static irqreturn_t rtl8139_interrupt (in */ static void rtl8139_poll_controller(struct net_device *dev) { - disable_irq(dev->irq); + /* + * use _nosync() variant - might be used by netconsole + * from atomic contexts: + */ + disable_irq_nosync(dev->irq); rtl8139_interrupt(dev->irq, dev); enable_irq(dev->irq); } Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/kprobes_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/kprobes_32.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/kprobes_32.c 2008-01-16 22:29:09.000000000 -0500 @@ -332,7 +332,7 @@ ss_probe: /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); regs->eip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); + preempt_enable(); return 1; } #endif @@ -341,7 +341,7 @@ ss_probe: return 1; no_kprobe: - preempt_enable_no_resched(); + preempt_enable(); return ret; } @@ -573,7 +573,7 @@ static int __kprobes post_kprobe_handler } reset_current_kprobe(); out: - preempt_enable_no_resched(); + preempt_enable(); /* * if somebody else is singlestepping across a probe point, eflags @@ -607,7 +607,7 @@ int __kprobes kprobe_fault_handler(struc restore_previous_kprobe(kcb); else reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); break; case KPROBE_HIT_ACTIVE: case KPROBE_HIT_SSDONE: @@ -668,12 +668,11 @@ int __kprobes kprobe_exceptions_notify(s ret = NOTIFY_STOP; break; case DIE_GPF: + // TODO: do this better on PREEMPT_RT /* kprobe_running() needs smp_processor_id() */ - preempt_disable(); - if (kprobe_running() && + if (per_cpu(current_kprobe, raw_smp_processor_id()) && kprobe_fault_handler(args->regs, args->trapnr)) ret = NOTIFY_STOP; - preempt_enable(); break; default: break; @@ -739,7 +738,7 @@ int __kprobes longjmp_break_handler(stru *regs = kcb->jprobe_saved_regs; memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, MIN_STACK_SIZE(stack_addr)); - preempt_enable_no_resched(); + preempt_enable(); return 1; } return 0; Index: linux-2.6.24-rc8-rt1/arch/x86/mm/highmem_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/mm/highmem_32.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/mm/highmem_32.c 2008-01-16 22:29:04.000000000 -0500 @@ -3,9 +3,9 @@ void *kmap(struct page *page) { - might_sleep(); if (!PageHighMem(page)) return page_address(page); + might_sleep(); return kmap_high(page); } @@ -18,6 +18,26 @@ void kunmap(struct page *page) kunmap_high(page); } +void kunmap_virt(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return; + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + kunmap(page); +} + +struct page *kmap_to_page(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return virt_to_page(ptr); + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + return page; +} + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -26,12 +46,12 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +void *__kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) @@ -39,19 +59,19 @@ void *kmap_atomic_prot(struct page *page idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - BUG_ON(!pte_none(*(kmap_pte-idx))); + WARN_ON_ONCE(!pte_none(*(kmap_pte-idx))); set_pte(kmap_pte-idx, mk_pte(page, prot)); arch_flush_lazy_mmu_mode(); return (void *)vaddr; } -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page, enum km_type type) { return kmap_atomic_prot(page, type, kmap_prot); } -void kunmap_atomic(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); @@ -73,16 +93,18 @@ void kunmap_atomic(void *kvaddr, enum km arch_flush_lazy_mmu_mode(); pagefault_enable(); + preempt_enable(); } /* This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; + preempt_disable(); pagefault_disable(); idx = type + KM_TYPE_NR*smp_processor_id(); @@ -93,7 +115,7 @@ void *kmap_atomic_pfn(unsigned long pfn, return (void*) vaddr; } -struct page *kmap_atomic_to_page(void *ptr) +struct page *__kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; @@ -108,6 +130,7 @@ struct page *kmap_atomic_to_page(void *p EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); +EXPORT_SYMBOL(kunmap_virt); +EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(__kmap_atomic_to_page); Index: linux-2.6.24-rc8-rt1/include/asm-x86/atomic_32.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/atomic_32.h 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/atomic_32.h 2008-01-16 22:28:08.000000000 -0500 @@ -195,10 +195,10 @@ static __inline__ int atomic_add_return( #ifdef CONFIG_M386 no_xadd: /* Legacy 386 processor */ - local_irq_save(flags); + raw_local_irq_save(flags); __i = atomic_read(v); atomic_set(v, i + __i); - local_irq_restore(flags); + raw_local_irq_restore(flags); return i + __i; #endif } Index: linux-2.6.24-rc8-rt1/drivers/pci/msi.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/pci/msi.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/pci/msi.c 2008-01-16 22:28:09.000000000 -0500 @@ -241,6 +241,10 @@ static void __pci_restore_msi_state(stru return; entry = get_irq_msi(dev->irq); + if (!entry) { + WARN_ON(1); + return; + } pos = entry->msi_attrib.pos; pci_intx_for_msi(dev, 0); Index: linux-2.6.24-rc8-rt1/drivers/block/floppy.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/block/floppy.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/block/floppy.c 2008-01-16 22:28:09.000000000 -0500 @@ -4149,6 +4149,28 @@ static void floppy_device_release(struct complete(&device_release); } +static int floppy_suspend(struct platform_device *dev, pm_message_t state) +{ + floppy_release_irq_and_dma(); + + return 0; +} + +static int floppy_resume(struct platform_device *dev) +{ + floppy_grab_irq_and_dma(); + + return 0; +} + +static struct platform_driver floppy_driver = { + .suspend = floppy_suspend, + .resume = floppy_resume, + .driver = { + .name = "floppy", + }, +}; + static struct platform_device floppy_device[N_DRIVE]; static struct kobject *floppy_find(dev_t dev, int *part, void *data) @@ -4197,10 +4219,14 @@ static int __init floppy_init(void) if (err) goto out_put_disk; + err = platform_driver_register(&floppy_driver); + if (err) + goto out_unreg_blkdev; + floppy_queue = blk_init_queue(do_fd_request, &floppy_lock); if (!floppy_queue) { err = -ENOMEM; - goto out_unreg_blkdev; + goto out_unreg_driver; } blk_queue_max_sectors(floppy_queue, 64); @@ -4349,6 +4375,8 @@ out_flush_work: out_unreg_region: blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); blk_cleanup_queue(floppy_queue); +out_unreg_driver: + platform_driver_unregister(&floppy_driver); out_unreg_blkdev: unregister_blkdev(FLOPPY_MAJOR, "fd"); out_put_disk: @@ -4544,6 +4572,7 @@ void cleanup_module(void) init_completion(&device_release); blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); unregister_blkdev(FLOPPY_MAJOR, "fd"); + platform_driver_unregister(&floppy_driver); for (drive = 0; drive < N_DRIVE; drive++) { del_timer_sync(&motor_off_timer[drive]); Index: linux-2.6.24-rc8-rt1/include/linux/hrtimer.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/hrtimer.h 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/hrtimer.h 2008-01-16 22:28:54.000000000 -0500 @@ -191,7 +191,7 @@ struct hrtimer_clock_base { * @nr_events: Total number of timer interrupt events */ struct hrtimer_cpu_base { - spinlock_t lock; + raw_spinlock_t lock; struct lock_class_key lock_key; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; #ifdef CONFIG_HIGH_RES_TIMERS @@ -200,6 +200,9 @@ struct hrtimer_cpu_base { struct list_head cb_pending; unsigned long nr_events; #endif +#ifdef CONFIG_PREEMPT_SOFTIRQS + wait_queue_head_t wait; +#endif }; #ifdef CONFIG_HIGH_RES_TIMERS @@ -253,6 +256,12 @@ static inline ktime_t hrtimer_cb_get_tim extern ktime_t ktime_get(void); extern ktime_t ktime_get_real(void); +# if (BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR) +# define hrtimer_trace(a,b) trace_special_u64((a).tv64,b) +# else +# define hrtimer_trace(a,b) trace_special((a).tv.sec,(a).tv.nsec,b) +# endif + /* Exported timer functions: */ /* Initialize timers: */ @@ -270,6 +279,13 @@ static inline int hrtimer_restart(struct return hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); } +/* Softirq preemption could deadlock timer removal */ +#ifdef CONFIG_PREEMPT_SOFTIRQS + extern void hrtimer_wait_for_timer(const struct hrtimer *timer); +#else +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0) +#endif + /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); @@ -297,6 +313,9 @@ static inline int hrtimer_is_queued(stru /* Forward a hrtimer so it expires after now: */ extern unsigned long hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); +/* Overrun count: */ +extern unsigned long +hrtimer_overrun(struct hrtimer *timer, ktime_t now, ktime_t interval); /* Precise sleep: */ extern long hrtimer_nanosleep(struct timespec *rqtp, Index: linux-2.6.24-rc8-rt1/kernel/hrtimer.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/hrtimer.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/hrtimer.c 2008-01-16 22:29:09.000000000 -0500 @@ -360,9 +360,9 @@ static inline int hrtimer_is_hres_enable /* * Is the high resolution mode active ? */ -static inline int hrtimer_hres_active(void) +static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { - return __get_cpu_var(hrtimer_bases).hres_active; + return cpu_base->hres_active; } /* @@ -439,11 +439,12 @@ static int hrtimer_reprogram(struct hrti */ static void retrigger_next_event(void *arg) { - struct hrtimer_cpu_base *base; + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + struct timespec realtime_offset; unsigned long seq; - if (!hrtimer_hres_active()) + if (!hrtimer_hres_active(base)) return; do { @@ -453,8 +454,6 @@ static void retrigger_next_event(void *a -wall_to_monotonic.tv_nsec); } while (read_seqretry(&xtime_lock, seq)); - base = &__get_cpu_var(hrtimer_bases); - /* Adjust CLOCK_REALTIME offset */ spin_lock(&base->lock); base->clock_base[CLOCK_REALTIME].offset = @@ -576,10 +575,8 @@ static inline int hrtimer_enqueue_reprog /* * Switch to high resolution mode */ -static int hrtimer_switch_to_hres(void) +static int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base) { - int cpu = smp_processor_id(); - struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; if (base->hres_active) @@ -590,7 +587,7 @@ static int hrtimer_switch_to_hres(void) if (tick_init_highres()) { local_irq_restore(flags); printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", cpu); + "mode on CPU %d\n", raw_smp_processor_id()); return 0; } base->hres_active = 1; @@ -602,16 +599,20 @@ static int hrtimer_switch_to_hres(void) /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); - printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n", - smp_processor_id()); return 1; } #else -static inline int hrtimer_hres_active(void) { return 0; } +static inline int hrtimer_hres_active(struct hrtimer_cpu_base *base) +{ + return 0; +} static inline int hrtimer_is_hres_enabled(void) { return 0; } -static inline int hrtimer_switch_to_hres(void) { return 0; } +static inline int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base) +{ + return 0; +} static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) @@ -694,6 +695,28 @@ hrtimer_forward(struct hrtimer *timer, k } EXPORT_SYMBOL_GPL(hrtimer_forward); +unsigned long +hrtimer_overrun(struct hrtimer *timer, ktime_t now, ktime_t interval) +{ + unsigned long orun = 1; + ktime_t delta; + + delta = ktime_sub(now, timer->expires); + + if (delta.tv64 < 0) + return 0; + + if (interval.tv64 < timer->base->resolution.tv64) + interval.tv64 = timer->base->resolution.tv64; + + if (unlikely(delta.tv64 >= interval.tv64)) + orun = ktime_divns(delta, ktime_to_ns(interval)) + 1; + + return orun; +} +EXPORT_SYMBOL_GPL(hrtimer_overrun); + + /* * enqueue_hrtimer - internal function to (re)start a timer * @@ -708,6 +731,8 @@ static void enqueue_hrtimer(struct hrtim struct hrtimer *entry; int leftmost = 1; + hrtimer_trace(timer->expires, (unsigned long) timer); + /* * Find the right place in the rbtree: */ @@ -779,7 +804,7 @@ static void __remove_hrtimer(struct hrti if (base->first == &timer->node) { base->first = rb_next(&timer->node); /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) + if (reprogram && hrtimer_hres_active(base->cpu_base)) hrtimer_force_reprogram(base->cpu_base); } rb_erase(&timer->node, &base->active); @@ -919,7 +944,7 @@ int hrtimer_cancel(struct hrtimer *timer if (ret >= 0) return ret; - cpu_relax(); + hrtimer_wait_for_timer(timer); } } EXPORT_SYMBOL_GPL(hrtimer_cancel); @@ -959,7 +984,7 @@ ktime_t hrtimer_get_next_event(void) spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) { + if (!hrtimer_hres_active(cpu_base)) { for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; @@ -1030,6 +1055,32 @@ int hrtimer_get_res(const clockid_t whic } EXPORT_SYMBOL_GPL(hrtimer_get_res); +#ifdef CONFIG_PREEMPT_SOFTIRQS +# define wake_up_timer_waiters(b) wake_up(&(b)->wait) + +/** + * hrtimer_wait_for_timer - Wait for a running timer + * + * @timer: timer to wait for + * + * The function waits in case the timers callback function is + * currently executed on the waitqueue of the timer base. The + * waitqueue is woken up after the timer callback function has + * finished execution. + */ +void hrtimer_wait_for_timer(const struct hrtimer *timer) +{ + struct hrtimer_clock_base *base = timer->base; + + if (base && base->cpu_base) + wait_event(base->cpu_base->wait, + !(timer->state & HRTIMER_STATE_CALLBACK)); +} + +#else +# define wake_up_timer_waiters(b) do { } while (0) +#endif + #ifdef CONFIG_HIGH_RES_TIMERS /* @@ -1049,6 +1100,7 @@ void hrtimer_interrupt(struct clock_even retry: now = ktime_get(); + hrtimer_trace(now, 0); expires_next.tv64 = KTIME_MAX; @@ -1077,6 +1129,8 @@ void hrtimer_interrupt(struct clock_even break; } + hrtimer_trace(timer->expires, (unsigned long) timer); + /* Move softirq callbacks to the pending list */ if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { __remove_hrtimer(timer, base, @@ -1122,7 +1176,9 @@ void hrtimer_interrupt(struct clock_even static void run_hrtimer_softirq(struct softirq_action *h) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base; + + cpu_base = &per_cpu(hrtimer_bases, raw_smp_processor_id()); spin_lock_irq(&cpu_base->lock); @@ -1162,6 +1218,8 @@ static void run_hrtimer_softirq(struct s } } spin_unlock_irq(&cpu_base->lock); + + wake_up_timer_waiters(cpu_base); } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -1212,6 +1270,8 @@ static inline void run_hrtimer_queue(str } } spin_unlock_irq(&cpu_base->lock); + + wake_up_timer_waiters(cpu_base); } /* @@ -1223,10 +1283,11 @@ static inline void run_hrtimer_queue(str */ void hrtimer_run_queues(void) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base; int i; - if (hrtimer_hres_active()) + cpu_base = &per_cpu(hrtimer_bases, raw_smp_processor_id()); + if (hrtimer_hres_active(cpu_base)) return; /* @@ -1238,7 +1299,7 @@ void hrtimer_run_queues(void) * deadlock vs. xtime_lock. */ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) - if (hrtimer_switch_to_hres()) + if (hrtimer_switch_to_hres(cpu_base)) return; hrtimer_get_softirq_time(cpu_base); @@ -1390,6 +1451,9 @@ static void __devinit init_hrtimers_cpu( cpu_base->clock_base[i].cpu_base = cpu_base; hrtimer_init_hres(cpu_base); +#ifdef CONFIG_PREEMPT_SOFTIRQS + init_waitqueue_head(&cpu_base->wait); +#endif } #ifdef CONFIG_HOTPLUG_CPU @@ -1424,7 +1488,7 @@ static void migrate_hrtimers(int cpu) tick_cancel_sched_timer(cpu); local_irq_disable(); - double_spin_lock(&new_base->lock, &old_base->lock, + raw_double_spin_lock(&new_base->lock, &old_base->lock, smp_processor_id() < cpu); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { @@ -1432,7 +1496,7 @@ static void migrate_hrtimers(int cpu) &new_base->clock_base[i]); } - double_spin_unlock(&new_base->lock, &old_base->lock, + raw_double_spin_unlock(&new_base->lock, &old_base->lock, smp_processor_id() < cpu); local_irq_enable(); put_cpu_var(hrtimer_bases); Index: linux-2.6.24-rc8-rt1/mm/memory.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/mm/memory.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/mm/memory.c 2008-01-16 22:29:04.000000000 -0500 @@ -261,18 +261,52 @@ void free_pgd_range(struct mmu_gather ** } while (pgd++, addr = next, addr != end); } +#ifdef CONFIG_IA64 +#define tlb_start_addr(tlb) (tlb)->start_addr +#define tlb_end_addr(tlb) (tlb)->end_addr +#else +#define tlb_start_addr(tlb) 0UL /* only ia64 really uses it */ +#define tlb_end_addr(tlb) 0UL /* only ia64 really uses it */ +#endif + void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling) { +#ifdef CONFIG_PREEMPT + struct vm_area_struct *unlink = vma; + int fullmm = (*tlb)->fullmm; + + if (!vma) /* Sometimes when exiting after an oops */ + return; +#ifndef CONFIG_PREEMPT_RT + if (vma->vm_next) +#endif + tlb_finish_mmu(*tlb, tlb_start_addr(*tlb), tlb_end_addr(*tlb)); + /* + * Hide vma from rmap and vmtruncate before freeeing pgtables, + * with preemption enabled, except when unmapping just one area. + */ + while (unlink) { + anon_vma_unlink(unlink); + unlink_file_vma(unlink); + unlink = unlink->vm_next; + } +#ifndef CONFIG_PREEMPT_RT + if (vma->vm_next) +#endif + *tlb = tlb_gather_mmu(vma->vm_mm, fullmm); +#endif while (vma) { struct vm_area_struct *next = vma->vm_next; unsigned long addr = vma->vm_start; +#ifndef CONFIG_PREEMPT /* * Hide vma from rmap and vmtruncate before freeing pgtables */ anon_vma_unlink(vma); unlink_file_vma(vma); +#endif if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, @@ -285,8 +319,10 @@ void free_pgtables(struct mmu_gather **t && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; +#ifndef CONFIG_PREEMPT anon_vma_unlink(vma); unlink_file_vma(vma); +#endif } free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); @@ -770,10 +806,13 @@ static unsigned long unmap_page_range(st return addr; } -#ifdef CONFIG_PREEMPT +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RT) # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) #else -/* No preempt: go for improved straight-line efficiency */ +/* + * No preempt: go for improved straight-line efficiency + * on PREEMPT_RT this is not a critical latency-path. + */ # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) #endif @@ -2512,6 +2551,28 @@ unlock: return 0; } +void pagefault_disable(void) +{ + current->pagefault_disabled++; + /* + * make sure to have issued the store before a pagefault + * can hit. + */ + barrier(); +} +EXPORT_SYMBOL(pagefault_disable); + +void pagefault_enable(void) +{ + /* + * make sure to issue those last loads/stores before enabling + * the pagefault handler again. + */ + barrier(); + current->pagefault_disabled--; +} +EXPORT_SYMBOL(pagefault_enable); + /* * By the time we get here, we already hold the mm semaphore */ Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/io_apic_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/io_apic_32.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/io_apic_32.c 2008-01-16 22:28:47.000000000 -0500 @@ -56,8 +56,8 @@ atomic_t irq_mis_count; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); int timer_over_8254 __initdata = 1; @@ -261,14 +261,14 @@ static void __unmask_IO_APIC_irq (unsign __modify_IO_APIC_irq(irq, 0, 0x00010000); } -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq (unsigned int irq) +/* trigger = 0 (edge mode) */ +static void __pcix_mask_IO_APIC_irq (unsigned int irq) { - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + __modify_IO_APIC_irq(irq, 0, 0x00008000); } -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq (unsigned int irq) +/* mask = 0, trigger = 1 (level mode) */ +static void __pcix_unmask_IO_APIC_irq (unsigned int irq) { __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); } @@ -291,6 +291,24 @@ static void unmask_IO_APIC_irq (unsigned spin_unlock_irqrestore(&ioapic_lock, flags); } +static void pcix_mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __pcix_mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void pcix_unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __pcix_unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; @@ -1236,23 +1254,28 @@ static int assign_irq_vector(int irq) return vector; } + static struct irq_chip ioapic_chip; +static struct irq_chip pcix_ioapic_chip; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 -static void ioapic_register_intr(int irq, int vector, unsigned long trigger) +static void ioapic_register_intr(int irq, int vector, unsigned long trigger, + int pcix) { + struct irq_chip *chip = pcix ? &pcix_ioapic_chip : &ioapic_chip; + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) { irq_desc[irq].status |= IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); + set_irq_chip_and_handler_name(irq, chip, handle_fasteoi_irq, + pcix ? "pcix-fasteoi" : "fasteoi"); } else { irq_desc[irq].status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); + set_irq_chip_and_handler_name(irq, chip, handle_edge_irq, + pcix ? "pcix-edge" : "edge"); } set_intr_gate(vector, interrupt[irq]); } @@ -1322,7 +1345,8 @@ static void __init setup_IO_APIC_irqs(vo if (IO_APIC_IRQ(irq)) { vector = assign_irq_vector(irq); entry.vector = vector; - ioapic_register_intr(irq, vector, IOAPIC_AUTO); + ioapic_register_intr(irq, vector, IOAPIC_AUTO, + apic > 0); if (!apic && (irq < 16)) disable_8259A_irq(irq); @@ -1493,7 +1517,7 @@ void __init print_IO_APIC(void) return; } -#if 0 +#if 1 static void print_APIC_bitfield (int base) { @@ -1900,7 +1924,7 @@ static int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. */ - if (jiffies - t1 > 4) + if (jiffies - t1 > 4 && jiffies - t1 < 16) return 1; return 0; @@ -1989,8 +2013,10 @@ static void ack_ioapic_quirk_irq(unsigne if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + /* mask = 1, trigger = 0 */ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + /* mask = 0, trigger = 1 */ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); spin_unlock(&ioapic_lock); } } @@ -2015,6 +2041,18 @@ static struct irq_chip ioapic_chip __rea .retrigger = ioapic_retrigger_irq, }; +static struct irq_chip pcix_ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = pcix_mask_IO_APIC_irq, + .unmask = pcix_unmask_IO_APIC_irq, + .ack = ack_ioapic_irq, + .eoi = ack_ioapic_irq, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; static inline void init_IO_APIC_traps(void) { @@ -2834,7 +2872,7 @@ int io_apic_set_pci_routing (int ioapic, mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, edge_level, active_high_low); - ioapic_register_intr(irq, entry.vector, edge_level); + ioapic_register_intr(irq, entry.vector, edge_level, ioapic > 0); if (!ioapic && (irq < 16)) disable_8259A_irq(irq); Index: linux-2.6.24-rc8-rt1/arch/x86/pci/Makefile_32 =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/pci/Makefile_32 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/pci/Makefile_32 2008-01-16 22:28:10.000000000 -0500 @@ -4,8 +4,9 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_32.o direct.o mmconfig-shared.o obj-$(CONFIG_PCI_DIRECT) += direct.o +obj-$(CONFIG_ACPI) += acpi.o + pci-y := fixup.o -pci-$(CONFIG_ACPI) += acpi.o pci-y += legacy.o irq.o pci-$(CONFIG_X86_VISWS) := visws.o fixup.o Index: linux-2.6.24-rc8-rt1/include/linux/spinlock.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/spinlock.h 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/spinlock.h 2008-01-16 22:29:18.000000000 -0500 @@ -44,6 +44,42 @@ * builds the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. + * + * + * Public types and naming conventions: + * ------------------------------------ + * spinlock_t: type: sleep-lock + * raw_spinlock_t: type: spin-lock (debug) + * + * spin_lock([raw_]spinlock_t): API: acquire lock, both types + * + * + * Internal types and naming conventions: + * ------------------------------------- + * __raw_spinlock_t: type: lowlevel spin-lock + * + * _spin_lock(struct rt_mutex): API: acquire sleep-lock + * __spin_lock(raw_spinlock_t): API: acquire spin-lock (highlevel) + * _raw_spin_lock(raw_spinlock_t): API: acquire spin-lock (debug) + * __raw_spin_lock(__raw_spinlock_t): API: acquire spin-lock (lowlevel) + * + * + * spin_lock(raw_spinlock_t) translates into the following chain of + * calls/inlines/macros, if spin-lock debugging is enabled: + * + * spin_lock() [include/linux/spinlock.h] + * -> __spin_lock() [kernel/spinlock.c] + * -> _raw_spin_lock() [lib/spinlock_debug.c] + * -> __raw_spin_lock() [include/asm/spinlock.h] + * + * spin_lock(spinlock_t) translates into the following chain of + * calls/inlines/macros: + * + * spin_lock() [include/linux/spinlock.h] + * -> _spin_lock() [include/linux/spinlock.h] + * -> rt_spin_lock() [kernel/rtmutex.c] + * -> rt_spin_lock_fastlock() [kernel/rtmutex.c] + * -> rt_spin_lock_slowlock() [kernel/rtmutex.c] */ #include @@ -51,29 +87,15 @@ #include #include #include +#include #include #include +#include +#include #include /* - * Must define these before including other files, inline functions need them - */ -#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME - -#define LOCK_SECTION_START(extra) \ - ".subsection 1\n\t" \ - extra \ - ".ifndef " LOCK_SECTION_NAME "\n\t" \ - LOCK_SECTION_NAME ":\n\t" \ - ".endif\n" - -#define LOCK_SECTION_END \ - ".previous\n\t" - -#define __lockfunc fastcall __attribute__((section(".spinlock.text"))) - -/* * Pull the raw_spinlock_t and raw_rwlock_t definitions: */ #include @@ -89,42 +111,10 @@ extern int __lockfunc generic__raw_read_ # include #endif -#ifdef CONFIG_DEBUG_SPINLOCK - extern void __spin_lock_init(spinlock_t *lock, const char *name, - struct lock_class_key *key); -# define spin_lock_init(lock) \ -do { \ - static struct lock_class_key __key; \ - \ - __spin_lock_init((lock), #lock, &__key); \ -} while (0) - -#else -# define spin_lock_init(lock) \ - do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0) -#endif - -#ifdef CONFIG_DEBUG_SPINLOCK - extern void __rwlock_init(rwlock_t *lock, const char *name, - struct lock_class_key *key); -# define rwlock_init(lock) \ -do { \ - static struct lock_class_key __key; \ - \ - __rwlock_init((lock), #lock, &__key); \ -} while (0) -#else -# define rwlock_init(lock) \ - do { *(lock) = RW_LOCK_UNLOCKED; } while (0) -#endif - -#define spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) - -/** - * spin_unlock_wait - wait until the spinlock gets unlocked - * @lock: the spinlock in question. +/* + * Pull the RT types: */ -#define spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock) +#include /* * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: @@ -136,16 +126,16 @@ do { \ #endif #ifdef CONFIG_DEBUG_SPINLOCK - extern void _raw_spin_lock(spinlock_t *lock); -#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock) - extern int _raw_spin_trylock(spinlock_t *lock); - extern void _raw_spin_unlock(spinlock_t *lock); - extern void _raw_read_lock(rwlock_t *lock); - extern int _raw_read_trylock(rwlock_t *lock); - extern void _raw_read_unlock(rwlock_t *lock); - extern void _raw_write_lock(rwlock_t *lock); - extern int _raw_write_trylock(rwlock_t *lock); - extern void _raw_write_unlock(rwlock_t *lock); + extern __lockfunc void _raw_spin_lock(raw_spinlock_t *lock); +# define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock) + extern __lockfunc int _raw_spin_trylock(raw_spinlock_t *lock); + extern __lockfunc void _raw_spin_unlock(raw_spinlock_t *lock); + extern __lockfunc void _raw_read_lock(raw_rwlock_t *lock); + extern __lockfunc int _raw_read_trylock(raw_rwlock_t *lock); + extern __lockfunc void _raw_read_unlock(raw_rwlock_t *lock); + extern __lockfunc void _raw_write_lock(raw_rwlock_t *lock); + extern __lockfunc int _raw_write_trylock(raw_rwlock_t *lock); + extern __lockfunc void _raw_write_unlock(raw_rwlock_t *lock); #else # define _raw_spin_lock(lock) __raw_spin_lock(&(lock)->raw_lock) # define _raw_spin_lock_flags(lock, flags) \ @@ -160,141 +150,446 @@ do { \ # define _raw_write_unlock(rwlock) __raw_write_unlock(&(rwlock)->raw_lock) #endif -#define read_can_lock(rwlock) __raw_read_can_lock(&(rwlock)->raw_lock) -#define write_can_lock(rwlock) __raw_write_can_lock(&(rwlock)->raw_lock) +extern int __bad_spinlock_type(void); +extern int __bad_rwlock_type(void); + +extern void +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key); + +extern void __lockfunc rt_spin_lock(spinlock_t *lock); +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass); +extern void __lockfunc rt_spin_unlock(spinlock_t *lock); +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock); +extern int __lockfunc +rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags); +extern int __lockfunc rt_spin_trylock(spinlock_t *lock); +extern int _atomic_dec_and_spin_lock(spinlock_t *lock, atomic_t *atomic); + +/* + * lockdep-less calls, for derived types like rwlock: + * (for trylock they can use rt_mutex_trylock() directly. + */ +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock); +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); + +#ifdef CONFIG_PREEMPT_RT +# define _spin_lock(l) rt_spin_lock(l) +# define _spin_lock_nested(l, s) rt_spin_lock_nested(l, s) +# define _spin_lock_bh(l) rt_spin_lock(l) +# define _spin_lock_irq(l) rt_spin_lock(l) +# define _spin_unlock(l) rt_spin_unlock(l) +# define _spin_unlock_no_resched(l) rt_spin_unlock(l) +# define _spin_unlock_bh(l) rt_spin_unlock(l) +# define _spin_unlock_irq(l) rt_spin_unlock(l) +# define _spin_unlock_irqrestore(l, f) rt_spin_unlock(l) +static inline unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +{ + rt_spin_lock(lock); + return 0; +} +static inline unsigned long __lockfunc +_spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +{ + rt_spin_lock_nested(lock, subclass); + return 0; +} +#else +static inline unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +{ + return 0; +} +static inline unsigned long __lockfunc +_spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +{ + return 0; +} +# define _spin_lock(l) do { } while (0) +# define _spin_lock_nested(l, s) do { } while (0) +# define _spin_lock_bh(l) do { } while (0) +# define _spin_lock_irq(l) do { } while (0) +# define _spin_unlock(l) do { } while (0) +# define _spin_unlock_no_resched(l) do { } while (0) +# define _spin_unlock_bh(l) do { } while (0) +# define _spin_unlock_irq(l) do { } while (0) +# define _spin_unlock_irqrestore(l, f) do { } while (0) +#endif + +#define _spin_lock_init(sl, n, f, l) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_spin_lock_init(sl, n, &__key); \ +} while (0) + +# ifdef CONFIG_PREEMPT_RT +# define _spin_can_lock(l) (!rt_mutex_is_locked(&(l)->lock)) +# define _spin_is_locked(l) rt_mutex_is_locked(&(l)->lock) +# define _spin_unlock_wait(l) rt_spin_unlock_wait(l) + +# define _spin_trylock(l) rt_spin_trylock(l) +# define _spin_trylock_bh(l) rt_spin_trylock(l) +# define _spin_trylock_irq(l) rt_spin_trylock(l) +# define _spin_trylock_irqsave(l,f) rt_spin_trylock_irqsave(l, f) +# else + + extern int this_should_never_be_called_on_non_rt(spinlock_t *lock); +# define TSNBCONRT(l) this_should_never_be_called_on_non_rt(l) +# define _spin_can_lock(l) TSNBCONRT(l) +# define _spin_is_locked(l) TSNBCONRT(l) +# define _spin_unlock_wait(l) TSNBCONRT(l) + +# define _spin_trylock(l) TSNBCONRT(l) +# define _spin_trylock_bh(l) TSNBCONRT(l) +# define _spin_trylock_irq(l) TSNBCONRT(l) +# define _spin_trylock_irqsave(l,f) TSNBCONRT(l) +#endif + +extern void __lockfunc rt_write_lock(rwlock_t *rwlock); +extern void __lockfunc rt_read_lock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, + unsigned long *flags); +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock); +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock); +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock); +extern void +__rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key); + +#define _rwlock_init(rwl, n, f, l) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_rwlock_init(rwl, n, &__key); \ +} while (0) + +#ifdef CONFIG_PREEMPT_RT +# define rt_read_can_lock(rwl) (!rt_mutex_is_locked(&(rwl)->lock)) +# define rt_write_can_lock(rwl) (!rt_mutex_is_locked(&(rwl)->lock)) +#else + extern int rt_rwlock_can_lock_never_call_on_non_rt(rwlock_t *rwlock); +# define rt_read_can_lock(rwl) rt_rwlock_can_lock_never_call_on_non_rt(rwl) +# define rt_write_can_lock(rwl) rt_rwlock_can_lock_never_call_on_non_rt(rwl) +#endif + +# define _read_can_lock(rwl) rt_read_can_lock(rwl) +# define _write_can_lock(rwl) rt_write_can_lock(rwl) + +# define _read_trylock(rwl) rt_read_trylock(rwl) +# define _write_trylock(rwl) rt_write_trylock(rwl) +# define _write_trylock_irqsave(rwl, flags) \ + rt_write_trylock_irqsave(rwl, flags) + +# define _read_lock(rwl) rt_read_lock(rwl) +# define _write_lock(rwl) rt_write_lock(rwl) +# define _read_unlock(rwl) rt_read_unlock(rwl) +# define _write_unlock(rwl) rt_write_unlock(rwl) + +# define _read_lock_bh(rwl) rt_read_lock(rwl) +# define _write_lock_bh(rwl) rt_write_lock(rwl) +# define _read_unlock_bh(rwl) rt_read_unlock(rwl) +# define _write_unlock_bh(rwl) rt_write_unlock(rwl) + +# define _read_lock_irq(rwl) rt_read_lock(rwl) +# define _write_lock_irq(rwl) rt_write_lock(rwl) +# define _read_unlock_irq(rwl) rt_read_unlock(rwl) +# define _write_unlock_irq(rwl) rt_write_unlock(rwl) + +# define _read_lock_irqsave(rwl) rt_read_lock_irqsave(rwl) +# define _write_lock_irqsave(rwl) rt_write_lock_irqsave(rwl) + +# define _read_unlock_irqrestore(rwl, f) rt_read_unlock(rwl) +# define _write_unlock_irqrestore(rwl, f) rt_write_unlock(rwl) + +#ifdef CONFIG_DEBUG_SPINLOCK + extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, + struct lock_class_key *key); +# define _raw_spin_lock_init(lock, name, file, line) \ +do { \ + static struct lock_class_key __key; \ + \ + __raw_spin_lock_init((lock), #lock, &__key); \ +} while (0) + +#else +#define __raw_spin_lock_init(lock) \ + do { *(lock) = RAW_SPIN_LOCK_UNLOCKED(lock); } while (0) +# define _raw_spin_lock_init(lock, name, file, line) __raw_spin_lock_init(lock) +#endif + +/* + * PICK_SPIN_OP()/PICK_RW_OP() are simple redirectors for PICK_FUNCTION + */ +#define PICK_SPIN_OP(...) \ + PICK_FUNCTION(raw_spinlock_t *, spinlock_t *, ##__VA_ARGS__) +#define PICK_SPIN_OP_RET(...) \ + PICK_FUNCTION_RET(raw_spinlock_t *, spinlock_t *, ##__VA_ARGS__) +#define PICK_RW_OP(...) PICK_FUNCTION(raw_rwlock_t *, rwlock_t *, ##__VA_ARGS__) +#define PICK_RW_OP_RET(...) \ + PICK_FUNCTION_RET(raw_rwlock_t *, rwlock_t *, ##__VA_ARGS__) + +#define spin_lock_init(lock) \ + PICK_SPIN_OP(_raw_spin_lock_init, _spin_lock_init, lock, #lock, \ + __FILE__, __LINE__) + +#ifdef CONFIG_DEBUG_SPINLOCK + extern void __raw_rwlock_init(raw_rwlock_t *lock, const char *name, + struct lock_class_key *key); +# define _raw_rwlock_init(lock, name, file, line) \ +do { \ + static struct lock_class_key __key; \ + \ + __raw_rwlock_init((lock), #lock, &__key); \ +} while (0) +#else +#define __raw_rwlock_init(lock) \ + do { *(lock) = RAW_RW_LOCK_UNLOCKED(lock); } while (0) +# define _raw_rwlock_init(lock, name, file, line) __raw_rwlock_init(lock) +#endif + +#define rwlock_init(lock) \ + PICK_RW_OP(_raw_rwlock_init, _rwlock_init, lock, #lock, \ + __FILE__, __LINE__) + +#define __spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) + +#define spin_is_locked(lock) \ + PICK_SPIN_OP_RET(__spin_is_locked, _spin_is_locked, lock) + +#define __spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock) + +#define spin_unlock_wait(lock) \ + PICK_SPIN_OP(__spin_unlock_wait, _spin_unlock_wait, lock) /* * Define the various spin_lock and rw_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various * methods are defined as nops in the case they are not required. */ -#define spin_trylock(lock) __cond_lock(lock, _spin_trylock(lock)) -#define read_trylock(lock) __cond_lock(lock, _read_trylock(lock)) -#define write_trylock(lock) __cond_lock(lock, _write_trylock(lock)) +#define spin_trylock(lock) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock, _spin_trylock, lock)) + +#define read_trylock(lock) \ + __cond_lock(lock, PICK_RW_OP_RET(__read_trylock, _read_trylock, lock)) + +#define write_trylock(lock) \ + __cond_lock(lock, PICK_RW_OP_RET(__write_trylock, _write_trylock, lock)) -#define spin_lock(lock) _spin_lock(lock) +#define write_trylock_irqsave(lock, flags) \ + __cond_lock(lock, PICK_RW_OP_RET(__write_trylock_irqsave, \ + _write_trylock_irqsave, lock, &flags)) + +#define __spin_can_lock(lock) __raw_spin_can_lock(&(lock)->raw_lock) +#define __read_can_lock(lock) __raw_read_can_lock(&(lock)->raw_lock) +#define __write_can_lock(lock) __raw_write_can_lock(&(lock)->raw_lock) + +#define spin_can_lock(lock) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__spin_can_lock, _spin_can_lock,\ + lock)) + +#define read_can_lock(lock) \ + __cond_lock(lock, PICK_RW_OP_RET(__read_can_lock, _read_can_lock, lock)) + +#define write_can_lock(lock) \ + __cond_lock(lock, PICK_RW_OP_RET(__write_can_lock, _write_can_lock,\ + lock)) + +#define spin_lock(lock) PICK_SPIN_OP(__spin_lock, _spin_lock, lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define spin_lock_nested(lock, subclass) _spin_lock_nested(lock, subclass) +# define spin_lock_nested(lock, subclass) \ + PICK_SPIN_OP(__spin_lock_nested, _spin_lock_nested, lock, subclass) #else -# define spin_lock_nested(lock, subclass) _spin_lock(lock) +# define spin_lock_nested(lock, subclass) spin_lock(lock) #endif -#define write_lock(lock) _write_lock(lock) -#define read_lock(lock) _read_lock(lock) +#define write_lock(lock) PICK_RW_OP(__write_lock, _write_lock, lock) -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +#define read_lock(lock) PICK_RW_OP(__read_lock, _read_lock, lock) -#define spin_lock_irqsave(lock, flags) flags = _spin_lock_irqsave(lock) -#define read_lock_irqsave(lock, flags) flags = _read_lock_irqsave(lock) -#define write_lock_irqsave(lock, flags) flags = _write_lock_irqsave(lock) +# define spin_lock_irqsave(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + flags = PICK_SPIN_OP_RET(__spin_lock_irqsave, _spin_lock_irqsave, \ + lock); \ +} while (0) #ifdef CONFIG_DEBUG_LOCK_ALLOC -#define spin_lock_irqsave_nested(lock, flags, subclass) \ - flags = _spin_lock_irqsave_nested(lock, subclass) +# define spin_lock_irqsave_nested(lock, flags, subclass) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + flags = PICK_SPIN_OP_RET(__spin_lock_irqsave_nested, \ + _spin_lock_irqsave_nested, lock, subclass); \ +} while (0) #else -#define spin_lock_irqsave_nested(lock, flags, subclass) \ - flags = _spin_lock_irqsave(lock) +# define spin_lock_irqsave_nested(lock, flags, subclass) \ + spin_lock_irqsave(lock, flags) #endif -#else +# define read_lock_irqsave(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + flags = PICK_RW_OP_RET(__read_lock_irqsave, _read_lock_irqsave, lock);\ +} while (0) + +# define write_lock_irqsave(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + flags = PICK_RW_OP_RET(__write_lock_irqsave, _write_lock_irqsave,lock);\ +} while (0) -#define spin_lock_irqsave(lock, flags) _spin_lock_irqsave(lock, flags) -#define read_lock_irqsave(lock, flags) _read_lock_irqsave(lock, flags) -#define write_lock_irqsave(lock, flags) _write_lock_irqsave(lock, flags) -#define spin_lock_irqsave_nested(lock, flags, subclass) \ - spin_lock_irqsave(lock, flags) +#define spin_lock_irq(lock) PICK_SPIN_OP(__spin_lock_irq, _spin_lock_irq, lock) -#endif +#define spin_lock_bh(lock) PICK_SPIN_OP(__spin_lock_bh, _spin_lock_bh, lock) + +#define read_lock_irq(lock) PICK_RW_OP(__read_lock_irq, _read_lock_irq, lock) -#define spin_lock_irq(lock) _spin_lock_irq(lock) -#define spin_lock_bh(lock) _spin_lock_bh(lock) +#define read_lock_bh(lock) PICK_RW_OP(__read_lock_bh, _read_lock_bh, lock) -#define read_lock_irq(lock) _read_lock_irq(lock) -#define read_lock_bh(lock) _read_lock_bh(lock) +#define write_lock_irq(lock) PICK_RW_OP(__write_lock_irq, _write_lock_irq, lock) -#define write_lock_irq(lock) _write_lock_irq(lock) -#define write_lock_bh(lock) _write_lock_bh(lock) +#define write_lock_bh(lock) PICK_RW_OP(__write_lock_bh, _write_lock_bh, lock) + +#define spin_unlock(lock) PICK_SPIN_OP(__spin_unlock, _spin_unlock, lock) + +#define read_unlock(lock) PICK_RW_OP(__read_unlock, _read_unlock, lock) + +#define write_unlock(lock) PICK_RW_OP(__write_unlock, _write_unlock, lock) + +#define spin_unlock_no_resched(lock) \ + PICK_SPIN_OP(__spin_unlock_no_resched, _spin_unlock_no_resched, lock) + +#define spin_unlock_irqrestore(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + PICK_SPIN_OP(__spin_unlock_irqrestore, _spin_unlock_irqrestore, \ + lock, flags); \ +} while (0) + +#define spin_unlock_irq(lock) \ + PICK_SPIN_OP(__spin_unlock_irq, _spin_unlock_irq, lock) +#define spin_unlock_bh(lock) \ + PICK_SPIN_OP(__spin_unlock_bh, _spin_unlock_bh, lock) + +#define read_unlock_irqrestore(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + PICK_RW_OP(__read_unlock_irqrestore, _read_unlock_irqrestore, \ + lock, flags); \ +} while (0) + +#define read_unlock_irq(lock) \ + PICK_RW_OP(__read_unlock_irq, _read_unlock_irq, lock) +#define read_unlock_bh(lock) PICK_RW_OP(__read_unlock_bh, _read_unlock_bh, lock) + +#define write_unlock_irqrestore(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + PICK_RW_OP(__write_unlock_irqrestore, _write_unlock_irqrestore, \ + lock, flags); \ +} while (0) +#define write_unlock_irq(lock) \ + PICK_RW_OP(__write_unlock_irq, _write_unlock_irq, lock) + +#define write_unlock_bh(lock) \ + PICK_RW_OP(__write_unlock_bh, _write_unlock_bh, lock) + +#define spin_trylock_bh(lock) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock_bh, _spin_trylock_bh,\ + lock)) + +#define spin_trylock_irq(lock) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock_irq, \ + _spin_trylock_irq, lock)) + +#define spin_trylock_irqsave(lock, flags) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock_irqsave, \ + _spin_trylock_irqsave, lock, &flags)) + +/* "lock on reference count zero" */ +#ifndef ATOMIC_DEC_AND_LOCK +# include + extern int __atomic_dec_and_spin_lock(raw_spinlock_t *lock, atomic_t *atomic); +#endif + +#define atomic_dec_and_lock(atomic, lock) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__atomic_dec_and_spin_lock, \ + _atomic_dec_and_spin_lock, lock, atomic)) /* - * We inline the unlock functions in the nondebug case: + * bit-based spin_lock() + * + * Don't use this unless you really need to: spin_lock() and spin_unlock() + * are significantly faster. */ -#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || \ - !defined(CONFIG_SMP) -# define spin_unlock(lock) _spin_unlock(lock) -# define read_unlock(lock) _read_unlock(lock) -# define write_unlock(lock) _write_unlock(lock) -# define spin_unlock_irq(lock) _spin_unlock_irq(lock) -# define read_unlock_irq(lock) _read_unlock_irq(lock) -# define write_unlock_irq(lock) _write_unlock_irq(lock) -#else -# define spin_unlock(lock) \ - do {__raw_spin_unlock(&(lock)->raw_lock); __release(lock); } while (0) -# define read_unlock(lock) \ - do {__raw_read_unlock(&(lock)->raw_lock); __release(lock); } while (0) -# define write_unlock(lock) \ - do {__raw_write_unlock(&(lock)->raw_lock); __release(lock); } while (0) -# define spin_unlock_irq(lock) \ -do { \ - __raw_spin_unlock(&(lock)->raw_lock); \ - __release(lock); \ - local_irq_enable(); \ -} while (0) -# define read_unlock_irq(lock) \ -do { \ - __raw_read_unlock(&(lock)->raw_lock); \ - __release(lock); \ - local_irq_enable(); \ -} while (0) -# define write_unlock_irq(lock) \ -do { \ - __raw_write_unlock(&(lock)->raw_lock); \ - __release(lock); \ - local_irq_enable(); \ -} while (0) +static inline void bit_spin_lock(int bitnum, unsigned long *addr) +{ + /* + * Assuming the lock is uncontended, this never enters + * the body of the outer loop. If it is contended, then + * within the inner loop a non-atomic test is used to + * busywait with less bus contention for a good time to + * attempt to acquire the lock bit. + */ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) + while (test_and_set_bit(bitnum, addr)) + while (test_bit(bitnum, addr)) + cpu_relax(); #endif + __acquire(bitlock); +} -#define spin_unlock_irqrestore(lock, flags) \ - _spin_unlock_irqrestore(lock, flags) -#define spin_unlock_bh(lock) _spin_unlock_bh(lock) - -#define read_unlock_irqrestore(lock, flags) \ - _read_unlock_irqrestore(lock, flags) -#define read_unlock_bh(lock) _read_unlock_bh(lock) - -#define write_unlock_irqrestore(lock, flags) \ - _write_unlock_irqrestore(lock, flags) -#define write_unlock_bh(lock) _write_unlock_bh(lock) - -#define spin_trylock_bh(lock) __cond_lock(lock, _spin_trylock_bh(lock)) - -#define spin_trylock_irq(lock) \ -({ \ - local_irq_disable(); \ - spin_trylock(lock) ? \ - 1 : ({ local_irq_enable(); 0; }); \ -}) +/* + * Return true if it was acquired + */ +static inline int bit_spin_trylock(int bitnum, unsigned long *addr) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) + if (test_and_set_bit(bitnum, addr)) + return 0; +#endif + __acquire(bitlock); + return 1; +} -#define spin_trylock_irqsave(lock, flags) \ -({ \ - local_irq_save(flags); \ - spin_trylock(lock) ? \ - 1 : ({ local_irq_restore(flags); 0; }); \ -}) +/* + * bit-based spin_unlock() + */ +static inline void bit_spin_unlock(int bitnum, unsigned long *addr) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) + BUG_ON(!test_bit(bitnum, addr)); + smp_mb__before_clear_bit(); + clear_bit(bitnum, addr); +#endif + __release(bitlock); +} -#define write_trylock_irqsave(lock, flags) \ -({ \ - local_irq_save(flags); \ - write_trylock(lock) ? \ - 1 : ({ local_irq_restore(flags); 0; }); \ -}) +/* + * Return true if the lock is held. + */ +static inline int bit_spin_is_locked(int bitnum, unsigned long *addr) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) + return test_bit(bitnum, addr); +#else + return 1; +#endif +} + +/** + * __raw_spin_can_lock - would __raw_spin_trylock() succeed? + * @lock: the spinlock in question. + */ +#define __raw_spin_can_lock(lock) (!__raw_spin_is_locked(lock)) /* * Locks two spinlocks l1 and l2. * l1_first indicates if spinlock l1 should be taken first. */ -static inline void double_spin_lock(spinlock_t *l1, spinlock_t *l2, - bool l1_first) +static inline void +raw_double_spin_lock(raw_spinlock_t *l1, raw_spinlock_t *l2, bool l1_first) __acquires(l1) __acquires(l2) { @@ -307,13 +602,29 @@ static inline void double_spin_lock(spin } } +static inline void +double_spin_lock(spinlock_t *l1, spinlock_t *l2, bool l1_first) + __acquires(l1) + __acquires(l2) +{ + if (l1_first) { + spin_lock(l1); + spin_lock(l2); + } else { + spin_lock(l2); + spin_lock(l1); + } +} + + /* * Unlocks two spinlocks l1 and l2. * l1_taken_first indicates if spinlock l1 was taken first and therefore * should be released after spinlock l2. */ -static inline void double_spin_unlock(spinlock_t *l1, spinlock_t *l2, - bool l1_taken_first) +static inline void +raw_double_spin_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2, + bool l1_taken_first) __releases(l1) __releases(l2) { @@ -326,24 +637,19 @@ static inline void double_spin_unlock(sp } } -/* - * Pull the atomic_t declaration: - * (asm-mips/atomic.h needs above definitions) - */ -#include -/** - * atomic_dec_and_lock - lock on reaching reference count zero - * @atomic: the atomic counter - * @lock: the spinlock in question - */ -extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); -#define atomic_dec_and_lock(atomic, lock) \ - __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) - -/** - * spin_can_lock - would spin_trylock() succeed? - * @lock: the spinlock in question. - */ -#define spin_can_lock(lock) (!spin_is_locked(lock)) +static inline void +double_spin_unlock(spinlock_t *l1, spinlock_t *l2, bool l1_taken_first) + __releases(l1) + __releases(l2) +{ + if (l1_taken_first) { + spin_unlock(l2); + spin_unlock(l1); + } else { + spin_unlock(l1); + spin_unlock(l2); + } +} #endif /* __LINUX_SPINLOCK_H */ + Index: linux-2.6.24-rc8-rt1/kernel/irq/migration.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/irq/migration.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/irq/migration.c 2008-01-16 22:28:10.000000000 -0500 @@ -61,6 +61,7 @@ void move_masked_irq(int irq) void move_native_irq(int irq) { struct irq_desc *desc = irq_desc + irq; + int mask = 1; if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; @@ -68,8 +69,17 @@ void move_native_irq(int irq) if (unlikely(desc->status & IRQ_DISABLED)) return; - desc->chip->mask(irq); + /* + * If the irq is already in progress, it should be masked. + * If we unmask it, we might cause an interrupt storm on RT. + */ + if (unlikely(desc->status & IRQ_INPROGRESS)) + mask = 0; + + if (mask) + desc->chip->mask(irq); move_masked_irq(irq); - desc->chip->unmask(irq); + if (mask) + desc->chip->unmask(irq); } Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/io_apic_64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/io_apic_64.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/io_apic_64.c 2008-01-16 22:28:57.000000000 -0500 @@ -91,8 +91,8 @@ int timer_over_8254 __initdata = 1; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); -DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +DEFINE_RAW_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -205,6 +205,9 @@ static inline void io_apic_sync(unsigned reg ACTION; \ io_apic_modify(entry->apic, reg); \ FINAL; \ + /* Force POST flush by reading: */ \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + \ if (!entry->next) \ break; \ entry = irq_2_pin + entry->next; \ @@ -349,10 +352,11 @@ static void add_pin_to_irq(unsigned int static void name##_IO_APIC_irq (unsigned int irq) \ __DO_ACTION(R, ACTION, FINAL) -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ +DO_ACTION( __mask, 0, |= 0x00010000, ) /* mask = 1 */ +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) /* mask = 0 */ + +DO_ACTION( __pcix_mask, 0, &= 0xffff7fff, ) /* edge */ +DO_ACTION( __pcix_unmask, 0, = (reg & 0xfffeffff) | 0x00008000, ) /* level */ static void mask_IO_APIC_irq (unsigned int irq) { @@ -371,6 +375,23 @@ static void unmask_IO_APIC_irq (unsigned __unmask_IO_APIC_irq(irq); spin_unlock_irqrestore(&ioapic_lock, flags); } +static void pcix_mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __pcix_mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void pcix_unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __pcix_unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { @@ -796,17 +817,20 @@ void __setup_vector_irq(int cpu) static struct irq_chip ioapic_chip; +static struct irq_chip pcix_ioapic_chip; -static void ioapic_register_intr(int irq, unsigned long trigger) +static void ioapic_register_intr(int irq, unsigned long trigger, int pcix) { + struct irq_chip *chip = pcix ? &pcix_ioapic_chip : &ioapic_chip; + if (trigger) { irq_desc[irq].status |= IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); + set_irq_chip_and_handler_name(irq, chip, handle_fasteoi_irq, + pcix ? "pcix-fasteoi" : "fasteoi"); } else { irq_desc[irq].status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); + set_irq_chip_and_handler_name(irq, chip, handle_edge_irq, + pcix ? "pcix-edge" : "edge"); } } @@ -851,7 +875,7 @@ static void setup_IO_APIC_irq(int apic, if (trigger) entry.mask = 1; - ioapic_register_intr(irq, trigger); + ioapic_register_intr(irq, trigger, apic > 0); if (irq < 16) disable_8259A_irq(irq); @@ -1440,7 +1464,8 @@ static void ack_apic_level(unsigned int irq_complete_move(irq); #if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) /* If we are moving the irq we need to mask it */ - if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { + if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING) && + !(irq_desc[irq].status & IRQ_INPROGRESS)) { do_unmask_irq = 1; mask_IO_APIC_irq(irq); } @@ -1487,14 +1512,27 @@ static void ack_apic_level(unsigned int } static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +static struct irq_chip pcix_ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = pcix_mask_IO_APIC_irq, + .unmask = pcix_unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, + .set_affinity = set_ioapic_affinity_irq, #endif .retrigger = ioapic_retrigger_irq, }; @@ -1694,7 +1732,6 @@ static inline void check_timer(void) */ unmask_IO_APIC_irq(0); if (!no_timer_check && timer_irq_works()) { - nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); setup_nmi(); @@ -1720,7 +1757,6 @@ static inline void check_timer(void) setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector); if (timer_irq_works()) { apic_printk(APIC_VERBOSE," works.\n"); - nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); } Index: linux-2.6.24-rc8-rt1/kernel/audit.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/audit.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/audit.c 2008-01-16 22:28:10.000000000 -0500 @@ -1130,7 +1130,7 @@ struct audit_buffer *audit_log_start(str { struct audit_buffer *ab = NULL; struct timespec t; - unsigned int serial; + unsigned int serial = 0 /* shut up gcc */; int reserve; unsigned long timeout_start = jiffies; Index: linux-2.6.24-rc8-rt1/net/core/flow.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/net/core/flow.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/net/core/flow.c 2008-01-16 22:28:34.000000000 -0500 @@ -40,9 +40,10 @@ atomic_t flow_cache_genid = ATOMIC_INIT( static u32 flow_hash_shift; #define flow_hash_size (1 << flow_hash_shift) -static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; -#define flow_table(cpu) (per_cpu(flow_tables, cpu)) +static DEFINE_PER_CPU_LOCKED(struct flow_cache_entry **, flow_tables); + +#define flow_table(cpu) (per_cpu_var_locked(flow_tables, cpu)) static struct kmem_cache *flow_cachep __read_mostly; @@ -169,24 +170,24 @@ static int flow_key_compare(struct flowi void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, flow_resolve_t resolver) { - struct flow_cache_entry *fle, **head; + struct flow_cache_entry **table, *fle, **head = NULL /* shut up GCC */; unsigned int hash; int cpu; local_bh_disable(); - cpu = smp_processor_id(); + table = get_cpu_var_locked(flow_tables, &cpu); fle = NULL; /* Packet really early in init? Making flow_cache_init a * pre-smp initcall would solve this. --RR */ - if (!flow_table(cpu)) + if (!table) goto nocache; if (flow_hash_rnd_recalc(cpu)) flow_new_hash_rnd(cpu); hash = flow_hash_code(key, cpu); - head = &flow_table(cpu)[hash]; + head = &table[hash]; for (fle = *head; fle; fle = fle->next) { if (fle->family == family && fle->dir == dir && @@ -196,6 +197,7 @@ void *flow_cache_lookup(struct flowi *ke if (ret) atomic_inc(fle->object_ref); + put_cpu_var_locked(flow_tables, cpu); local_bh_enable(); return ret; @@ -221,6 +223,8 @@ void *flow_cache_lookup(struct flowi *ke } nocache: + put_cpu_var_locked(flow_tables, cpu); + { int err; void *obj; @@ -250,14 +254,15 @@ nocache: static void flow_cache_flush_tasklet(unsigned long data) { struct flow_flush_info *info = (void *)data; + struct flow_cache_entry **table; int i; int cpu; - cpu = smp_processor_id(); + table = get_cpu_var_locked(flow_tables, &cpu); for (i = 0; i < flow_hash_size; i++) { struct flow_cache_entry *fle; - fle = flow_table(cpu)[i]; + fle = table[i]; for (; fle; fle = fle->next) { unsigned genid = atomic_read(&flow_cache_genid); @@ -268,6 +273,7 @@ static void flow_cache_flush_tasklet(uns atomic_dec(fle->object_ref); } } + put_cpu_var_locked(flow_tables, cpu); if (atomic_dec_and_test(&info->cpuleft)) complete(&info->completion); Index: linux-2.6.24-rc8-rt1/net/sunrpc/svc.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/net/sunrpc/svc.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/net/sunrpc/svc.c 2008-01-16 22:28:10.000000000 -0500 @@ -547,7 +547,7 @@ __svc_create_thread(svc_thread_fn func, struct svc_rqst *rqstp; int error = -ENOMEM; int have_oldmask = 0; - cpumask_t oldmask; + cpumask_t oldmask = CPU_MASK_NONE /* shut up GCC */; rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); if (!rqstp) Index: linux-2.6.24-rc8-rt1/sound/core/control_compat.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/sound/core/control_compat.c 2008-01-16 22:27:46.000000000 -0500 +++ linux-2.6.24-rc8-rt1/sound/core/control_compat.c 2008-01-16 22:28:10.000000000 -0500 @@ -219,7 +219,7 @@ static int copy_ctl_value_from_user(stru struct snd_ctl_elem_value32 __user *data32, int *typep, int *countp) { - int i, type, count, size; + int i, type, count = 0 /* shut up gcc warning */, size; unsigned int indirect; if (copy_from_user(&data->id, &data32->id, sizeof(data->id))) Index: linux-2.6.24-rc8-rt1/include/net/netfilter/nf_conntrack.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/net/netfilter/nf_conntrack.h 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/net/netfilter/nf_conntrack.h 2008-01-16 22:29:09.000000000 -0500 @@ -63,11 +63,14 @@ union nf_conntrack_help { #ifdef CONFIG_NETFILTER_DEBUG #define NF_CT_ASSERT(x) \ do { \ - if (!(x)) \ + if (!(x)) { \ /* Wooah! I'm tripping my conntrack in a frenzy of \ netplay... */ \ printk("NF_CT_ASSERT: %s:%i(%s)\n", \ __FILE__, __LINE__, __FUNCTION__); \ + if (printk_ratelimit()) \ + WARN_ON(1); \ + } \ } while(0) #else #define NF_CT_ASSERT(x) @@ -256,13 +259,13 @@ extern atomic_t nf_conntrack_count; extern int nf_conntrack_max; DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); -#define NF_CT_STAT_INC(count) (__get_cpu_var(nf_conntrack_stat).count++) #define NF_CT_STAT_INC_ATOMIC(count) \ do { \ local_bh_disable(); \ - __get_cpu_var(nf_conntrack_stat).count++; \ + __raw_get_cpu_var(nf_conntrack_stat).count++; \ local_bh_enable(); \ } while (0) +#define NF_CT_STAT_INC(count) (__raw_get_cpu_var(nf_conntrack_stat).count++) extern int nf_conntrack_register_cache(u_int32_t features, const char *name, size_t size); Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/crash.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/crash.c 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/crash.c 2008-01-16 22:28:11.000000000 -0500 @@ -78,14 +78,6 @@ static int crash_nmi_callback(struct not return 1; } -static void smp_send_nmi_allbutself(void) -{ - cpumask_t mask = cpu_online_map; - cpu_clear(safe_smp_processor_id(), mask); - if (!cpus_empty(mask)) - send_IPI_mask(mask, NMI_VECTOR); -} - static struct notifier_block crash_nmi_nb = { .notifier_call = crash_nmi_callback, }; Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/irq_64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/irq_64.c 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/irq_64.c 2008-01-16 22:28:12.000000000 -0500 @@ -145,10 +145,18 @@ asmlinkage unsigned int do_IRQ(struct pt unsigned vector = ~regs->orig_rax; unsigned irq; + irq_show_regs_callback(smp_processor_id(), regs); + exit_idle(); irq_enter(); irq = __get_cpu_var(vector_irq)[vector]; +#ifdef CONFIG_EVENT_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif + trace_special(regs->rip, irq, 0); + #ifdef CONFIG_DEBUG_STACKOVERFLOW stack_overflow_check(regs); #endif Index: linux-2.6.24-rc8-rt1/include/asm-x86/apic_32.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/apic_32.h 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/apic_32.h 2008-01-16 22:28:11.000000000 -0500 @@ -118,6 +118,8 @@ extern int local_apic_timer_c2_ok; extern int local_apic_timer_disabled; +extern void smp_send_nmi_allbutself(void); + #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 Index: linux-2.6.24-rc8-rt1/include/asm-x86/apic_64.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/apic_64.h 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/apic_64.h 2008-01-16 22:28:57.000000000 -0500 @@ -87,6 +87,8 @@ extern void setup_APIC_extended_lvt(unsi extern int apic_is_clustered_box(void); +extern void smp_send_nmi_allbutself(void); + #define K8_APIC_EXT_LVT_BASE 0x500 #define K8_APIC_EXT_INT_MSG_FIX 0x0 #define K8_APIC_EXT_INT_MSG_SMI 0x2 @@ -94,6 +96,8 @@ extern int apic_is_clustered_box(void); #define K8_APIC_EXT_INT_MSG_EXT 0x7 #define K8_APIC_EXT_LVT_ENTRY_THRESHOLD 0 +extern void smp_send_nmi_allbutself(void); + #define ARCH_APICTIMER_STOPS_ON_C3 1 extern unsigned boot_cpu_id; Index: linux-2.6.24-rc8-rt1/include/linux/profile.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/profile.h 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/profile.h 2008-01-16 22:28:49.000000000 -0500 @@ -6,16 +6,18 @@ #include #include #include +#include #include #include extern int prof_on __read_mostly; -#define CPU_PROFILING 1 -#define SCHED_PROFILING 2 -#define SLEEP_PROFILING 3 -#define KVM_PROFILING 4 +#define CPU_PROFILING 1 +#define SCHED_PROFILING 2 +#define SLEEP_PROFILING 3 +#define KVM_PROFILING 4 +#define PREEMPT_PROFILING 5 struct proc_dir_entry; struct pt_regs; @@ -23,6 +25,7 @@ struct notifier_block; /* init basic kernel profiler */ void __init profile_init(void); +void __profile_tick(int type, struct pt_regs *regs); void profile_tick(int); /* @@ -53,6 +56,8 @@ enum profile_type { PROFILE_MUNMAP }; +extern int prof_pid; + #ifdef CONFIG_PROFILING struct task_struct; Index: linux-2.6.24-rc8-rt1/kernel/profile.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/profile.c 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/profile.c 2008-01-16 22:28:53.000000000 -0500 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -46,6 +47,7 @@ int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); static cpumask_t prof_cpu_mask = CPU_MASK_ALL; +int prof_pid = -1; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); @@ -412,16 +414,20 @@ void profile_hits(int type, void *__pc, EXPORT_SYMBOL_GPL(profile_hits); -void profile_tick(int type) +void __profile_tick(int type, struct pt_regs *regs) { - struct pt_regs *regs = get_irq_regs(); - if (type == CPU_PROFILING && timer_hook) timer_hook(regs); - if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) + if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask) && + (prof_pid == -1 || prof_pid == current->pid)) profile_hit(type, (void *)profile_pc(regs)); } +void profile_tick(int type) +{ + return __profile_tick(type, get_irq_regs()); +} + #ifdef CONFIG_PROC_FS #include #include Index: linux-2.6.24-rc8-rt1/kernel/time/tick-common.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/time/tick-common.c 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/time/tick-common.c 2008-01-16 22:28:54.000000000 -0500 @@ -32,7 +32,7 @@ DEFINE_PER_CPU(struct tick_device, tick_ ktime_t tick_next_period; ktime_t tick_period; int tick_do_timer_cpu __read_mostly = -1; -DEFINE_SPINLOCK(tick_device_lock); +DEFINE_RAW_SPINLOCK(tick_device_lock); /* * Debugging: see timer_list.c @@ -68,7 +68,6 @@ static void tick_periodic(int cpu) } update_process_times(user_mode(get_irq_regs())); - profile_tick(CPU_PROFILING); } /* Index: linux-2.6.24-rc8-rt1/kernel/time/tick-sched.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/time/tick-sched.c 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/time/tick-sched.c 2008-01-16 22:28:55.000000000 -0500 @@ -178,10 +178,12 @@ void tick_nohz_stop_sched_tick(void) if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; - if (need_resched()) + if (need_resched() || need_resched_delayed()) goto end; cpu = smp_processor_id(); + +#ifndef CONFIG_PREEMPT_RT if (unlikely(local_softirq_pending())) { static int ratelimit; @@ -191,6 +193,7 @@ void tick_nohz_stop_sched_tick(void) ratelimit++; } } +#endif now = ktime_get(); /* @@ -249,6 +252,7 @@ void tick_nohz_stop_sched_tick(void) ts->idle_tick = ts->sched_timer.expires; ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; + rcu_enter_nohz(); } /* @@ -337,6 +341,8 @@ void tick_nohz_restart_sched_tick(void) if (!ts->tick_stopped) return; + rcu_exit_nohz(); + /* Update jiffies first */ now = ktime_get(); @@ -440,7 +446,6 @@ static void tick_nohz_handler(struct clo } update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); /* Do not restart, when we are in the idle loop */ if (ts->tick_stopped) @@ -554,7 +559,6 @@ static enum hrtimer_restart tick_sched_t */ spin_unlock(&base->lock); update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); spin_lock(&base->lock); } Index: linux-2.6.24-rc8-rt1/fs/proc/proc_misc.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/fs/proc/proc_misc.c 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/fs/proc/proc_misc.c 2008-01-16 22:29:03.000000000 -0500 @@ -96,6 +96,27 @@ static int loadavg_read_proc(char *page, return proc_calc_metrics(page, start, off, count, eof, len); } +#ifdef CONFIG_PREEMPT_RT +static int loadavg_rt_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + extern unsigned long avenrun_rt[]; + extern unsigned long rt_nr_running(void); + int a, b, c; + int len; + + a = avenrun_rt[0] + (FIXED_1/200); + b = avenrun_rt[1] + (FIXED_1/200); + c = avenrun_rt[2] + (FIXED_1/200); + len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n", + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + rt_nr_running(), nr_threads, current->nsproxy->pid_ns->last_pid); + return proc_calc_metrics(page, start, off, count, eof, len); +} +#endif + static int uptime_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -281,6 +302,25 @@ static const struct file_operations proc .release = seq_release, }; +#ifdef CONFIG_RADIX_TREE_OPTIMISTIC +extern struct seq_operations optimistic_op; +static int optimistic_open(struct inode *inode, struct file *file) +{ + (void)inode; + return seq_open(file, &optimistic_op); +} + +extern ssize_t optimistic_write(struct file *, const char __user *, size_t, loff_t *); + +static struct file_operations optimistic_file_operations = { + .open = optimistic_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = optimistic_write, +}; +#endif + static int devinfo_show(struct seq_file *f, void *v) { int i = *(loff_t *) v; @@ -455,7 +495,8 @@ static int show_stat(struct seq_file *p, { int i; unsigned long jif; - cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; + cputime64_t user_rt, user, nice, system_rt, system, idle, + iowait, irq, softirq, steal; cputime64_t guest; u64 sum = 0; struct timespec boottime; @@ -465,7 +506,7 @@ static int show_stat(struct seq_file *p, if (!per_irq_sum) return -ENOMEM; - user = nice = system = idle = iowait = + user_rt = user = nice = system_rt = system = idle = iowait = irq = softirq = steal = cputime64_zero; guest = cputime64_zero; getboottime(&boottime); @@ -482,6 +523,8 @@ static int show_stat(struct seq_file *p, irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); + user_rt = cputime64_add(user_rt, kstat_cpu(i).cpustat.user_rt); + system_rt = cputime64_add(system_rt, kstat_cpu(i).cpustat.system_rt); guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); for (j = 0; j < NR_IRQS; j++) { unsigned int temp = kstat_cpu(i).irqs[j]; @@ -490,7 +533,10 @@ static int show_stat(struct seq_file *p, } } - seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + user = cputime64_add(user_rt, user); + system = cputime64_add(system_rt, system); + + seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), (unsigned long long)cputime64_to_clock_t(system), @@ -499,13 +545,17 @@ static int show_stat(struct seq_file *p, (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(user_rt), + (unsigned long long)cputime64_to_clock_t(system_rt), (unsigned long long)cputime64_to_clock_t(guest)); for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kstat_cpu(i).cpustat.user; + user_rt = kstat_cpu(i).cpustat.user_rt; + system_rt = kstat_cpu(i).cpustat.system_rt; + user = cputime64_add(user_rt, kstat_cpu(i).cpustat.user); nice = kstat_cpu(i).cpustat.nice; - system = kstat_cpu(i).cpustat.system; + system = cputime64_add(system_rt, kstat_cpu(i).cpustat.system); idle = kstat_cpu(i).cpustat.idle; iowait = kstat_cpu(i).cpustat.iowait; irq = kstat_cpu(i).cpustat.irq; @@ -513,7 +563,7 @@ static int show_stat(struct seq_file *p, steal = kstat_cpu(i).cpustat.steal; guest = kstat_cpu(i).cpustat.guest; seq_printf(p, - "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", i, (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), @@ -523,6 +573,8 @@ static int show_stat(struct seq_file *p, (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(user_rt), + (unsigned long long)cputime64_to_clock_t(system_rt), (unsigned long long)cputime64_to_clock_t(guest)); } seq_printf(p, "intr %llu", (unsigned long long)sum); @@ -543,6 +595,38 @@ static int show_stat(struct seq_file *p, nr_iowait()); kfree(per_irq_sum); +#ifdef CONFIG_PREEMPT_RT + { + unsigned long nr_uninterruptible_cpu(int cpu); + extern int pi_initialized; + unsigned long rt_nr_running(void); + unsigned long rt_nr_running_cpu(int cpu); + unsigned long rt_nr_uninterruptible(void); + unsigned long rt_nr_uninterruptible_cpu(int cpu); + + int i; + + seq_printf(p, "pi_init: %d\n", pi_initialized); + seq_printf(p, "nr_running(): %ld\n", + nr_running()); + seq_printf(p, "nr_uninterruptible(): %ld\n", + nr_uninterruptible()); + for_each_online_cpu(i) + seq_printf(p, "nr_uninterruptible(%d): %ld\n", + i, nr_uninterruptible_cpu(i)); + seq_printf(p, "rt_nr_running(): %ld\n", + rt_nr_running()); + for_each_online_cpu(i) + seq_printf(p, "rt_nr_running(%d): %ld\n", + i, rt_nr_running_cpu(i)); + seq_printf(p, "nr_rt_uninterruptible(): %ld\n", + rt_nr_uninterruptible()); + for_each_online_cpu(i) + seq_printf(p, "nr_rt_uninterruptible(%d): %ld\n", + i, rt_nr_uninterruptible_cpu(i)); + } +#endif + return 0; } @@ -653,6 +737,20 @@ static int execdomains_read_proc(char *p return proc_calc_metrics(page, start, off, count, eof, len); } +#ifdef CONFIG_EVENT_TRACE +extern struct seq_operations latency_trace_op; +static int latency_trace_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &latency_trace_op); +} +static struct file_operations proc_latency_trace_operations = { + .open = latency_trace_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + #ifdef CONFIG_MAGIC_SYSRQ /* * writing 'C' to /proc/sysrq-trigger is like sysrq-C @@ -692,6 +790,9 @@ void __init proc_misc_init(void) int (*read_proc)(char*,char**,off_t,int,int*,void*); } *p, simple_ones[] = { {"loadavg", loadavg_read_proc}, +#ifdef CONFIG_PREEMPT_RT + {"loadavgrt", loadavg_rt_read_proc}, +#endif {"uptime", uptime_read_proc}, {"meminfo", meminfo_read_proc}, {"version", version_read_proc}, @@ -720,6 +821,9 @@ void __init proc_misc_init(void) entry->proc_fops = &proc_kmsg_operations; } #endif +#ifdef CONFIG_RADIX_TREE_OPTIMISTIC + create_seq_entry("radix_optimistic", 0, &optimistic_file_operations); +#endif create_seq_entry("locks", 0, &proc_locks_operations); create_seq_entry("devices", 0, &proc_devinfo_operations); create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations); @@ -747,6 +851,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_SCHEDSTATS create_seq_entry("schedstat", 0, &proc_schedstat_operations); #endif +#ifdef CONFIG_EVENT_TRACE + create_seq_entry("latency_trace", 0, &proc_latency_trace_operations); +#endif #ifdef CONFIG_PROC_KCORE proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { Index: linux-2.6.24-rc8-rt1/include/linux/kernel.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/kernel.h 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/kernel.h 2008-01-16 22:29:06.000000000 -0500 @@ -111,7 +111,7 @@ extern int cond_resched(void); # define might_resched() do { } while (0) #endif -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) void __might_sleep(char *file, int line); # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) @@ -179,6 +179,8 @@ asmlinkage int vprintk(const char *fmt, __attribute__ ((format (printf, 1, 0))); asmlinkage int printk(const char * fmt, ...) __attribute__ ((format (printf, 1, 2))) __cold; +extern void early_printk(const char *fmt, ...) + __attribute__ ((format (printf, 1, 2))); extern int log_buf_get_len(void); extern int log_buf_read(int idx); extern int log_buf_copy(char *dest, int idx, int len); @@ -194,6 +196,18 @@ static inline int log_buf_read(int idx) static inline int log_buf_copy(char *dest, int idx, int len) { return 0; } #endif +#ifdef CONFIG_PREEMPT_RT +extern void zap_rt_locks(void); +#else +# define zap_rt_locks() do { } while (0) +#endif + +#ifdef CONFIG_PREEMPT_RT +extern void zap_rt_locks(void); +#else +# define zap_rt_locks() do { } while (0) +#endif + unsigned long int_sqrt(unsigned long); extern int printk_ratelimit(void); @@ -217,6 +231,10 @@ extern void wake_up_klogd(void); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; extern int panic_on_oops; + +extern void pause_on_oops_head(void); +extern void pause_on_oops_tail(void); + extern int panic_on_unrecovered_nmi; extern int tainted; extern const char *print_tainted(void); @@ -225,6 +243,7 @@ extern void add_taint(unsigned); /* Values used for system_state */ extern enum system_states { SYSTEM_BOOTING, + SYSTEM_BOOTING_SCHEDULER_OK, SYSTEM_RUNNING, SYSTEM_HALT, SYSTEM_POWER_OFF, Index: linux-2.6.24-rc8-rt1/include/linux/latency_hist.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/include/linux/latency_hist.h 2008-01-16 22:28:18.000000000 -0500 @@ -0,0 +1,32 @@ +/* + * kernel/latency_hist.h + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. + * Yi Yang + * + */ +#ifndef _LINUX_LATENCY_HIST_H_ +#define _LINUX_LATENCY_HIST_H_ + +enum { + INTERRUPT_LATENCY = 0, + PREEMPT_LATENCY, + WAKEUP_LATENCY +}; + +#define MAX_ENTRY_NUM 10240 +#define LATENCY_TYPE_NUM 3 + +#ifdef CONFIG_LATENCY_HIST +extern void latency_hist(int latency_type, int cpu, unsigned long latency); +# define latency_hist_flag 1 +#else +# define latency_hist(a,b,c) do { (void)(cpu); } while (0) +# define latency_hist_flag 0 +#endif /* CONFIG_LATENCY_HIST */ + +#endif /* ifndef _LINUX_LATENCY_HIST_H_ */ Index: linux-2.6.24-rc8-rt1/init/main.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/init/main.c 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/init/main.c 2008-01-16 22:29:07.000000000 -0500 @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -97,6 +99,12 @@ static inline void acpi_early_init(void) #ifndef CONFIG_DEBUG_RODATA static inline void mark_rodata_ro(void) { } #endif +#ifdef CONFIG_ALLOC_RTSJ_MEM +extern void alloc_rtsj_mem_early_setup(void); +#else +static inline void alloc_rtsj_mem_early_setup(void) { } +#endif + #ifdef CONFIG_TC extern void tc_init(void); @@ -434,6 +442,8 @@ static void noinline __init_refok rest_i { int pid; + system_state = SYSTEM_BOOTING_SCHEDULER_OK; + kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); numa_default_policy(); pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); @@ -445,7 +455,7 @@ static void noinline __init_refok rest_i * at least once to get things moving: */ init_idle_bootup_task(current); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); schedule(); preempt_disable(); @@ -550,8 +560,10 @@ asmlinkage void __init start_kernel(void * fragile until we cpu_idle() for the first time. */ preempt_disable(); + build_all_zonelists(); page_alloc_init(); + early_init_hardirqs(); printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); parse_early_param(); parse_args("Booting kernel", static_command_line, __start___param, @@ -587,6 +599,8 @@ asmlinkage void __init start_kernel(void if (panic_later) panic(panic_later, panic_param); + init_tracer(); + lockdep_info(); /* @@ -606,6 +620,7 @@ asmlinkage void __init start_kernel(void #endif vfs_caches_init_early(); cpuset_init_early(); + alloc_rtsj_mem_early_setup(); mem_init(); kmem_cache_init(); setup_per_cpu_pageset(); @@ -644,6 +659,9 @@ asmlinkage void __init start_kernel(void acpi_early_init(); /* before LAPIC and SMP init */ +#ifdef CONFIG_PREEMPT_RT + WARN_ON(irqs_disabled()); +#endif /* Do the rest non-__init'ed, we're now alive */ rest_init(); } @@ -748,11 +766,14 @@ __setup("nosoftlockup", nosoftlockup_set static void __init do_pre_smp_initcalls(void) { extern int spawn_ksoftirqd(void); + extern int spawn_desched_task(void); migration_init(); + posix_cpu_thread_init(); spawn_ksoftirqd(); if (!nosoftlockup) spawn_softlockup_task(); + spawn_desched_task(); } static void run_init_process(char *init_filename) @@ -783,6 +804,9 @@ static int noinline init_post(void) printk(KERN_WARNING "Failed to execute %s\n", ramdisk_execute_command); } +#ifdef CONFIG_PREEMPT_RT + WARN_ON(irqs_disabled()); +#endif /* * We try each of these until one succeeds. @@ -825,6 +849,8 @@ static int __init kernel_init(void * unu smp_prepare_cpus(max_cpus); + init_hardirqs(); + do_pre_smp_initcalls(); smp_init(); @@ -846,12 +872,58 @@ static int __init kernel_init(void * unu ramdisk_execute_command = NULL; prepare_namespace(); } +#ifdef CONFIG_PREEMPT_RT + WARN_ON(irqs_disabled()); +#endif + +#define DEBUG_COUNT (defined(CONFIG_DEBUG_RT_MUTEXES) + defined(CONFIG_CRITICAL_PREEMPT_TIMING) + defined(CONFIG_CRITICAL_IRQSOFF_TIMING) + defined(CONFIG_FUNCTION_TRACE) + defined(CONFIG_DEBUG_SLAB) + defined(CONFIG_DEBUG_PAGEALLOC) + defined(CONFIG_LOCKDEP)) +#if DEBUG_COUNT > 0 + printk(KERN_ERR "*****************************************************************************\n"); + printk(KERN_ERR "* *\n"); +#if DEBUG_COUNT == 1 + printk(KERN_ERR "* REMINDER, the following debugging option is turned on in your .config: *\n"); +#else + printk(KERN_ERR "* REMINDER, the following debugging options are turned on in your .config: *\n"); +#endif + printk(KERN_ERR "* *\n"); +#ifdef CONFIG_DEBUG_RT_MUTEXES + printk(KERN_ERR "* CONFIG_DEBUG_RT_MUTEXES *\n"); +#endif +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + printk(KERN_ERR "* CONFIG_CRITICAL_PREEMPT_TIMING *\n"); +#endif +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + printk(KERN_ERR "* CONFIG_CRITICAL_IRQSOFF_TIMING *\n"); +#endif +#ifdef CONFIG_FUNCTION_TRACE + printk(KERN_ERR "* CONFIG_FUNCTION_TRACE *\n"); +#endif +#ifdef CONFIG_DEBUG_SLAB + printk(KERN_ERR "* CONFIG_DEBUG_SLAB *\n"); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk(KERN_ERR "* CONFIG_DEBUG_PAGEALLOC *\n"); +#endif +#ifdef CONFIG_LOCKDEP + printk(KERN_ERR "* CONFIG_LOCKDEP *\n"); +#endif + printk(KERN_ERR "* *\n"); +#if DEBUG_COUNT == 1 + printk(KERN_ERR "* it may increase runtime overhead and latencies. *\n"); +#else + printk(KERN_ERR "* they may increase runtime overhead and latencies. *\n"); +#endif + printk(KERN_ERR "* *\n"); + printk(KERN_ERR "*****************************************************************************\n"); +#endif /* * Ok, we have completed the initial bootup, and * we're essentially up and running. Get rid of the * initmem segments and start the user-mode stuff.. */ init_post(); + WARN_ON(debug_direct_keyboard); + return 0; } Index: linux-2.6.24-rc8-rt1/kernel/latency_hist.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/kernel/latency_hist.c 2008-01-16 22:28:18.000000000 -0500 @@ -0,0 +1,336 @@ +/* + * kernel/latency_hist.c + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. + * Yi Yang + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct hist_data_struct { + atomic_t hist_mode; /* 0 log, 1 don't log */ + unsigned long min_lat; + unsigned long avg_lat; + unsigned long max_lat; + unsigned long long beyond_hist_bound_samples; + unsigned long long accumulate_lat; + unsigned long long total_samples; + unsigned long long hist_array[MAX_ENTRY_NUM]; +} hist_data_t; + +static struct proc_dir_entry * latency_hist_root = NULL; +static char * latency_hist_proc_dir_root = "latency_hist"; + +#ifdef CONFIG_INTERRUPT_OFF_HIST +static DEFINE_PER_CPU(hist_data_t, interrupt_off_hist); +static char * interrupt_off_hist_proc_dir = "interrupt_off_latency"; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST +static DEFINE_PER_CPU(hist_data_t, preempt_off_hist); +static char * preempt_off_hist_proc_dir = "preempt_off_latency"; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static DEFINE_PER_CPU(hist_data_t, wakeup_latency_hist); +static char * wakeup_latency_hist_proc_dir = "wakeup_latency"; +#endif + +static struct proc_dir_entry *entry[LATENCY_TYPE_NUM][NR_CPUS]; + +static inline u64 u64_div(u64 x, u64 y) +{ + do_div(x, y); + return x; +} + +void notrace latency_hist(int latency_type, int cpu, unsigned long latency) +{ + hist_data_t * my_hist; + + if ((cpu < 0) || (cpu >= NR_CPUS) || (latency_type < INTERRUPT_LATENCY) + || (latency_type > WAKEUP_LATENCY) || (latency < 0)) + return; + + switch(latency_type) { +#ifdef CONFIG_INTERRUPT_OFF_HIST + case INTERRUPT_LATENCY: + my_hist = (hist_data_t *)&per_cpu(interrupt_off_hist, cpu); + break; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPT_LATENCY: + my_hist = (hist_data_t *)&per_cpu(preempt_off_hist, cpu); + break; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + my_hist = (hist_data_t *)&per_cpu(wakeup_latency_hist, cpu); + break; +#endif + default: + return; + } + + if (atomic_read(&my_hist->hist_mode) == 0) + return; + + if (latency >= MAX_ENTRY_NUM) + my_hist->beyond_hist_bound_samples++; + else + my_hist->hist_array[latency]++; + + if (latency < my_hist->min_lat) + my_hist->min_lat = latency; + else if (latency > my_hist->max_lat) + my_hist->max_lat = latency; + + my_hist->total_samples++; + my_hist->accumulate_lat += latency; + my_hist->avg_lat = (unsigned long) u64_div(my_hist->accumulate_lat, + my_hist->total_samples); + return; +} + +static void *l_start(struct seq_file *m, loff_t * pos) +{ + loff_t *index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL); + loff_t index = *pos; + hist_data_t *my_hist = (hist_data_t *) m->private; + + if (!index_ptr) + return NULL; + + if (index == 0) { + atomic_dec(&my_hist->hist_mode); + seq_printf(m, "#Minimum latency: %lu microseconds.\n" + "#Average latency: %lu microseconds.\n" + "#Maximum latency: %lu microseconds.\n" + "#Total samples: %llu\n" + "#There are %llu samples greater or equal than %d microseconds\n" + "#usecs\t%16s\n" + , my_hist->min_lat + , my_hist->avg_lat + , my_hist->max_lat + , my_hist->total_samples + , my_hist->beyond_hist_bound_samples + , MAX_ENTRY_NUM, "samples"); + } + if (index >= MAX_ENTRY_NUM) + return NULL; + + *index_ptr = index; + return index_ptr; +} + +static void *l_next(struct seq_file *m, void *p, loff_t * pos) +{ + loff_t *index_ptr = p; + hist_data_t *my_hist = (hist_data_t *) m->private; + + if (++*pos >= MAX_ENTRY_NUM) { + atomic_inc(&my_hist->hist_mode); + return NULL; + } + *index_ptr = *pos; + return index_ptr; +} + +static void l_stop(struct seq_file *m, void *p) +{ + kfree(p); +} + +static int l_show(struct seq_file *m, void *p) +{ + int index = *(loff_t *) p; + hist_data_t *my_hist = (hist_data_t *) m->private; + + seq_printf(m, "%5d\t%16llu\n", index, my_hist->hist_array[index]); + return 0; +} + +static struct seq_operations latency_hist_seq_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; + +static int latency_hist_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *entry_ptr = NULL; + int ret, i, j, break_flags = 0; + struct seq_file *seq; + + entry_ptr = PDE(file->f_dentry->d_inode); + for (i = 0; i < LATENCY_TYPE_NUM; i++) { + for (j = 0; j < NR_CPUS; j++) { + if (entry[i][j] == NULL) + continue; + if (entry_ptr->low_ino == entry[i][j]->low_ino) { + break_flags = 1; + break; + } + } + if (break_flags == 1) + break; + } + ret = seq_open(file, &latency_hist_seq_op); + if (break_flags == 1) { + seq = (struct seq_file *)file->private_data; + seq->private = entry[i][j]->data; + } + return ret; +} + +static struct file_operations latency_hist_seq_fops = { + .open = latency_hist_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void hist_reset(hist_data_t *hist) +{ + atomic_dec(&hist->hist_mode); + + memset(hist->hist_array, 0, sizeof(hist->hist_array)); + hist->beyond_hist_bound_samples = 0UL; + hist->min_lat = 0xFFFFFFFFUL; + hist->max_lat = 0UL; + hist->total_samples = 0UL; + hist->accumulate_lat = 0UL; + hist->avg_lat = 0UL; + + atomic_inc(&hist->hist_mode); +} + +ssize_t latency_hist_reset(struct file *file, const char __user *a, size_t size, loff_t *off) +{ + int cpu; + hist_data_t *hist; + struct proc_dir_entry *entry_ptr = PDE(file->f_dentry->d_inode); + int latency_type = (int)entry_ptr->data; + + switch (latency_type) { + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + for_each_online_cpu(cpu) { + hist = &per_cpu(wakeup_latency_hist, cpu); + hist_reset(hist); + } + break; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPT_LATENCY: + for_each_online_cpu(cpu) { + hist = &per_cpu(preempt_off_hist, cpu); + hist_reset(hist); + } + break; +#endif + +#ifdef CONFIG_INTERRUPT_OFF_HIST + case INTERRUPT_LATENCY: + for_each_online_cpu(cpu) { + hist = &per_cpu(interrupt_off_hist, cpu); + hist_reset(hist); + } + break; +#endif + } + + return size; +} + +static struct file_operations latency_hist_reset_seq_fops = { + .write = latency_hist_reset, +}; + +static struct proc_dir_entry *interrupt_off_reset; +static struct proc_dir_entry *preempt_off_reset; +static struct proc_dir_entry *wakeup_latency_reset; + +static __init int latency_hist_init(void) +{ + struct proc_dir_entry *tmp_parent_proc_dir; + int i = 0, len = 0; + hist_data_t *my_hist; + char procname[64]; + + latency_hist_root = proc_mkdir(latency_hist_proc_dir_root, NULL); + +#ifdef CONFIG_INTERRUPT_OFF_HIST + tmp_parent_proc_dir = proc_mkdir(interrupt_off_hist_proc_dir, latency_hist_root); + for (i = 0; i < num_possible_cpus(); i++) { + len = sprintf(procname, "CPU%d", i); + procname[len] = '\0'; + entry[INTERRUPT_LATENCY][i] = + create_proc_entry(procname, 0, tmp_parent_proc_dir); + entry[INTERRUPT_LATENCY][i]->data = (void *)&per_cpu(interrupt_off_hist, i); + entry[INTERRUPT_LATENCY][i]->proc_fops = &latency_hist_seq_fops; + my_hist = (hist_data_t *) entry[INTERRUPT_LATENCY][i]->data; + atomic_set(&my_hist->hist_mode,1); + my_hist->min_lat = 0xFFFFFFFFUL; + } + interrupt_off_reset = create_proc_entry("reset", 0, tmp_parent_proc_dir); + interrupt_off_reset->data = INTERRUPT_LATENCY; + interrupt_off_reset->proc_fops = &latency_hist_reset_seq_fops; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + tmp_parent_proc_dir = proc_mkdir(preempt_off_hist_proc_dir, latency_hist_root); + for (i = 0; i < num_possible_cpus(); i++) { + len = sprintf(procname, "CPU%d", i); + procname[len] = '\0'; + entry[PREEMPT_LATENCY][i] = + create_proc_entry(procname, 0, tmp_parent_proc_dir); + entry[PREEMPT_LATENCY][i]->data = (void *)&per_cpu(preempt_off_hist, i); + entry[PREEMPT_LATENCY][i]->proc_fops = &latency_hist_seq_fops; + my_hist = (hist_data_t *) entry[PREEMPT_LATENCY][i]->data; + atomic_set(&my_hist->hist_mode,1); + my_hist->min_lat = 0xFFFFFFFFUL; + } + preempt_off_reset = create_proc_entry("reset", 0, tmp_parent_proc_dir); + preempt_off_reset->data = PREEMPT_LATENCY; + preempt_off_reset->proc_fops = &latency_hist_reset_seq_fops; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + tmp_parent_proc_dir = proc_mkdir(wakeup_latency_hist_proc_dir, latency_hist_root); + for (i = 0; i < num_possible_cpus(); i++) { + len = sprintf(procname, "CPU%d", i); + procname[len] = '\0'; + entry[WAKEUP_LATENCY][i] = + create_proc_entry(procname, 0, tmp_parent_proc_dir); + entry[WAKEUP_LATENCY][i]->data = (void *)&per_cpu(wakeup_latency_hist, i); + entry[WAKEUP_LATENCY][i]->proc_fops = &latency_hist_seq_fops; + my_hist = (hist_data_t *) entry[WAKEUP_LATENCY][i]->data; + atomic_set(&my_hist->hist_mode,1); + my_hist->min_lat = 0xFFFFFFFFUL; + } + wakeup_latency_reset = create_proc_entry("reset", 0, tmp_parent_proc_dir); + wakeup_latency_reset->data = WAKEUP_LATENCY; + wakeup_latency_reset->proc_fops = &latency_hist_reset_seq_fops; +#endif + return 0; + +} + +__initcall(latency_hist_init); Index: linux-2.6.24-rc8-rt1/kernel/latency_trace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/kernel/latency_trace.c 2008-01-16 22:29:28.000000000 -0500 @@ -0,0 +1,2858 @@ +/* + * kernel/latency_trace.c + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef DEFINE_RAW_SPINLOCK +# define DEFINE_RAW_SPINLOCK DEFINE_SPINLOCK +#endif + +#ifndef RAW_SPIN_LOCK_UNLOCKED +# define RAW_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED +#endif + +int trace_use_raw_cycles = 0; + +#if defined(CONFIG_LATENCY_TIMING) || defined(CONFIG_EVENT_TRACE) +/* + * Convert raw cycles to usecs. + * Note: this is not the 'clocksource cycles' value, it's the raw + * cycle counter cycles. We use GTOD to timestamp latency start/end + * points, but the trace entries inbetween are timestamped with + * get_cycles(). + */ +static unsigned long notrace cycles_to_us(cycle_t delta) +{ + if (!trace_use_raw_cycles) + return cycles_to_usecs(delta); +#ifdef CONFIG_X86 + do_div(delta, cpu_khz/1000+1); +#elif defined(CONFIG_PPC) + delta = mulhwu(tb_to_us, delta); +#elif defined(CONFIG_ARM) + delta = mach_cycles_to_usecs(delta); +#else + #error Implement cycles_to_usecs. +#endif + + return (unsigned long) delta; +} +#endif + +static notrace inline cycle_t now(void) +{ + if (trace_use_raw_cycles) + return get_cycles(); + return get_monotonic_cycles(); +} + +#ifndef irqs_off +# define irqs_off() 0 +#endif + +#ifndef DEBUG_WARN_ON +static inline int DEBUG_WARN_ON(int cond) +{ + WARN_ON(cond); + return 0; +} +#endif + +#if defined(CONFIG_CRITICAL_IRQSOFF_TIMING) || \ + (defined(CONFIG_CRITICAL_PREEMPT_TIMING) && defined(CONFIG_TRACE_IRQFLAGS)) + static DEFINE_PER_CPU(int, trace_cpu_idle); +#endif + +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING +# ifdef CONFIG_CRITICAL_PREEMPT_TIMING +# define irqs_off_preempt_count() (preempt_count() && !__get_cpu_var(trace_cpu_idle)) +# else +# define irqs_off_preempt_count() 0 +# endif +#endif + +#ifdef CONFIG_WAKEUP_TIMING +struct sch_struct { + __raw_spinlock_t trace_lock; + struct task_struct *task; + int cpu; + struct cpu_trace *tr; +} ____cacheline_aligned_in_smp; + +static __cacheline_aligned_in_smp struct sch_struct sch = + { trace_lock: __RAW_SPIN_LOCK_UNLOCKED }; + +int wakeup_timing = 1; +#endif + +/* + * Track maximum latencies and save the trace: + */ + +/* + * trace_stop_sched_switched must not be called with runqueue locks held! + */ +static __cacheline_aligned_in_smp DECLARE_MUTEX(max_mutex); + +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesnt + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp unsigned long max_sequence; + +enum trace_type +{ + __TRACE_FIRST_TYPE = 0, + + TRACE_FN, + TRACE_SPECIAL, + TRACE_SPECIAL_PID, + TRACE_SPECIAL_U64, + TRACE_SPECIAL_SYM, + TRACE_CMDLINE, + TRACE_SYSCALL, + TRACE_SYSRET, + + __TRACE_LAST_TYPE +}; + +enum trace_flag_type +{ + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_NEED_RESCHED = 0x02, + TRACE_FLAG_NEED_RESCHED_DELAYED = 0x04, + TRACE_FLAG_HARDIRQ = 0x08, + TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_IRQS_HARD_OFF = 0x20, +}; + +/* + * Maximum preemption latency measured. Initialize to maximum, + * we clear it after bootup. + */ +#ifdef CONFIG_LATENCY_HIST +# define INIT_PREEMPT_MAX_LATENCY ((cycle_t)0UL) +#else +# define INIT_PREEMPT_MAX_LATENCY ((cycle_t)ULONG_MAX) +#endif + +unsigned long preempt_max_latency; +static cycle_t __preempt_max_latency = INIT_PREEMPT_MAX_LATENCY; +unsigned long preempt_thresh; +static cycle_t __preempt_thresh; + +/* + * Should this new latency be reported/recorded? + */ +static int report_latency(cycle_t delta) +{ + if (latency_hist_flag && !trace_user_triggered) + return 1; + + if (__preempt_thresh) { + if (delta < __preempt_thresh) + return 0; + } else { + if (delta <= __preempt_max_latency) + return 0; + } + return 1; +} + +#ifdef CONFIG_EVENT_TRACE + +/* + * Number of per-CPU trace entries: + */ +#define MAX_TRACE (65536UL) + +#define CMDLINE_BYTES 16 + +/* + * 32 bytes on 32-bit platforms: + */ +struct trace_entry { + char type; + char cpu; + char flags; + char preempt_count; // assumes PREEMPT_MASK is 8 bits or less + int pid; + cycle_t timestamp; + union { + struct { + unsigned long eip; + unsigned long parent_eip; + } fn; + struct { + unsigned long eip; + unsigned long v1, v2, v3; + } special; + struct { + unsigned char str[CMDLINE_BYTES]; + } cmdline; + struct { + unsigned long nr; // highest bit: compat call + unsigned long p1, p2, p3; + } syscall; + struct { + unsigned long ret; + } sysret; + struct { + unsigned long __pad3[4]; + } pad; + } u; +} __attribute__((packed)); + +#endif + +struct cpu_trace { + atomic_t disabled; + unsigned long trace_idx; + cycle_t preempt_timestamp; + unsigned long critical_start, critical_end; + unsigned long critical_sequence; + atomic_t underrun; + atomic_t overrun; + int early_warning; + int latency_type; + int cpu; + +#ifdef CONFIG_EVENT_TRACE + struct trace_entry *trace; + char comm[CMDLINE_BYTES]; + pid_t pid; + unsigned long uid; + unsigned long nice; + unsigned long policy; + unsigned long rt_priority; + unsigned long saved_latency; +#endif +#ifdef CONFIG_DEBUG_STACKOVERFLOW + unsigned long stack_check; +#endif +} ____cacheline_aligned_in_smp; + +static struct cpu_trace cpu_traces[NR_CPUS] ____cacheline_aligned_in_smp = +{ [0 ... NR_CPUS-1] = { +#ifdef CONFIG_DEBUG_STACKOVERFLOW + .stack_check = 1 +#endif + } }; + +#ifdef CONFIG_EVENT_TRACE + +int trace_enabled = 0; +int syscall_tracing = 1; +int stackframe_tracing = 0; +int trace_freerunning = 0; +int trace_print_on_crash = 0; +int trace_verbose = 0; +int trace_all_cpus = 0; +int print_functions = 0; +int trace_all_runnable = 0; + +/* + * user-triggered via gettimeofday(0,1)/gettimeofday(0,0) + */ +int trace_user_triggered = 0; +int trace_user_trigger_irq = -1; + +struct saved_trace_struct { + int cpu; + cycle_t first_timestamp, last_timestamp; + struct cpu_trace traces[NR_CPUS]; +} ____cacheline_aligned_in_smp; + +/* + * The current worst-case trace: + */ +static struct saved_trace_struct max_tr; + +/* + * /proc/latency_trace atomicity: + */ +static DECLARE_MUTEX(out_mutex); + +static struct saved_trace_struct out_tr; + +static void notrace printk_name(unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + printk("%s+%#lx/%#lx", sym_name, offset, size); + else + printk("<%08lx>", eip); +} + +#ifdef CONFIG_DEBUG_STACKOVERFLOW + +#ifndef STACK_WARN +# define STACK_WARN (THREAD_SIZE/8) +#endif + +#define MIN_STACK_NEEDED (sizeof(struct thread_info) + STACK_WARN) +#define MAX_STACK (THREAD_SIZE - sizeof(struct thread_info)) + +#if (defined(__i386__) || defined(__x86_64__)) && defined(CONFIG_FRAME_POINTER) +# define PRINT_EXACT_STACKFRAME +#endif + +#ifdef PRINT_EXACT_STACKFRAME +static unsigned long *worst_stack_bp; +#endif +static DEFINE_RAW_SPINLOCK(worst_stack_lock); +unsigned long worst_stack_left = THREAD_SIZE; +static unsigned long worst_stack_printed = THREAD_SIZE; +static char worst_stack_comm[TASK_COMM_LEN+1]; +static int worst_stack_pid; +static unsigned long worst_stack_sp; +static char worst_stack[THREAD_SIZE]; + +static notrace void fill_worst_stack(unsigned long stack_left) +{ + unsigned long flags; + + /* + * On x64, we must not read the PDA during early bootup: + */ +#ifdef CONFIG_X86_64 + if (system_state == SYSTEM_BOOTING) + return; +#endif + spin_lock_irqsave(&worst_stack_lock, flags); + if (likely(stack_left < worst_stack_left)) { + worst_stack_left = stack_left; + memcpy(worst_stack, current_thread_info(), THREAD_SIZE); + worst_stack_sp = (unsigned long)&stack_left; + memcpy(worst_stack_comm, current->comm, TASK_COMM_LEN); + worst_stack_pid = current->pid; +#ifdef PRINT_EXACT_STACKFRAME +# ifdef __i386__ + asm ("mov %%ebp, %0\n" :"=g"(worst_stack_bp)); +# elif defined(__x86_64__) + asm ("mov %%rbp, %0\n" :"=g"(worst_stack_bp)); +# else +# error Poke the author of above asm code lines ! +# endif +#endif + } + spin_unlock_irqrestore(&worst_stack_lock, flags); +} + +#ifdef PRINT_EXACT_STACKFRAME + +/* + * This takes a BP offset to point the BP back into the saved stack, + * the original stack might be long gone (but the stackframe within + * the saved copy still contains references to it). + */ +#define CONVERT_TO_SAVED_STACK(bp) \ + ((void *)worst_stack + ((unsigned long)bp & (THREAD_SIZE-1))) + +static void show_stackframe(void) +{ + unsigned long addr, frame_size, *bp, *prev_bp, sum = 0; + + bp = CONVERT_TO_SAVED_STACK(worst_stack_bp); + + while (bp[0]) { + addr = bp[1]; + if (!kernel_text_address(addr)) + break; + + prev_bp = bp; + bp = CONVERT_TO_SAVED_STACK((unsigned long *)bp[0]); + + frame_size = (bp - prev_bp) * sizeof(long); + + if (frame_size < THREAD_SIZE) { + printk("{ %4ld} ", frame_size); + sum += frame_size; + } else + printk("{=%4ld} ", sum); + + printk("[<%08lx>] ", addr); + printk_name(addr); + printk("\n"); + } +} + +#else + +static inline int valid_stack_ptr(void *p) +{ + return p > (void *)worst_stack && + p < (void *)worst_stack + THREAD_SIZE - 3; +} + +static void show_stackframe(void) +{ + unsigned long prev_frame, addr; + unsigned long *stack; + + prev_frame = (unsigned long)(worst_stack + + (worst_stack_sp & (THREAD_SIZE-1))); + stack = (unsigned long *)prev_frame; + + while (valid_stack_ptr(stack)) { + addr = *stack++; + if (__kernel_text_address(addr)) { + printk("(%4ld) ", (unsigned long)stack - prev_frame); + printk("[<%08lx>] ", addr); + print_symbol("%s\n", addr); + prev_frame = (unsigned long)stack; + } + if ((char *)stack >= worst_stack + THREAD_SIZE) + break; + } +} + +#endif + +static notrace void __print_worst_stack(void) +{ + unsigned long fill_ratio; + printk("----------------------------->\n"); + printk("| new stack fill maximum: %s/%d, %ld bytes (out of %ld bytes).\n", + worst_stack_comm, worst_stack_pid, + MAX_STACK-worst_stack_left, (long)MAX_STACK); + fill_ratio = (MAX_STACK-worst_stack_left)*100/(long)MAX_STACK; + printk("| Stack fill ratio: %02ld%%", fill_ratio); + if (fill_ratio >= 90) + printk(" - BUG: that's quite high, please report this!\n"); + else + printk(" - that's still OK, no need to report this.\n"); + printk("------------|\n"); + + show_stackframe(); + printk("<---------------------------\n\n"); +} + +static notrace void print_worst_stack(void) +{ + unsigned long flags; + + if (irqs_disabled() || preempt_count()) + return; + + spin_lock_irqsave(&worst_stack_lock, flags); + if (worst_stack_printed == worst_stack_left) { + spin_unlock_irqrestore(&worst_stack_lock, flags); + return; + } + worst_stack_printed = worst_stack_left; + spin_unlock_irqrestore(&worst_stack_lock, flags); + + __print_worst_stack(); +} + +static notrace void debug_stackoverflow(struct cpu_trace *tr) +{ + long stack_left; + + if (unlikely(tr->stack_check <= 0)) + return; + atomic_inc(&tr->disabled); + + /* Debugging check for stack overflow: is there less than 1KB free? */ +#ifdef __i386__ + __asm__ __volatile__("and %%esp,%0" : + "=r" (stack_left) : "0" (THREAD_SIZE - 1)); +#elif defined(__x86_64__) + __asm__ __volatile__("and %%rsp,%0" : + "=r" (stack_left) : "0" (THREAD_SIZE - 1)); +#else +# error Poke the author of above asm code lines ! +#endif + if (unlikely(stack_left < MIN_STACK_NEEDED)) { + tr->stack_check = 0; + printk(KERN_ALERT "BUG: stack overflow: only %ld bytes left! [%08lx...(%08lx-%08lx)]\n", + stack_left - sizeof(struct thread_info), + (long)&stack_left, + (long)current_thread_info(), + (long)current_thread_info() + THREAD_SIZE); + fill_worst_stack(stack_left); + __print_worst_stack(); + goto out; + } + if (unlikely(stack_left < worst_stack_left)) { + tr->stack_check--; + fill_worst_stack(stack_left); + print_worst_stack(); + tr->stack_check++; + } else + if (worst_stack_printed != worst_stack_left) { + tr->stack_check--; + print_worst_stack(); + tr->stack_check++; + } +out: + atomic_dec(&tr->disabled); +} + +#endif + +#ifdef CONFIG_EARLY_PRINTK +static void notrace early_printk_name(unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + early_printk("%s <%08lx>", sym_name, eip); + else + early_printk("<%08lx>", eip); +} + +static __raw_spinlock_t early_print_lock = __RAW_SPIN_LOCK_UNLOCKED; + +static void notrace early_print_entry(struct trace_entry *entry) +{ + int hardirq, softirq; + + __raw_spin_lock(&early_print_lock); + early_printk("%-5d ", entry->pid); + + early_printk("%d%c%c", + entry->cpu, + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED_DELAYED) ? 'n' : + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + early_printk("H"); + else { + if (hardirq) + early_printk("h"); + else { + if (softirq) + early_printk("s"); + else + early_printk("."); + } + } + + early_printk(":%d: ", entry->preempt_count); + + if (entry->type == TRACE_FN) { + early_printk_name(entry->u.fn.eip); + early_printk(" <= ("); + early_printk_name(entry->u.fn.parent_eip); + early_printk(")\n"); + } else { + /* special entries: */ + early_printk_name(entry->u.special.eip); + early_printk(": <%08lx> <%08lx> <%08lx>\n", + entry->u.special.v1, + entry->u.special.v2, + entry->u.special.v3); + } + __raw_spin_unlock(&early_print_lock); +} +#else +# define early_print_entry(x) do { } while(0) +#endif + +static void notrace +____trace(int cpu, enum trace_type type, struct cpu_trace *tr, + unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, unsigned long v3, + unsigned long flags) +{ + struct trace_entry *entry; + unsigned long idx, idx_next; + cycle_t timestamp; + u32 pc; + +#ifdef CONFIG_DEBUG_PREEMPT +// WARN_ON(!atomic_read(&tr->disabled)); +#endif + if (!tr->critical_start && !trace_user_triggered && !trace_all_cpus && + !trace_print_on_crash && !print_functions) + goto out; + /* + * Allocate the next index. Make sure an NMI (or interrupt) + * has not taken it away. Potentially redo the timestamp as + * well to make sure the trace timestamps are in chronologic + * order. + */ +again: + idx = tr->trace_idx; + idx_next = idx + 1; + timestamp = now(); + + if (unlikely((trace_freerunning || print_functions || atomic_read(&tr->underrun)) && + (idx_next >= MAX_TRACE) && !atomic_read(&tr->overrun))) { + atomic_inc(&tr->underrun); + idx_next = 0; + } + if (unlikely(idx >= MAX_TRACE)) { + atomic_inc(&tr->overrun); + goto out; + } +#ifdef __HAVE_ARCH_CMPXCHG + if (unlikely(cmpxchg(&tr->trace_idx, idx, idx_next) != idx)) { + if (idx_next == 0) + atomic_dec(&tr->underrun); + goto again; + } +#else +# ifdef CONFIG_SMP +# error CMPXCHG missing +# else + /* No worry, we are protected by the atomic_incr(&tr->disabled) + * in __trace further down + */ + tr->trace_idx = idx_next; +# endif +#endif + if (unlikely(idx_next != 0 && atomic_read(&tr->underrun))) + atomic_inc(&tr->underrun); + + pc = preempt_count(); + + if (unlikely(!tr->trace)) + goto out; + entry = tr->trace + idx; + entry->type = type; +#ifdef CONFIG_SMP + entry->cpu = cpu; +#endif + entry->flags = (irqs_off() ? TRACE_FLAG_IRQS_OFF : 0) | + (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_HARD_OFF : 0)| + ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | + ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | + (need_resched_delayed() ? TRACE_FLAG_NEED_RESCHED_DELAYED : 0); + entry->preempt_count = pc & 0xff; + entry->pid = current->pid; + entry->timestamp = timestamp; + + switch (type) { + case TRACE_FN: + entry->u.fn.eip = eip; + entry->u.fn.parent_eip = parent_eip; + if (unlikely(print_functions && !in_interrupt())) + early_print_entry(entry); + break; + case TRACE_SPECIAL: + case TRACE_SPECIAL_PID: + case TRACE_SPECIAL_U64: + case TRACE_SPECIAL_SYM: + entry->u.special.eip = eip; + entry->u.special.v1 = v1; + entry->u.special.v2 = v2; + entry->u.special.v3 = v3; + if (unlikely(print_functions && !in_interrupt())) + early_print_entry(entry); + break; + case TRACE_SYSCALL: + entry->u.syscall.nr = eip; + entry->u.syscall.p1 = v1; + entry->u.syscall.p2 = v2; + entry->u.syscall.p3 = v3; + break; + case TRACE_SYSRET: + entry->u.sysret.ret = eip; + break; + case TRACE_CMDLINE: + memcpy(entry->u.cmdline.str, current->comm, CMDLINE_BYTES); + break; + default: + break; + } +out: + ; +} + +static inline void notrace +___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, + unsigned long v3) +{ + struct cpu_trace *tr; + unsigned long flags; + int cpu; + + if (unlikely(trace_enabled <= 0)) + return; + +#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_X86) + debug_stackoverflow(cpu_traces + raw_smp_processor_id()); +#endif + + raw_local_irq_save(flags); + cpu = raw_smp_processor_id(); + /* + * Trace on the CPU where the current highest-prio task + * is waiting to become runnable: + */ +#ifdef CONFIG_WAKEUP_TIMING + if (wakeup_timing && !trace_all_cpus && !trace_print_on_crash && + !print_functions) { + if (!sch.tr || cpu != sch.cpu) + goto out; + tr = sch.tr; + } else + tr = cpu_traces + cpu; +#else + tr = cpu_traces + cpu; +#endif + atomic_inc(&tr->disabled); + if (likely(atomic_read(&tr->disabled) == 1)) { +//#define DEBUG_STACK_POISON +#ifdef DEBUG_STACK_POISON + char stack; + + memset(&stack - 128, 0x34, 128); +#endif + ____trace(cpu, type, tr, eip, parent_eip, v1, v2, v3, flags); + } + atomic_dec(&tr->disabled); +#ifdef CONFIG_WAKEUP_TIMING +out: +#endif + raw_local_irq_restore(flags); +} + +/* + * Special, ad-hoc tracepoints: + */ +void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3) +{ + ___trace(TRACE_SPECIAL, CALLER_ADDR0, 0, v1, v2, v3); +} + +EXPORT_SYMBOL(trace_special); + +void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2) +{ + ___trace(TRACE_SPECIAL_PID, CALLER_ADDR0, 0, pid, v1, v2); +} + +EXPORT_SYMBOL(trace_special_pid); + +void notrace trace_special_u64(unsigned long long v1, unsigned long v2) +{ + ___trace(TRACE_SPECIAL_U64, CALLER_ADDR0, 0, + (unsigned long) (v1 >> 32), (unsigned long) (v1 & 0xFFFFFFFF), + v2); +} + +EXPORT_SYMBOL(trace_special_u64); + +void notrace trace_special_sym(void) +{ +#define STACK_ENTRIES 8 + unsigned long entries[STACK_ENTRIES]; + struct stack_trace trace; + + if (!trace_enabled || !stackframe_tracing) + return; + + trace.entries = entries; + trace.skip = 3; + trace.max_entries = STACK_ENTRIES; + trace.nr_entries = 0; + + save_stack_trace(&trace); + /* + * clear out the rest: + */ + while (trace.nr_entries < trace.max_entries) + entries[trace.nr_entries++] = 0; + + ___trace(TRACE_SPECIAL_SYM, entries[0], 0, + entries[1], entries[2], entries[3]); + ___trace(TRACE_SPECIAL_SYM, entries[4], 0, + entries[5], entries[6], entries[7]); +} + +EXPORT_SYMBOL(trace_special_sym); + +/* + * Non-inlined function: + */ +void notrace __trace(unsigned long eip, unsigned long parent_eip) +{ + ___trace(TRACE_FN, eip, parent_eip, 0, 0, 0); +} + +#ifdef CONFIG_MCOUNT + +static void notrace __mcount(unsigned long ip, unsigned long parent_ip) +{ + ___trace(TRACE_FN, ip, parent_ip, 0, 0, 0); +} + +static struct mcount_ops trace_ops __read_mostly = +{ + .func = __mcount, +}; + +static int __init setup_trace_mcount(void) +{ + register_mcount_function(&trace_ops); + return 0; +} +core_initcall(setup_trace_mcount); + +void notrace call_mcount(void) +{ + /* mcount does not set up a stack frame */ + mcount(); +} + +#endif + +void notrace +sys_call(unsigned long nr, unsigned long p1, unsigned long p2, unsigned long p3) +{ + if (syscall_tracing) + ___trace(TRACE_SYSCALL, nr, 0, p1, p2, p3); +} + +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86) + +void notrace +sys_ia32_call(unsigned long nr, unsigned long p1, unsigned long p2, + unsigned long p3) +{ + if (syscall_tracing) + ___trace(TRACE_SYSCALL, nr | 0x80000000, 0, p1, p2, p3); +} + +#endif + +void notrace sys_ret(unsigned long ret) +{ + if (syscall_tracing) + ___trace(TRACE_SYSRET, ret, 0, 0, 0, 0); +} + +static void notrace print_name(struct seq_file *m, unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + /* + * Special trace values: + */ + if (((long)eip < 100000L) && ((long)eip > -100000L)) { + seq_printf(m, "<%ld>", eip); + return; + } + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + seq_printf(m, "%s+%#lx/%#lx", + sym_name, offset, size); + else + seq_printf(m, "<%08lx>", eip); +} + +static void notrace print_name_eip(struct seq_file *m, unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + if (eip) { + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + seq_printf(m, "%s+%#lx/%#lx <%08lx>", + sym_name, offset, size, eip); + else + seq_printf(m, "<%08lx>", eip); + } else + seq_printf(m, "0"); +} + +static unsigned long out_sequence = -1; + +static int pid_to_cmdline_array[PID_MAX_DEFAULT+1]; + +static void notrace _trace_cmdline(int cpu, struct cpu_trace *tr) +{ + unsigned long flags; + + local_save_flags(flags); + ____trace(cpu, TRACE_CMDLINE, tr, 0, 0, 0, 0, 0, flags); +} + +void notrace trace_cmdline(void) +{ + ___trace(TRACE_CMDLINE, 0, 0, 0, 0, 0); +} + +static void construct_pid_to_cmdline(struct cpu_trace *tr) +{ + unsigned int i, j, entries, pid; + + if (tr->critical_sequence == out_sequence) + return; + out_sequence = tr->critical_sequence; + + memset(pid_to_cmdline_array, -1, sizeof(int) * (PID_MAX_DEFAULT + 1)); + + if (!tr->trace) + return; + + entries = min(tr->trace_idx, MAX_TRACE); + + for (i = 0; i < entries; i++) { + struct trace_entry *entry = tr->trace + i; + + if (entry->type != TRACE_CMDLINE) + continue; + pid = entry->pid; + if (pid < PID_MAX_DEFAULT) { + pid_to_cmdline_array[pid] = i; + /* + * Replace space with underline - makes it easier + * to process for tools: + */ + for (j = 0; j < CMDLINE_BYTES; j++) + if (entry->u.cmdline.str[j] == ' ') + entry->u.cmdline.str[j] = '_'; + } + } +} + +char *pid_to_cmdline(unsigned long pid) +{ + struct cpu_trace *tr = out_tr.traces + 0; + char *cmdline = "<...>"; + int idx; + + pid = min(pid, (unsigned long)PID_MAX_DEFAULT); + if (!pid) + return ""; + + if (pid_to_cmdline_array[pid] != -1) { + idx = pid_to_cmdline_array[pid]; + if (tr->trace[idx].type == TRACE_CMDLINE) + cmdline = tr->trace[idx].u.cmdline.str; + } + return cmdline; +} + +static void copy_trace(struct cpu_trace *save, struct cpu_trace *tr, int reorder) +{ + if (!save->trace || !tr->trace) + return; + /* free-running needs reordering */ + if (reorder && atomic_read(&tr->underrun)) { + int i, idx, idx0 = tr->trace_idx; + + for (i = 0; i < MAX_TRACE; i++) { + idx = (idx0 + i) % MAX_TRACE; + save->trace[i] = tr->trace[idx]; + } + save->trace_idx = MAX_TRACE; + } else { + save->trace_idx = tr->trace_idx; + + memcpy(save->trace, tr->trace, + min(save->trace_idx, MAX_TRACE) * + sizeof(struct trace_entry)); + } + save->underrun = tr->underrun; + save->overrun = tr->overrun; +} + + +struct block_idx { + int idx[NR_CPUS]; +}; + +/* + * return the trace entry (position) of the smallest-timestamp + * one (that is still in the valid idx range): + */ +static int min_idx(struct block_idx *bidx) +{ + cycle_t min_stamp = (cycle_t) -1; + struct trace_entry *entry; + int cpu, min_cpu = -1, idx; + + for_each_online_cpu(cpu) { + idx = bidx->idx[cpu]; + if (idx >= min(max_tr.traces[cpu].trace_idx, MAX_TRACE)) + continue; + if (idx > MAX_TRACE*NR_CPUS) { + printk("huh: idx (%d) > %ld*%d!\n", idx, MAX_TRACE, + NR_CPUS); + WARN_ON(1); + break; + } + entry = max_tr.traces[cpu].trace + bidx->idx[cpu]; + if (entry->timestamp < min_stamp) { + min_cpu = cpu; + min_stamp = entry->timestamp; + } + } + + return min_cpu; +} + +/* + * This code is called to construct an output trace from + * the maximum trace. Having separate traces serves both + * atomicity (a new max might be saved while we are busy + * accessing /proc/latency_trace) and it is also used to + * delay the (expensive) sorting of the output trace by + * timestamps, in the trace_all_cpus case. + */ +static void update_out_trace(void) +{ + struct trace_entry *out_entry, *entry, *tmp; + cycle_t stamp, first_stamp, last_stamp; + struct block_idx bidx = { { 0, }, }; + struct cpu_trace *tmp_max, *tmp_out; + int cpu, sum, entries, underrun_sum, overrun_sum; + + /* + * For out_tr we only have the first array's trace entries + * allocated - and they have are larger on SMP to make room + * for all trace entries from all CPUs. + */ + tmp_out = out_tr.traces + 0; + tmp_max = max_tr.traces + max_tr.cpu; + /* + * Easier to copy this way. Note: the trace buffer is private + * to the output buffer, so preserve it: + */ + copy_trace(tmp_out, tmp_max, 0); + tmp = tmp_out->trace; + *tmp_out = *tmp_max; + tmp_out->trace = tmp; + + out_tr.cpu = max_tr.cpu; + + if (!tmp_out->trace) + return; + + out_entry = tmp_out->trace + 0; + + if (!trace_all_cpus) { + entries = min(tmp_out->trace_idx, MAX_TRACE); + if (!entries) + return; + out_tr.first_timestamp = tmp_out->trace[0].timestamp; + out_tr.last_timestamp = tmp_out->trace[entries-1].timestamp; + return; + } + /* + * Find the range of timestamps that are fully traced in + * all CPU traces. (since CPU traces can cover a variable + * range of time, we have to find the best range.) + */ + first_stamp = 0; + for_each_online_cpu(cpu) { + tmp_max = max_tr.traces + cpu; + stamp = tmp_max->trace[0].timestamp; + if (stamp > first_stamp) + first_stamp = stamp; + } + /* + * Save the timestamp range: + */ + tmp_max = max_tr.traces + max_tr.cpu; + entries = min(tmp_max->trace_idx, MAX_TRACE); + /* + * No saved trace yet? + */ + if (!entries) { + out_tr.traces[0].trace_idx = 0; + return; + } + + last_stamp = tmp_max->trace[entries-1].timestamp; + + if (last_stamp < first_stamp) { + WARN_ON(1); + + for_each_online_cpu(cpu) { + tmp_max = max_tr.traces + cpu; + entries = min(tmp_max->trace_idx, MAX_TRACE); + printk("CPU%d: %016Lx (%016Lx) ... #%d (%016Lx) %016Lx\n", + cpu, + tmp_max->trace[0].timestamp, + tmp_max->trace[1].timestamp, + entries, + tmp_max->trace[entries-2].timestamp, + tmp_max->trace[entries-1].timestamp); + } + tmp_max = max_tr.traces + max_tr.cpu; + entries = min(tmp_max->trace_idx, MAX_TRACE); + + printk("CPU%d entries: %d\n", max_tr.cpu, entries); + printk("first stamp: %016Lx\n", first_stamp); + printk(" last stamp: %016Lx\n", first_stamp); + } + +#if 0 + printk("first_stamp: %Ld [%016Lx]\n", first_stamp, first_stamp); + printk(" last_stamp: %Ld [%016Lx]\n", last_stamp, last_stamp); + printk(" +1 stamp: %Ld [%016Lx]\n", + tmp_max->trace[entries].timestamp, + tmp_max->trace[entries].timestamp); + printk(" +2 stamp: %Ld [%016Lx]\n", + tmp_max->trace[entries+1].timestamp, + tmp_max->trace[entries+1].timestamp); + printk(" delta: %Ld\n", last_stamp-first_stamp); + printk(" entries: %d\n", entries); +#endif + + out_tr.first_timestamp = first_stamp; + out_tr.last_timestamp = last_stamp; + + /* + * Fetch trace entries one by one, in increasing timestamp + * order. Start at first_stamp, stop at last_stamp: + */ + sum = 0; + for (;;) { + cpu = min_idx(&bidx); + if (cpu == -1) + break; + entry = max_tr.traces[cpu].trace + bidx.idx[cpu]; + if (entry->timestamp > last_stamp) + break; + + bidx.idx[cpu]++; + if (entry->timestamp < first_stamp) + continue; + *out_entry = *entry; + out_entry++; + sum++; + if (sum > MAX_TRACE*NR_CPUS) { + printk("huh: sum (%d) > %ld*%d!\n", sum, MAX_TRACE, + NR_CPUS); + WARN_ON(1); + break; + } + } + + sum = 0; + underrun_sum = 0; + overrun_sum = 0; + for_each_online_cpu(cpu) { + sum += max_tr.traces[cpu].trace_idx; + underrun_sum += atomic_read(&max_tr.traces[cpu].underrun); + overrun_sum += atomic_read(&max_tr.traces[cpu].overrun); + } + tmp_out->trace_idx = sum; + atomic_set(&tmp_out->underrun, underrun_sum); + atomic_set(&tmp_out->overrun, overrun_sum); +} + +static void notrace print_help_header(struct seq_file *m) +{ + seq_puts(m, " _------=> CPU# \n"); + seq_puts(m, " / _-----=> irqs-off \n"); + seq_puts(m, " | / _----=> need-resched \n"); + seq_puts(m, " || / _---=> hardirq/softirq \n"); + seq_puts(m, " ||| / _--=> preempt-depth \n"); + seq_puts(m, " |||| / \n"); + seq_puts(m, " ||||| delay \n"); + seq_puts(m, " cmd pid ||||| time | caller \n"); + seq_puts(m, " \\ / ||||| \\ | / \n"); +} + +static void * notrace l_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + unsigned long entries; + struct cpu_trace *tr = out_tr.traces + 0; + + down(&out_mutex); + /* + * if the file is being read newly, update the output trace: + */ + if (!n) { + // TODO: use the sequence counter here to optimize + down(&max_mutex); + update_out_trace(); + up(&max_mutex); +#if 0 + if (!tr->trace_idx) { + up(&out_mutex); + return NULL; + } +#endif + construct_pid_to_cmdline(tr); + } + entries = min(tr->trace_idx, MAX_TRACE); + + if (!n) { + seq_printf(m, "preemption latency trace v1.1.5 on %s\n", + UTS_RELEASE); + seq_puts(m, "--------------------------------------------------------------------\n"); + seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d | (M:%s VP:%d, KP:%d, SP:%d HP:%d", + cycles_to_usecs(tr->saved_latency), + entries, + (entries + atomic_read(&tr->underrun) + + atomic_read(&tr->overrun)), + out_tr.cpu, +#if defined(CONFIG_PREEMPT_NONE) + "server", +#elif defined(CONFIG_PREEMPT_VOLUNTARY) + "desktop", +#elif defined(CONFIG_PREEMPT_DESKTOP) + "preempt", +#else + "rt", +#endif + 0, 0, +#ifdef CONFIG_PREEMPT_SOFTIRQS + softirq_preemption +#else + 0 +#endif + , +#ifdef CONFIG_PREEMPT_HARDIRQS + hardirq_preemption +#else + 0 +#endif + ); +#ifdef CONFIG_SMP + seq_printf(m, " #P:%d)\n", num_online_cpus()); +#else + seq_puts(m, ")\n"); +#endif + seq_puts(m, " -----------------\n"); + seq_printf(m, " | task: %.16s-%d (uid:%ld nice:%ld policy:%ld rt_prio:%ld)\n", + tr->comm, tr->pid, tr->uid, tr->nice, + tr->policy, tr->rt_priority); + seq_puts(m, " -----------------\n"); + if (trace_user_triggered) { + seq_puts(m, " => started at: "); + print_name_eip(m, tr->critical_start); + seq_puts(m, "\n => ended at: "); + print_name_eip(m, tr->critical_end); + seq_puts(m, "\n"); + } + seq_puts(m, "\n"); + + if (!trace_verbose) + print_help_header(m); + } + if (n >= entries || !tr->trace) + return NULL; + + return tr->trace + n; +} + +static void * notrace l_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct cpu_trace *tr = out_tr.traces; + unsigned long entries = min(tr->trace_idx, MAX_TRACE); + + WARN_ON(!tr->trace); + + if (++*pos >= entries) { + if (*pos == entries) + seq_puts(m, "\n\nvim:ft=help\n"); + return NULL; + } + return tr->trace + *pos; +} + +static void notrace l_stop(struct seq_file *m, void *p) +{ + up(&out_mutex); +} + +unsigned long preempt_mark_thresh = 100; + +static void print_timestamp(struct seq_file *m, unsigned long abs_usecs, + unsigned long rel_usecs) +{ + seq_printf(m, " %4ldus", abs_usecs); + if (rel_usecs > preempt_mark_thresh) + seq_puts(m, "!: "); + else if (rel_usecs > 1) + seq_puts(m, "+: "); + else + seq_puts(m, " : "); +} + +static void +print_timestamp_short(struct seq_file *m, unsigned long abs_usecs, + unsigned long rel_usecs) +{ + seq_printf(m, " %4ldus", abs_usecs); + if (rel_usecs > 100) + seq_putc(m, '!'); + else if (rel_usecs > 1) + seq_putc(m, '+'); + else + seq_putc(m, ' '); +} + +static void +print_generic(struct seq_file *m, struct trace_entry *entry) +{ + int hardirq, softirq; + + seq_printf(m, "%8.8s-%-5d ", pid_to_cmdline(entry->pid), entry->pid); + seq_printf(m, "%d", entry->cpu); + seq_printf(m, "%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED_DELAYED) ? 'n' : + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + seq_putc(m, 'H'); + else { + if (hardirq) + seq_putc(m, 'h'); + else { + if (softirq) + seq_putc(m, 's'); + else + seq_putc(m, '.'); + } + } + + if (entry->preempt_count) + seq_printf(m, "%x", entry->preempt_count); + else + seq_puts(m, "."); +} + + +static int notrace l_show_fn(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + if (trace_verbose) { + seq_printf(m, "%16s %5d %d %d %08x %08lx [%016Lx] %ld.%03ldms (+%ld.%03ldms): ", + pid_to_cmdline(entry->pid), + entry->pid, entry->cpu, entry->flags, + entry->preempt_count, trace_idx, + entry->timestamp, abs_usecs/1000, + abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); + print_name_eip(m, entry->u.fn.eip); + seq_puts(m, " ("); + print_name_eip(m, entry->u.fn.parent_eip); + seq_puts(m, ")\n"); + } else { + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + print_name(m, entry->u.fn.eip); + seq_puts(m, " ("); + print_name(m, entry->u.fn.parent_eip); + seq_puts(m, ")\n"); + } + return 0; +} + +static int notrace l_show_special(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry, int mode64) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + if (trace_verbose) + print_name_eip(m, entry->u.special.eip); + else + print_name(m, entry->u.special.eip); + + if (!mode64) { + /* + * For convenience, print small numbers in decimal: + */ + if (abs((int)entry->u.special.v1) < 100000) + seq_printf(m, " (%5ld ", entry->u.special.v1); + else + seq_printf(m, " (%lx ", entry->u.special.v1); + if (abs((int)entry->u.special.v2) < 100000) + seq_printf(m, "%5ld ", entry->u.special.v2); + else + seq_printf(m, "%lx ", entry->u.special.v2); + if (abs((int)entry->u.special.v3) < 100000) + seq_printf(m, "%5ld)\n", entry->u.special.v3); + else + seq_printf(m, "%lx)\n", entry->u.special.v3); + } else { + seq_printf(m, " (%13Ld %ld)\n", + ((u64)entry->u.special.v1 << 32) + + (u64)entry->u.special.v2, entry->u.special.v3); + } + return 0; +} + +static int notrace +l_show_special_pid(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + unsigned int pid; + + pid = entry->u.special.v1; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + if (trace_verbose) + print_name_eip(m, entry->u.special.eip); + else + print_name(m, entry->u.special.eip); + seq_printf(m, " <%.8s-%d> (%ld %ld)\n", + pid_to_cmdline(pid), pid, + entry->u.special.v2, entry->u.special.v3); + + return 0; +} + +static int notrace +l_show_special_sym(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry, int mode64) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + if (trace_verbose) + print_name_eip(m, entry->u.special.eip); + else + print_name(m, entry->u.special.eip); + + seq_puts(m, "()<-"); + print_name(m, entry->u.special.v1); + seq_puts(m, "()<-"); + print_name(m, entry->u.special.v2); + seq_puts(m, "()<-"); + print_name(m, entry->u.special.v3); + seq_puts(m, "()\n"); + + return 0; +} + + +static int notrace l_show_cmdline(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + if (!trace_verbose) + return 0; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + seq_printf(m, + "[ => %16s ] %ld.%03ldms (+%ld.%03ldms)\n", + entry->u.cmdline.str, + abs_usecs/1000, abs_usecs % 1000, + rel_usecs/1000, rel_usecs % 1000); + + return 0; +} + +extern unsigned long sys_call_table[NR_syscalls]; + +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86) +extern unsigned long ia32_sys_call_table[], ia32_syscall_end[]; +#define IA32_NR_syscalls (ia32_syscall_end - ia32_sys_call_table) +#endif + +static int notrace l_show_syscall(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + unsigned long nr; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp_short(m, abs_usecs, rel_usecs); + + seq_puts(m, "> "); + nr = entry->u.syscall.nr; +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86) + if (nr & 0x80000000) { + nr &= ~0x80000000; + if (nr < IA32_NR_syscalls) + print_name(m, ia32_sys_call_table[nr]); + else + seq_printf(m, "", nr); + } else +#endif + if (nr < NR_syscalls) + print_name(m, sys_call_table[nr]); + else + seq_printf(m, "", nr); + +#ifdef CONFIG_64BIT + seq_printf(m, " (%016lx %016lx %016lx)\n", + entry->u.syscall.p1, entry->u.syscall.p2, entry->u.syscall.p3); +#else + seq_printf(m, " (%08lx %08lx %08lx)\n", + entry->u.syscall.p1, entry->u.syscall.p2, entry->u.syscall.p3); +#endif + + return 0; +} + +static int notrace l_show_sysret(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp_short(m, abs_usecs, rel_usecs); + + seq_printf(m, "< (%ld)\n", entry->u.sysret.ret); + + return 0; +} + + +static int notrace l_show(struct seq_file *m, void *p) +{ + struct cpu_trace *tr = out_tr.traces; + struct trace_entry *entry, *entry0, *next_entry; + unsigned long trace_idx; + + cond_resched(); + entry = p; + if (entry->timestamp < out_tr.first_timestamp) + return 0; + if (entry->timestamp > out_tr.last_timestamp) + return 0; + + entry0 = tr->trace; + trace_idx = entry - entry0; + + if (trace_idx + 1 < tr->trace_idx) + next_entry = entry + 1; + else + next_entry = entry; + + if (trace_verbose) + seq_printf(m, "(T%d/#%ld) ", entry->type, trace_idx); + + switch (entry->type) { + case TRACE_FN: + l_show_fn(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SPECIAL: + l_show_special(m, trace_idx, entry, entry0, next_entry, 0); + break; + case TRACE_SPECIAL_PID: + l_show_special_pid(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SPECIAL_U64: + l_show_special(m, trace_idx, entry, entry0, next_entry, 1); + break; + case TRACE_SPECIAL_SYM: + l_show_special_sym(m, trace_idx, entry, entry0, + next_entry, 1); + break; + case TRACE_CMDLINE: + l_show_cmdline(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SYSCALL: + l_show_syscall(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SYSRET: + l_show_sysret(m, trace_idx, entry, entry0, next_entry); + break; + default: + seq_printf(m, "unknown trace type %d\n", entry->type); + } + return 0; +} + +struct seq_operations latency_trace_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /proc/latency_trace) + */ +static void update_max_tr(struct cpu_trace *tr) +{ + struct cpu_trace *save; + int cpu, all_cpus = 0; + +#ifdef CONFIG_PREEMPT + WARN_ON(!preempt_count() && !irqs_disabled()); +#endif + + max_tr.cpu = tr->cpu; + save = max_tr.traces + tr->cpu; + + if ((wakeup_timing || trace_user_triggered || trace_print_on_crash || + print_functions) && trace_all_cpus) { + all_cpus = 1; + for_each_online_cpu(cpu) + atomic_inc(&cpu_traces[cpu].disabled); + } + + save->saved_latency = __preempt_max_latency; + save->preempt_timestamp = tr->preempt_timestamp; + save->critical_start = tr->critical_start; + save->critical_end = tr->critical_end; + save->critical_sequence = tr->critical_sequence; + + memcpy(save->comm, current->comm, CMDLINE_BYTES); + save->pid = current->pid; + save->uid = current->uid; + save->nice = current->static_prio - 20 - MAX_RT_PRIO; + save->policy = current->policy; + save->rt_priority = current->rt_priority; + + if (all_cpus) { + for_each_online_cpu(cpu) { + copy_trace(max_tr.traces + cpu, cpu_traces + cpu, 1); + atomic_dec(&cpu_traces[cpu].disabled); + } + } else + copy_trace(save, tr, 1); +} + +#else /* !EVENT_TRACE */ + +static inline void notrace +____trace(int cpu, enum trace_type type, struct cpu_trace *tr, + unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, unsigned long v3, + unsigned long flags) +{ +} + +static inline void notrace +___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, + unsigned long v3) +{ +} + +static inline void notrace __trace(unsigned long eip, unsigned long parent_eip) +{ +} + +static inline void update_max_tr(struct cpu_trace *tr) +{ +} + +static inline void notrace _trace_cmdline(int cpu, struct cpu_trace *tr) +{ +} + +#endif + +static int setup_preempt_thresh(char *s) +{ + int thresh; + + get_option(&s, &thresh); + if (thresh > 0) { + __preempt_thresh = usecs_to_cycles(thresh); + printk("Preemption threshold = %u us\n", thresh); + } + return 1; +} +__setup("preempt_thresh=", setup_preempt_thresh); + +static inline void notrace reset_trace_idx(int cpu, struct cpu_trace *tr) +{ + if (trace_all_cpus) + for_each_online_cpu(cpu) { + tr = cpu_traces + cpu; + tr->trace_idx = 0; + atomic_set(&tr->underrun, 0); + atomic_set(&tr->overrun, 0); + } + else{ + tr->trace_idx = 0; + atomic_set(&tr->underrun, 0); + atomic_set(&tr->overrun, 0); + } +} + +#ifdef CONFIG_CRITICAL_TIMING + +static void notrace +check_critical_timing(int cpu, struct cpu_trace *tr, unsigned long parent_eip) +{ + unsigned long latency, t0, t1; + cycle_t T0, T1, T2, delta; + unsigned long flags; + + if (trace_user_triggered) + return; + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = tr->preempt_timestamp; + T1 = now(); + delta = T1-T0; + + local_save_flags(flags); + + if (!report_latency(delta)) + goto out; + + ____trace(cpu, TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, flags); + /* + * Update the timestamp, because the trace entry above + * might change it (it can only get larger so the latency + * is fair to be reported): + */ + T2 = now(); + + delta = T2-T0; + + latency = cycles_to_usecs(delta); + latency_hist(tr->latency_type, cpu, latency); + + if (latency_hist_flag) { + if (__preempt_max_latency >= delta) + goto out; + } + + if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex)) + goto out; + +#ifndef CONFIG_CRITICAL_LATENCY_HIST + if (!__preempt_thresh && __preempt_max_latency > delta) { + printk("bug: updating %016Lx > %016Lx?\n", + (u64)__preempt_max_latency, (u64)delta); + printk(" [%016Lx %016Lx %016Lx]\n", (u64)T0, (u64)T1, (u64)T2); + } +#endif + + __preempt_max_latency = delta; + t0 = cycles_to_usecs(T0); + t1 = cycles_to_usecs(T1); + + tr->critical_end = parent_eip; + + update_max_tr(tr); + +#ifndef CONFIG_CRITICAL_LATENCY_HIST + if (__preempt_thresh) + printk("(%16s-%-5d|#%d): %lu us critical section " + "violates %lu us threshold.\n" + " => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, cycles_to_usecs(__preempt_thresh), t0); + else + printk("(%16s-%-5d|#%d): new %lu us maximum-latency " + "critical section.\n => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, t0); + + print_symbol("<%s>\n", tr->critical_start); + printk(" => ended at timestamp %lu: ", t1); + print_symbol("<%s>\n", tr->critical_end); + dump_stack(); + t1 = cycles_to_usecs(now()); + printk(" => dump-end timestamp %lu\n\n", t1); +#endif + + max_sequence++; + + up(&max_mutex); + +out: + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = now(); + tr->early_warning = 0; + reset_trace_idx(cpu, tr); + _trace_cmdline(cpu, tr); + ____trace(cpu, TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, flags); +} + +void notrace touch_critical_timing(void) +{ + int cpu = raw_smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + + if (!tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + if (preempt_count() > 0 && tr->critical_start) { + atomic_inc(&tr->disabled); + check_critical_timing(cpu, tr, CALLER_ADDR0); + tr->critical_start = CALLER_ADDR0; + tr->critical_sequence = max_sequence; + atomic_dec(&tr->disabled); + } +} +EXPORT_SYMBOL(touch_critical_timing); + +void notrace stop_critical_timing(void) +{ + struct cpu_trace *tr = cpu_traces + raw_smp_processor_id(); + + tr->critical_start = 0; +} +EXPORT_SYMBOL(stop_critical_timing); + +static inline void notrace +__start_critical_timing(unsigned long eip, unsigned long parent_eip, + int latency_type) +{ + int cpu = raw_smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + unsigned long flags; + + if (tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + atomic_inc(&tr->disabled); + + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = now(); + tr->critical_start = eip; + reset_trace_idx(cpu, tr); + tr->latency_type = latency_type; + _trace_cmdline(cpu, tr); + + local_save_flags(flags); + ____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0, flags); + + atomic_dec(&tr->disabled); +} + +static inline void notrace +__stop_critical_timing(unsigned long eip, unsigned long parent_eip) +{ + int cpu = raw_smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + unsigned long flags; + + if (!tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + atomic_inc(&tr->disabled); + local_save_flags(flags); + ____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0, flags); + check_critical_timing(cpu, tr, eip); + tr->critical_start = 0; + atomic_dec(&tr->disabled); +} + +#endif + +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + +#ifdef CONFIG_LOCKDEP + +void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_off_preempt_count() && irqs_disabled_flags(flags)) + __stop_critical_timing(a0, a1); +} + +void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_off_preempt_count() && irqs_disabled_flags(flags)) + __start_critical_timing(a0, a1, INTERRUPT_LATENCY); +} + +#else /* !CONFIG_LOCKDEP */ + +/* + * Dummy: + */ + +void early_boot_irqs_off(void) +{ +} + +void early_boot_irqs_on(void) +{ +} + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} + +/* + * We are only interested in hardirq on/off events: + */ +void notrace trace_hardirqs_on(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_off_preempt_count() && irqs_disabled_flags(flags)) + __stop_critical_timing(CALLER_ADDR0, 0 /* CALLER_ADDR1 */); +} + +EXPORT_SYMBOL(trace_hardirqs_on); + +void notrace trace_hardirqs_off(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_off_preempt_count() && irqs_disabled_flags(flags)) + __start_critical_timing(CALLER_ADDR0, 0 /* CALLER_ADDR1 */, + INTERRUPT_LATENCY); +} + +EXPORT_SYMBOL(trace_hardirqs_off); + +/* used by x86_64 thunk.S */ +void notrace trace_hardirqs_on_caller(unsigned long caller_addr) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_off_preempt_count() && irqs_disabled_flags(flags)) + __stop_critical_timing(caller_addr, 0 /* CALLER_ADDR1 */); +} + +void notrace trace_hardirqs_off_caller(unsigned long caller_addr) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_off_preempt_count() && irqs_disabled_flags(flags)) + __start_critical_timing(caller_addr, 0 /* CALLER_ADDR1 */, + INTERRUPT_LATENCY); +} + +#endif /* !CONFIG_LOCKDEP */ + +#endif /* CONFIG_CRITICAL_IRQSOFF_TIMING */ + +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING) + +static inline unsigned long get_parent_eip(void) +{ + unsigned long parent_eip = CALLER_ADDR1; + + if (in_lock_functions(parent_eip)) { + parent_eip = CALLER_ADDR2; + if (in_lock_functions(parent_eip)) + parent_eip = CALLER_ADDR3; + } + + return parent_eip; +} + +void notrace add_preempt_count(unsigned int val) +{ + unsigned long eip = CALLER_ADDR0; + unsigned long parent_eip = get_parent_eip(); + +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_WARN_ON(((int)preempt_count() < 0))) + return; + /* + * Spinlock count overflowing soon? + */ + if (DEBUG_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10)) + return; +#endif + + preempt_count() += val; +#ifdef CONFIG_PREEMPT_TRACE + if (val <= 10) { + unsigned int idx = preempt_count() & PREEMPT_MASK; + if (idx < MAX_PREEMPT_TRACE) { + current->preempt_trace_eip[idx] = eip; + current->preempt_trace_parent_eip[idx] = parent_eip; + } + } +#endif +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) +#endif + if (preempt_count() == val) + __start_critical_timing(eip, parent_eip, + PREEMPT_LATENCY); + } +#endif + (void)eip, (void)parent_eip; +} +EXPORT_SYMBOL(add_preempt_count); + +void notrace sub_preempt_count(unsigned int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_WARN_ON(unlikely(val > preempt_count()))) + return; + /* + * Is the spinlock portion underflowing? + */ + if (DEBUG_WARN_ON((val < PREEMPT_MASK) && + !(preempt_count() & PREEMPT_MASK))) + return; +#endif + +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) +#endif + if (preempt_count() == val) + __stop_critical_timing(CALLER_ADDR0, + CALLER_ADDR1); + } +#endif + preempt_count() -= val; +} + +EXPORT_SYMBOL(sub_preempt_count); + +void notrace mask_preempt_count(unsigned int mask) +{ + unsigned long eip = CALLER_ADDR0; + unsigned long parent_eip = get_parent_eip(); + + preempt_count() |= mask; + +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) +#endif + if (preempt_count() == mask) + __start_critical_timing(eip, parent_eip, + PREEMPT_LATENCY); + } +#endif + (void) eip, (void) parent_eip; +} +EXPORT_SYMBOL(mask_preempt_count); + +void notrace unmask_preempt_count(unsigned int mask) +{ +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) +#endif + if (preempt_count() == mask) + __stop_critical_timing(CALLER_ADDR0, + CALLER_ADDR1); + } +#endif + preempt_count() &= ~mask; +} +EXPORT_SYMBOL(unmask_preempt_count); + +#if defined(CONFIG_CRITICAL_PREEMPT_TIMING) && defined(CONFIG_TRACE_IRQFLAGS) + +/* Some archs do their cpu_idle with preemption on. Don't measure it */ +void notrace trace_preempt_enter_idle(void) +{ + __get_cpu_var(trace_cpu_idle) = 1; +} + +void notrace trace_preempt_exit_idle(void) +{ + __get_cpu_var(trace_cpu_idle) = 0; +} + +#endif /* CONFIG_CRITICAL_PREEMPT_TIMING */ + +#endif + +/* + * Wakeup latency timing/tracing. We get upcalls from the scheduler + * when a task is being woken up and we time/trace it until it gets + * to a CPU - or an even-higher-prio task supercedes it. (in that + * case we throw away the currently traced task - we dont try to + * handle nesting, that simplifies things significantly) + */ +#ifdef CONFIG_WAKEUP_TIMING + +unsigned long last_preempt_max_latency; + +static void notrace +check_wakeup_timing(struct cpu_trace *tr, unsigned long parent_eip, + unsigned long *flags) +{ + int cpu = raw_smp_processor_id(); + unsigned long latency, t0, t1; + cycle_t T0, T1, delta; + + if (trace_user_triggered) + return; + + atomic_inc(&tr->disabled); + if (atomic_read(&tr->disabled) != 1) + goto out; + + T0 = tr->preempt_timestamp; + T1 = now(); + /* + * Any wraparound or time warp and we are out: + */ + if (T0 > T1) + goto out; + delta = T1-T0; + + if (!report_latency(delta)) + goto out; + + ____trace(smp_processor_id(), TRACE_FN, tr, CALLER_ADDR0, parent_eip, + 0, 0, 0, *flags); + + latency = cycles_to_usecs(delta); + latency_hist(tr->latency_type, cpu, latency); + + if (latency_hist_flag) { + if (__preempt_max_latency >= delta) + goto out; + } + + if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex)) + goto out; + +#ifndef CONFIG_WAKEUP_LATENCY_HIST + if (!__preempt_thresh && __preempt_max_latency > delta) { + printk("bug2: updating %016lx > %016Lx?\n", + __preempt_max_latency, delta); + printk(" [%016Lx %016Lx]\n", T0, T1); + } +#endif + + __preempt_max_latency = delta; + t0 = cycles_to_usecs(T0); + t1 = cycles_to_usecs(T1); + tr->critical_end = parent_eip; + + update_max_tr(tr); + + atomic_dec(&tr->disabled); + __raw_spin_unlock(&sch.trace_lock); + local_irq_restore(*flags); + +#ifndef CONFIG_WAKEUP_LATENCY_HIST + if (__preempt_thresh) + printk("(%16s-%-5d|#%d): %lu us wakeup latency " + "violates %lu us threshold.\n", + current->comm, current->pid, + raw_smp_processor_id(), latency, + cycles_to_usecs(__preempt_thresh)); + else + printk("(%16s-%-5d|#%d): new %lu us maximum-latency " + "wakeup.\n", current->comm, current->pid, + raw_smp_processor_id(), latency); +#endif + + max_sequence++; + + up(&max_mutex); + + return; + +out: + atomic_dec(&tr->disabled); + __raw_spin_unlock(&sch.trace_lock); + local_irq_restore(*flags); +} + +/* + * Start wakeup latency tracing - called with the runqueue held + * and interrupts disabled: + */ +void __trace_start_sched_wakeup(struct task_struct *p) +{ + struct cpu_trace *tr; + int cpu; + + if (trace_user_triggered || !wakeup_timing) { + trace_special_pid(p->pid, p->prio, -1); + return; + } + + __raw_spin_lock(&sch.trace_lock); + if (sch.task && (sch.task->prio <= p->prio)) + goto out_unlock; + + /* + * New highest-prio task just woke up - start tracing: + */ + sch.task = p; + cpu = task_cpu(p); + sch.cpu = cpu; + /* + * We keep using this CPU's trace buffer even if the task + * gets migrated to another CPU. Tracing only happens on + * the CPU that 'owns' the highest-prio task so it's + * fundamentally single-threaded. + */ + sch.tr = tr = cpu_traces + cpu; + reset_trace_idx(cpu, tr); + +// if (!atomic_read(&tr->disabled)) { + atomic_inc(&tr->disabled); + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = now(); + tr->latency_type = WAKEUP_LATENCY; + tr->critical_start = CALLER_ADDR0; + _trace_cmdline(raw_smp_processor_id(), tr); + atomic_dec(&tr->disabled); +// } + + call_mcount(); + trace_special_pid(p->pid, p->prio, cpu); + trace_special_sym(); +out_unlock: + __raw_spin_unlock(&sch.trace_lock); +} + +void trace_stop_sched_switched(struct task_struct *p) +{ + struct cpu_trace *tr; + unsigned long flags; + + if (trace_user_triggered || !wakeup_timing) + return; + + local_irq_save(flags); + __raw_spin_lock(&sch.trace_lock); + if (p == sch.task) { + trace_special_pid(p->pid, p->prio, task_cpu(p)); + + sch.task = NULL; + tr = sch.tr; + sch.tr = NULL; + WARN_ON(!tr); + /* auto-unlocks the spinlock: */ + check_wakeup_timing(tr, CALLER_ADDR0, &flags); + } else { + if (sch.task) + trace_special_pid(sch.task->pid, sch.task->prio, + p->prio); + if (sch.task && (sch.task->prio >= p->prio)) + sch.task = NULL; + __raw_spin_unlock(&sch.trace_lock); + } + local_irq_restore(flags); +} + +void trace_change_sched_cpu(struct task_struct *p, int new_cpu) +{ + unsigned long flags; + + if (!wakeup_timing) + return; + + trace_special_pid(p->pid, task_cpu(p), new_cpu); + trace_special_sym(); + local_irq_save(flags); + __raw_spin_lock(&sch.trace_lock); + if (p == sch.task && task_cpu(p) != new_cpu) { + sch.cpu = new_cpu; + trace_special(task_cpu(p), new_cpu, 0); + } + __raw_spin_unlock(&sch.trace_lock); + local_irq_restore(flags); +} + +#endif + +#ifdef CONFIG_EVENT_TRACE + +long user_trace_start(void) +{ + struct cpu_trace *tr; + unsigned long flags; + int cpu; + + if (!trace_user_triggered || trace_print_on_crash || print_functions) + return -EINVAL; + + /* + * If the user has not yet reset the max latency after + * bootup then we assume that this was the intention + * (we wont get any tracing done otherwise): + */ + if (__preempt_max_latency == (cycle_t)ULONG_MAX) + __preempt_max_latency = 0; + + /* + * user_trace_start() might be called from hardirq + * context, if trace_user_triggered_irq is set, so + * be careful about locking: + */ + if (preempt_count() || irqs_disabled()) { + if (down_trylock(&max_mutex)) + return -EAGAIN; + } else + down(&max_mutex); + + local_irq_save(flags); + cpu = smp_processor_id(); + tr = cpu_traces + cpu; + +#ifdef CONFIG_WAKEUP_TIMING + if (wakeup_timing) { + __raw_spin_lock(&sch.trace_lock); + sch.task = current; + sch.cpu = cpu; + sch.tr = tr; + __raw_spin_unlock(&sch.trace_lock); + } +#endif + reset_trace_idx(cpu, tr); + + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = now(); + tr->critical_start = CALLER_ADDR0; + _trace_cmdline(cpu, tr); + call_mcount(); + + WARN_ON(!irqs_disabled()); + local_irq_restore(flags); + + up(&max_mutex); + + return 0; +} + +EXPORT_SYMBOL_GPL(user_trace_start); + +long user_trace_stop(void) +{ + unsigned long latency = 0, flags; + struct cpu_trace *tr; + cycle_t delta; + + if (!trace_user_triggered || trace_print_on_crash || print_functions) + return -EINVAL; + + local_irq_save(flags); + call_mcount(); + +#ifdef CONFIG_WAKEUP_TIMING + if (wakeup_timing) { + struct task_struct *t; + + __raw_spin_lock(&sch.trace_lock); + t = sch.task; + if (current != t) { + __raw_spin_unlock(&sch.trace_lock); + local_irq_restore(flags); + printk("wrong stop: curr: %s/%d[%d] => %p\n", + current->comm, current->pid, + task_thread_info(current)->cpu, t); + if (t) + printk("wrong stop: curr: %s/%d[%d]\n", + t->comm, t->pid, + task_thread_info(t)->cpu); + return -EINVAL; + } + sch.task = NULL; + tr = sch.tr; + sch.tr = NULL; + __raw_spin_unlock(&sch.trace_lock); + } else +#endif + tr = cpu_traces + smp_processor_id(); + + atomic_inc(&tr->disabled); + if (tr->preempt_timestamp) { + cycle_t T0, T1; + unsigned long long tmp0; + + T0 = tr->preempt_timestamp; + T1 = now(); + tmp0 = __preempt_max_latency; + if (T1 < T0) + T0 = T1; + delta = T1 - T0; + if (!report_latency(delta)) + goto out; + if (tr->critical_sequence != max_sequence || + down_trylock(&max_mutex)) + goto out; + + WARN_ON(!__preempt_thresh && __preempt_max_latency > delta); + + __preempt_max_latency = delta; + update_max_tr(tr); + + latency = cycles_to_usecs(delta); + + max_sequence++; + up(&max_mutex); +out: + tr->preempt_timestamp = 0; + } + atomic_dec(&tr->disabled); + local_irq_restore(flags); + + if (latency) { + if (__preempt_thresh) + printk("(%16s-%-5d|#%d): %lu us user-latency " + "violates %lu us threshold.\n", + current->comm, current->pid, + raw_smp_processor_id(), latency, + cycles_to_usecs(__preempt_thresh)); + else + printk("(%16s-%-5d|#%d): new %lu us user-latency.\n", + current->comm, current->pid, + raw_smp_processor_id(), latency); + } + + return 0; +} + +EXPORT_SYMBOL(user_trace_stop); + +static int trace_print_cpu = -1; + +void notrace stop_trace(void) +{ + if (trace_print_on_crash && trace_print_cpu == -1) { + trace_enabled = -1; + trace_print_cpu = raw_smp_processor_id(); + } +} + +EXPORT_SYMBOL(stop_trace); + +static void print_entry(struct trace_entry *entry, struct trace_entry *entry0) +{ + unsigned long abs_usecs; + int hardirq, softirq; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + + printk("%-5d ", entry->pid); + + printk("%d%c%c", + entry->cpu, + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED_DELAYED) ? 'n' : + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + printk("H"); + else { + if (hardirq) + printk("h"); + else { + if (softirq) + printk("s"); + else + printk("."); + } + } + + if (entry->preempt_count) + printk(":%x ", entry->preempt_count); + else + printk(":. "); + + printk("%ld.%03ldms: ", abs_usecs/1000, abs_usecs % 1000); + + switch (entry->type) { + case TRACE_FN: + printk_name(entry->u.fn.eip); + printk(" <= ("); + printk_name(entry->u.fn.parent_eip); + printk(")\n"); + break; + case TRACE_SPECIAL: + printk(" special: %lx %lx %lx\n", + entry->u.special.v1, entry->u.special.v2, + entry->u.special.v3); + break; + case TRACE_SPECIAL_U64: + printk(" spec64: %lx%08lx %lx\n", + entry->u.special.v1, entry->u.special.v2, + entry->u.special.v3); + break; + } +} + +/* + * Print the current trace at crash time. + * + * We print it backwards, so that the newest (most interesting) entries + * are printed first. + */ +void print_last_trace(void) +{ + unsigned int idx0, idx, i, cpu; + struct cpu_trace *tr; + struct trace_entry *entry0, *entry; + + preempt_disable(); + cpu = smp_processor_id(); + if (trace_enabled != -1 || trace_print_cpu != cpu || + !trace_print_on_crash) { + if (trace_print_on_crash) + printk("skipping trace printing on CPU#%d != %d\n", + cpu, trace_print_cpu); + preempt_enable(); + return; + } + + trace_print_on_crash = 0; + + tr = cpu_traces + cpu; + if (!tr->trace) + goto out; + + printk("Last %ld trace entries:\n", MAX_TRACE); + idx0 = tr->trace_idx; + printk("curr idx: %d\n", idx0); + if (idx0 >= MAX_TRACE) + idx0 = 0; + idx = idx0; + entry0 = tr->trace + idx0; + + for (i = 0; i < MAX_TRACE; i++) { + if (idx == 0) + idx = MAX_TRACE-1; + else + idx--; + entry = tr->trace + idx; + switch (entry->type) { + case TRACE_FN: + case TRACE_SPECIAL: + case TRACE_SPECIAL_U64: + print_entry(entry, entry0); + break; + } + } + printk("printed %ld entries\n", MAX_TRACE); +out: + preempt_enable(); +} + +#ifdef CONFIG_SMP +/* + * On SMP, try to 'peek' on other CPU's traces and record them + * in this CPU's trace. This way we get a rough idea about what's + * going on there, without the overhead of global tracing. + * + * (no need to make this PER_CPU, we bounce it around anyway.) + */ +unsigned long nmi_eips[NR_CPUS]; +unsigned long nmi_flags[NR_CPUS]; + +void notrace nmi_trace(unsigned long eip, unsigned long parent_eip, + unsigned long flags) +{ + int cpu, this_cpu = smp_processor_id(); + + __trace(eip, parent_eip); + + nmi_eips[this_cpu] = parent_eip; + nmi_flags[this_cpu] = flags; + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (cpu_online(cpu) && cpu != this_cpu) { + __trace(eip, nmi_eips[cpu]); + __trace(eip, nmi_flags[cpu]); + } +} +#else +/* + * On UP, NMI tracing is quite simple: + */ +void notrace nmi_trace(unsigned long eip, unsigned long parent_eip, + unsigned long flags) +{ + __trace(eip, parent_eip); +} +#endif + +#endif + +#ifdef CONFIG_PREEMPT_TRACE + +static void print_preempt_trace(struct task_struct *task) +{ + unsigned int count = task_thread_info(task)->preempt_count; + unsigned int i, lim = count & PREEMPT_MASK; + if (lim >= MAX_PREEMPT_TRACE) + lim = MAX_PREEMPT_TRACE-1; + printk("---------------------------\n"); + printk("| preempt count: %08x ]\n", count); + printk("| %d-level deep critical section nesting:\n", lim); + printk("----------------------------------------\n"); + for (i = 1; i <= lim; i++) { + printk(".. [<%08lx>] .... ", task->preempt_trace_eip[i]); + print_symbol("%s\n", task->preempt_trace_eip[i]); + printk(".....[<%08lx>] .. ( <= ", + task->preempt_trace_parent_eip[i]); + print_symbol("%s)\n", task->preempt_trace_parent_eip[i]); + } + printk("\n"); +} + +#endif + +#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_EVENT_TRACE) +void print_traces(struct task_struct *task) +{ + if (!task) + task = current; + +#ifdef CONFIG_PREEMPT_TRACE + print_preempt_trace(task); +#endif +#ifdef CONFIG_EVENT_TRACE + print_last_trace(); +#endif +} +#endif + +#ifdef CONFIG_EVENT_TRACE +#ifndef ARCH_LOW_ADDRESS_LIMIT +#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL +#endif +void * __init tracer_alloc_bootmem(unsigned long size) +{ + void * ret; + + ret =__alloc_bootmem_nopanic(size, SMP_CACHE_BYTES, + ARCH_LOW_ADDRESS_LIMIT); + if (ret != NULL && ((unsigned long)ret) < ARCH_LOW_ADDRESS_LIMIT) { + free_bootmem(__pa(ret), size); + ret = __alloc_bootmem_nopanic(size, + SMP_CACHE_BYTES, + __pa(MAX_DMA_ADDRESS)); + } + + return ret; +} + +/* + * Allocate all the per-CPU trace buffers and the + * save-maximum/save-output staging buffers: + */ +void __init init_tracer(void) +{ + unsigned long size, total_size = 0; + struct trace_entry *array; + struct cpu_trace *tr; + int cpu; + + printk("num_possible_cpus(): %d\n", num_possible_cpus()); + + size = sizeof(struct trace_entry)*MAX_TRACE; + + for_each_possible_cpu(cpu) { + tr = cpu_traces + cpu; + array = tracer_alloc_bootmem(size); + if (!array) { + printk(KERN_ERR + "CPU#%d: failed to allocate %ld bytes trace buffer!\n", + cpu, size); + } else { + printk(KERN_INFO + "CPU#%d: allocated %ld bytes trace buffer.\n", + cpu, size); + total_size += size; + } + tr->cpu = cpu; + tr->trace = array; + + array = tracer_alloc_bootmem(size); + if (!array) { + printk(KERN_ERR + "CPU#%d: failed to allocate %ld bytes max-trace buffer!\n", + cpu, size); + } else { + printk(KERN_INFO + "CPU#%d: allocated %ld bytes max-trace buffer.\n", + cpu, size); + total_size += size; + } + max_tr.traces[cpu].trace = array; + } + + /* + * The output trace buffer is a special one that only has + * trace entries for the first cpu-trace structure: + */ + size = sizeof(struct trace_entry)*MAX_TRACE*num_possible_cpus(); + array = tracer_alloc_bootmem(size); + if (!array) { + printk(KERN_ERR + "failed to allocate %ld bytes out-trace buffer!\n", + size); + } else { + printk(KERN_INFO "allocated %ld bytes out-trace buffer.\n", + size); + total_size += size; + } + out_tr.traces[0].trace = array; + printk(KERN_INFO + "tracer: a total of %ld bytes allocated.\n", + total_size); +} +#endif + +#ifdef CONFIG_LATENCY_TIMING + +int proc_preempt_max_latency(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos) +{ + if (!write) + preempt_max_latency = cycles_to_us(__preempt_max_latency); + + proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + + if (write) + __preempt_max_latency = usecs_to_cycles(preempt_max_latency); + + return 0; +} + +int proc_preempt_threshold(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos) +{ + if (!write) + preempt_thresh = cycles_to_us(__preempt_thresh); + + proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + + if (write) + __preempt_thresh = usecs_to_cycles(preempt_thresh); + + return 0; +} +#endif Index: linux-2.6.24-rc8-rt1/kernel/panic.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/panic.c 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/panic.c 2008-01-16 22:29:06.000000000 -0500 @@ -27,7 +27,38 @@ static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); -int panic_timeout; +/* + * Debugging helper: freeze all console output after printing the + * first oops's head (or tail): + */ +static int pause_on_oops_head_flag = 0; +static int pause_on_oops_tail_flag = 0; + +static void pause_on_oops_loop(int flag) +{ + switch (flag) { + default: + break; + case 1: + for (;;) + local_irq_disable(); + case 2: + for (;;) + local_irq_enable(); + } +} + +void pause_on_oops_head(void) +{ + pause_on_oops_loop(pause_on_oops_head_flag); +} + +void pause_on_oops_tail(void) +{ + pause_on_oops_loop(pause_on_oops_tail_flag); +} + +int panic_timeout __read_mostly; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -67,6 +98,8 @@ NORET_TYPE void panic(const char * fmt, unsigned long caller = (unsigned long) __builtin_return_address(0); #endif + stop_trace(); + /* * It's possible to come here directly from a panic-assertion and not * have preempt disabled. Some functions called from here want @@ -79,6 +112,7 @@ NORET_TYPE void panic(const char * fmt, vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); + dump_stack(); bust_spinlocks(0); /* @@ -94,7 +128,7 @@ NORET_TYPE void panic(const char * fmt, * unfortunately means it may not be hardened to work in a panic * situation. */ - smp_send_stop(); +// smp_send_stop(); #endif atomic_notifier_call_chain(&panic_notifier_list, 0, buf); @@ -189,6 +223,22 @@ static int __init pause_on_oops_setup(ch } __setup("pause_on_oops=", pause_on_oops_setup); +static int __init pause_on_oops_head_setup(char *str) +{ + pause_on_oops_head_flag = simple_strtoul(str, NULL, 0); + printk(KERN_INFO "pause_on_oops_head: %d\n", pause_on_oops_head_flag); + return 1; +} +__setup("pause_on_oops_head=", pause_on_oops_head_setup); + +static int __init pause_on_oops_tail_setup(char *str) +{ + pause_on_oops_tail_flag = simple_strtoul(str, NULL, 0); + printk(KERN_INFO "pause_on_oops_tail: %d\n", pause_on_oops_tail_flag); + return 1; +} +__setup("pause_on_oops_tail=", pause_on_oops_tail_setup); + static void spin_msec(int msecs) { int i; Index: linux-2.6.24-rc8-rt1/kernel/printk.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/printk.c 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/printk.c 2008-01-16 22:29:10.000000000 -0500 @@ -84,7 +84,7 @@ static int console_locked, console_suspe * It is also used in interesting ways to provide interlocking in * release_console_sem(). */ -static DEFINE_SPINLOCK(logbuf_lock); +static DEFINE_RAW_SPINLOCK(logbuf_lock); #define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) @@ -433,12 +433,33 @@ static void __call_console_drivers(unsig { struct console *con; + touch_critical_timing(); for (con = console_drivers; con; con = con->next) { if ((con->flags & CON_ENABLED) && con->write && - (cpu_online(smp_processor_id()) || - (con->flags & CON_ANYTIME))) + (cpu_online(raw_smp_processor_id()) || + (con->flags & CON_ANYTIME))) { + /* + * Disable tracing of printk details - it just + * clobbers the trace output with lots of + * repetitive lines (especially if console is + * on a serial line): + */ +#ifdef CONFIG_EVENT_TRACE + int trace_save = trace_enabled; + + trace_enabled = 0; + set_printk_might_sleep(1); + con->write(con, &LOG_BUF(start), end - start); + set_printk_might_sleep(0); + trace_enabled = trace_save; +#else + set_printk_might_sleep(1); con->write(con, &LOG_BUF(start), end - start); + set_printk_might_sleep(0); +#endif + } } + touch_critical_timing(); } static int __read_mostly ignore_loglevel; @@ -551,6 +572,7 @@ static void zap_locks(void) spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ init_MUTEX(&console_sem); + zap_rt_locks(); } #if defined(CONFIG_PRINTK_TIME) @@ -649,6 +671,7 @@ asmlinkage int vprintk(const char *fmt, lockdep_off(); spin_lock(&logbuf_lock); printk_cpu = smp_processor_id(); + preempt_enable(); /* Emit the output into the temporary buffer */ printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); @@ -718,6 +741,8 @@ asmlinkage int vprintk(const char *fmt, console_locked = 1; printk_cpu = UINT_MAX; spin_unlock(&logbuf_lock); + lockdep_on(); + local_irq_restore(flags); /* * Console drivers may assume that per-cpu resources have @@ -725,7 +750,7 @@ asmlinkage int vprintk(const char *fmt, * being able to cope (CON_ANYTIME) don't call them until * this CPU is officially up. */ - if (cpu_online(smp_processor_id()) || have_callable_console()) { + if (cpu_online(raw_smp_processor_id()) || have_callable_console()) { console_may_schedule = 0; release_console_sem(); } else { @@ -733,8 +758,6 @@ asmlinkage int vprintk(const char *fmt, console_locked = 0; up(&console_sem); } - lockdep_on(); - raw_local_irq_restore(flags); } else { /* * Someone else owns the drivers. We drop the spinlock, which @@ -747,7 +770,6 @@ asmlinkage int vprintk(const char *fmt, raw_local_irq_restore(flags); } - preempt_enable(); return printed_len; } EXPORT_SYMBOL(printk); @@ -971,13 +993,33 @@ void release_console_sem(void) _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ + /* + * on PREEMPT_RT, call console drivers with + * interrupts enabled (if printk was called + * with interrupts disabled): + */ +#ifdef CONFIG_PREEMPT_RT + spin_unlock_irqrestore(&logbuf_lock, flags); +#else spin_unlock(&logbuf_lock); +#endif call_console_drivers(_con_start, _log_end); +#ifndef CONFIG_PREEMPT_RT local_irq_restore(flags); +#endif } console_locked = 0; - up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); + up(&console_sem); + /* + * On PREEMPT_RT kernels __wake_up may sleep, so wake syslogd + * up only if we are in a preemptible section. We normally dont + * printk from non-preemptible sections so this is for the emergency + * case only. + */ +#ifdef CONFIG_PREEMPT_RT + if (!in_atomic() && !irqs_disabled()) +#endif if (wake_klogd) wake_up_klogd(); } @@ -1244,7 +1286,7 @@ void tty_write_message(struct tty_struct */ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) { - static DEFINE_SPINLOCK(ratelimit_lock); + static DEFINE_RAW_SPINLOCK(ratelimit_lock); static unsigned long toks = 10 * 5 * HZ; static unsigned long last_msg; static int missed; @@ -1285,6 +1327,23 @@ int printk_ratelimit(void) } EXPORT_SYMBOL(printk_ratelimit); +static DEFINE_RAW_SPINLOCK(warn_lock); + +void __WARN_ON(const char *func, const char *file, const int line) +{ + unsigned long flags; + + spin_lock_irqsave(&warn_lock, flags); + printk("%s/%d[CPU#%d]: BUG in %s at %s:%d\n", + current->comm, current->pid, raw_smp_processor_id(), + func, file, line); + dump_stack(); + spin_unlock_irqrestore(&warn_lock, flags); +} + +EXPORT_SYMBOL(__WARN_ON); + + /** * printk_timed_ratelimit - caller-controlled printk ratelimiting * @caller_jiffies: pointer to caller's state Index: linux-2.6.24-rc8-rt1/lib/debug_locks.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/lib/debug_locks.c 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/lib/debug_locks.c 2008-01-16 22:28:11.000000000 -0500 @@ -10,6 +10,7 @@ */ #include #include +#include #include #include #include @@ -36,7 +37,14 @@ int debug_locks_silent; int debug_locks_off(void) { if (xchg(&debug_locks, 0)) { +#ifdef CONFIG_DEBUG_RT_MUTEXES + if (spin_is_locked(¤t->pi_lock)) + spin_unlock(¤t->pi_lock); +#endif if (!debug_locks_silent) { + stop_trace(); + user_trace_stop(); + printk("stopped custom tracer.\n"); console_verbose(); return 1; } Index: linux-2.6.24-rc8-rt1/scripts/Makefile =================================================================== --- linux-2.6.24-rc8-rt1.orig/scripts/Makefile 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/scripts/Makefile 2008-01-16 22:29:12.000000000 -0500 @@ -7,11 +7,18 @@ # conmakehash: Create chartable # conmakehash: Create arrays for initializing the kernel console tables +hostprogs-$(CONFIG_EVENT_TRACE) += trace-it hostprogs-$(CONFIG_KALLSYMS) += kallsyms hostprogs-$(CONFIG_LOGO) += pnmtologo hostprogs-$(CONFIG_VT) += conmakehash hostprogs-$(CONFIG_PROM_CONSOLE) += conmakehash hostprogs-$(CONFIG_IKCONFIG) += bin2c +HOST_OS := $(shell uname) +ifeq ($(HOST_OS),Linux) +ifdef CONFIG_LPPTEST +hostprogs-y += testlpp +endif +endif always := $(hostprogs-y) $(hostprogs-m) Index: linux-2.6.24-rc8-rt1/scripts/trace-it.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/scripts/trace-it.c 2008-01-16 22:28:11.000000000 -0500 @@ -0,0 +1,79 @@ + +/* + * Copyright (C) 2005, Ingo Molnar + * + * user-triggered tracing. + * + * The -rt kernel has a built-in kernel tracer, which will trace + * all kernel function calls (and a couple of special events as well), + * by using a build-time gcc feature that instruments all kernel + * functions. + * + * The tracer is highly automated for a number of latency tracing purposes, + * but it can also be switched into 'user-triggered' mode, which is a + * half-automatic tracing mode where userspace apps start and stop the + * tracer. This file shows a dumb example how to turn user-triggered + * tracing on, and how to start/stop tracing. Note that if you do + * multiple start/stop sequences, the kernel will do a maximum search + * over their latencies, and will keep the trace of the largest latency + * in /proc/latency_trace. The maximums are also reported to the kernel + * log. (but can also be read from /proc/sys/kernel/preempt_max_latency) + * + * For the tracer to be activated, turn on CONFIG_EVENT_TRACING + * in the .config, rebuild the kernel and boot into it. The trace will + * get _alot_ more verbose if you also turn on CONFIG_FUNCTION_TRACING, + * every kernel function call will be put into the trace. Note that + * CONFIG_FUNCTION_TRACING has significant runtime overhead, so you dont + * want to use it for performance testing :) + */ + +#include +#include +#include +#include +#include +#include +#include + +int main (int argc, char **argv) +{ + int ret; + + if (getuid() != 0) { + fprintf(stderr, "needs to run as root.\n"); + exit(1); + } + ret = system("cat /proc/sys/kernel/mcount_enabled >/dev/null 2>/dev/null"); + if (ret) { + fprintf(stderr, "CONFIG_LATENCY_TRACING not enabled?\n"); + exit(1); + } + system("echo 1 > /proc/sys/kernel/trace_user_triggered"); + system("[ -e /proc/sys/kernel/wakeup_timing ] && echo 0 > /proc/sys/kernel/wakeup_timing"); + system("echo 1 > /proc/sys/kernel/trace_enabled"); + system("echo 1 > /proc/sys/kernel/mcount_enabled"); + system("echo 0 > /proc/sys/kernel/trace_freerunning"); + system("echo 0 > /proc/sys/kernel/trace_print_on_crash"); + system("echo 0 > /proc/sys/kernel/trace_verbose"); + system("echo 0 > /proc/sys/kernel/preempt_thresh 2>/dev/null"); + system("echo 0 > /proc/sys/kernel/preempt_max_latency 2>/dev/null"); + + // start tracing + if (prctl(0, 1)) { + fprintf(stderr, "trace-it: couldnt start tracing!\n"); + return 1; + } + usleep(10000000); + if (prctl(0, 0)) { + fprintf(stderr, "trace-it: couldnt stop tracing!\n"); + return 1; + } + + system("echo 0 > /proc/sys/kernel/trace_user_triggered"); + system("echo 0 > /proc/sys/kernel/trace_enabled"); + system("cat /proc/latency_trace"); + + return 0; +} + + Index: linux-2.6.24-rc8-rt1/arch/x86/boot/compressed/Makefile_32 =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/boot/compressed/Makefile_32 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/boot/compressed/Makefile_32 2008-01-16 22:28:12.000000000 -0500 @@ -11,7 +11,7 @@ EXTRA_AFLAGS := -traditional LDFLAGS_vmlinux := -T hostprogs-y := relocs -KBUILD_CFLAGS := -m32 -D__KERNEL__ $(LINUX_INCLUDE) -O2 \ +KBUILD_CFLAGS := -m32 -D__KERNEL__ $(LINUX_INCLUDE) -Iinclude -O2 \ -fno-strict-aliasing -fPIC \ $(call cc-option,-ffreestanding) \ $(call cc-option,-fno-stack-protector) Index: linux-2.6.24-rc8-rt1/include/asm-x86/irq_32.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/irq_32.h 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/irq_32.h 2008-01-16 22:28:12.000000000 -0500 @@ -41,7 +41,7 @@ extern int irqbalance_disable(char *str) extern void fixup_irqs(cpumask_t map); #endif -unsigned int do_IRQ(struct pt_regs *regs); +extern fastcall notrace unsigned int do_IRQ(struct pt_regs *regs); void init_IRQ(void); void __init native_init_IRQ(void); Index: linux-2.6.24-rc8-rt1/include/asm-x86/processor_32.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/processor_32.h 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/processor_32.h 2008-01-16 22:28:12.000000000 -0500 @@ -134,7 +134,7 @@ extern void detect_ht(struct cpuinfo_x86 static inline void detect_ht(struct cpuinfo_x86 *c) {} #endif -static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, +static inline void fastcall native_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. */ Index: linux-2.6.24-rc8-rt1/arch/x86/ia32/ia32entry.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/ia32/ia32entry.S 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/ia32/ia32entry.S 2008-01-16 22:28:12.000000000 -0500 @@ -132,7 +132,9 @@ sysenter_do_call: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -237,7 +239,9 @@ cstar_do_call: cmpl $IA32_NR_syscalls-1,%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -328,8 +332,10 @@ ia32_do_syscall: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) jmp int_ret_from_sys_call @@ -400,7 +406,7 @@ END(ia32_ptregs_common) .section .rodata,"a" .align 8 -ia32_sys_call_table: +ENTRY(ia32_sys_call_table) .quad sys_restart_syscall .quad sys_exit .quad stub32_fork @@ -726,4 +732,7 @@ ia32_sys_call_table: .quad compat_sys_timerfd .quad sys_eventfd .quad sys32_fallocate +#ifdef CONFIG_EVENT_TRACE + .globl ia32_syscall_end +#endif ia32_syscall_end: Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/traps_64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/traps_64.c 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/traps_64.c 2008-01-16 22:29:24.000000000 -0500 @@ -80,20 +80,22 @@ static inline void conditional_sti(struc local_irq_enable(); } -static inline void preempt_conditional_sti(struct pt_regs *regs) +static inline void preempt_conditional_sti(struct pt_regs *regs, int stack) { - preempt_disable(); + if (stack) + preempt_disable(); if (regs->eflags & X86_EFLAGS_IF) local_irq_enable(); } -static inline void preempt_conditional_cli(struct pt_regs *regs) +static inline void preempt_conditional_cli(struct pt_regs *regs, int stack) { if (regs->eflags & X86_EFLAGS_IF) local_irq_disable(); /* Make sure to not schedule here because we could be running on an exception stack. */ - preempt_enable_no_resched(); + if (stack) + preempt_enable_no_resched(); } int kstack_depth_to_print = 12; @@ -129,10 +131,14 @@ static unsigned long *in_exception_stack unsigned *usedp, char **idp) { static char ids[][8] = { +#if DEBUG_STACK > 0 [DEBUG_STACK - 1] = "#DB", +#endif [NMI_STACK - 1] = "NMI", [DOUBLEFAULT_STACK - 1] = "#DF", +#if STACKFAULT_STACK > 0 [STACKFAULT_STACK - 1] = "#SS", +#endif [MCE_STACK - 1] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" @@ -218,7 +224,7 @@ void dump_trace(struct task_struct *tsk, unsigned long *stack, const struct stacktrace_ops *ops, void *data) { - const unsigned cpu = get_cpu(); + const unsigned cpu = raw_smp_processor_id(); unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; unsigned used = 0; struct thread_info *tinfo; @@ -309,7 +315,6 @@ void dump_trace(struct task_struct *tsk, tinfo = task_thread_info(tsk); HANDLE_STACK (valid_stack_ptr(tinfo, stack)); #undef HANDLE_STACK - put_cpu(); } EXPORT_SYMBOL(dump_trace); @@ -347,9 +352,13 @@ static const struct stacktrace_ops print void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) { + pause_on_oops_head(); printk("\nCall Trace:\n"); dump_trace(tsk, regs, stack, &print_trace_ops, NULL); printk("\n"); + pause_on_oops_tail(); + debug_show_held_locks(tsk); + print_traces(tsk); } static void @@ -357,7 +366,7 @@ _show_stack(struct task_struct *tsk, str { unsigned long *stack; int i; - const int cpu = smp_processor_id(); + const int cpu = raw_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); @@ -469,7 +478,7 @@ void out_of_line_bug(void) EXPORT_SYMBOL(out_of_line_bug); #endif -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static raw_spinlock_t die_lock = RAW_SPIN_LOCK_UNLOCKED(die_lock); static int die_owner = -1; static unsigned int die_nest_count; @@ -483,11 +492,11 @@ unsigned __kprobes long oops_begin(void) /* racy, but better than risking deadlock. */ raw_local_irq_save(flags); cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { + if (!spin_trylock(&die_lock)) { if (cpu == die_owner) /* nested oops. should stop eventually */; else - __raw_spin_lock(&die_lock); + spin_lock(&die_lock); } die_nest_count++; die_owner = cpu; @@ -503,7 +512,7 @@ void __kprobes oops_end(unsigned long fl die_nest_count--; if (!die_nest_count) /* Nest count reaches zero, release the lock. */ - __raw_spin_unlock(&die_lock); + spin_unlock(&die_lock); raw_local_irq_restore(flags); if (panic_on_oops) panic("Fatal exception"); @@ -659,9 +668,9 @@ asmlinkage void do_stack_segment(struct if (notify_die(DIE_TRAP, "stack segment", regs, error_code, 12, SIGBUS) == NOTIFY_STOP) return; - preempt_conditional_sti(regs); + preempt_conditional_sti(regs, STACKFAULT_STACK); do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, STACKFAULT_STACK); } asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) @@ -819,9 +828,9 @@ asmlinkage void __kprobes do_int3(struct if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { return; } - preempt_conditional_sti(regs); + preempt_conditional_sti(regs, DEBUG_STACK); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, DEBUG_STACK); } /* Help handler running on IST stack to switch back to user stack @@ -861,7 +870,7 @@ asmlinkage void __kprobes do_debug(struc SIGTRAP) == NOTIFY_STOP) return; - preempt_conditional_sti(regs); + preempt_conditional_sti(regs, DEBUG_STACK); /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { @@ -906,13 +915,13 @@ asmlinkage void __kprobes do_debug(struc clear_dr7: set_debugreg(0UL, 7); - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, DEBUG_STACK); return; clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->eflags &= ~TF_MASK; - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, DEBUG_STACK); } static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) Index: linux-2.6.24-rc8-rt1/include/asm-x86/calling.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/calling.h 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/calling.h 2008-01-16 22:28:12.000000000 -0500 @@ -160,3 +160,53 @@ .macro icebp .byte 0xf1 .endm + +/* + * latency-tracing helpers: + */ + + .macro TRACE_SYS_CALL + +#ifdef CONFIG_EVENT_TRACE + SAVE_ARGS + + mov %rdx, %rcx + mov %rsi, %rdx + mov %rdi, %rsi + mov %rax, %rdi + + call sys_call + + RESTORE_ARGS +#endif + .endm + + + .macro TRACE_SYS_IA32_CALL + +#ifdef CONFIG_EVENT_TRACE + SAVE_ARGS + + mov %rdx, %rcx + mov %rsi, %rdx + mov %rdi, %rsi + mov %rax, %rdi + + call sys_ia32_call + + RESTORE_ARGS +#endif + .endm + + .macro TRACE_SYS_RET + +#ifdef CONFIG_EVENT_TRACE + SAVE_ARGS + + mov %rax, %rdi + + call sys_ret + + RESTORE_ARGS +#endif + .endm Index: linux-2.6.24-rc8-rt1/include/asm-x86/unistd_64.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/unistd_64.h 2008-01-16 22:27:45.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/unistd_64.h 2008-01-16 22:28:12.000000000 -0500 @@ -11,6 +11,8 @@ * Note: holes are not allowed. */ +#define NR_syscalls (__NR_syscall_max+1) + /* at least 8 syscall per cacheline */ #define __NR_read 0 __SYSCALL(__NR_read, sys_read) Index: linux-2.6.24-rc8-rt1/arch/ppc/boot/Makefile =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/boot/Makefile 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/boot/Makefile 2008-01-16 22:28:13.000000000 -0500 @@ -15,6 +15,15 @@ # KBUILD_CFLAGS used when building rest of boot (takes effect recursively) KBUILD_CFLAGS += -fno-builtin -D__BOOTER__ -Iarch/$(ARCH)/boot/include + +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +KBUILD_CFLAGS := $(subst ${pg_flag},${space},${KBUILD_CFLAGS}) +endif + HOSTCFLAGS += -Iarch/$(ARCH)/boot/include BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd Index: linux-2.6.24-rc8-rt1/arch/arm/boot/compressed/head.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/boot/compressed/head.S 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/boot/compressed/head.S 2008-01-16 22:28:13.000000000 -0500 @@ -928,6 +928,19 @@ memdump: mov r12, r0 #endif .ltorg +#ifdef CONFIG_MCOUNT +/* CONFIG_MCOUNT causes boot header to be built with -pg requiring this + * trampoline + */ + .text + .align 0 + .type mcount %function + .global mcount +mcount: + mov pc, lr @ just return +#endif + + reloc_end: .align Index: linux-2.6.24-rc8-rt1/arch/arm/kernel/entry-common.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/kernel/entry-common.S 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/kernel/entry-common.S 2008-01-16 22:28:33.000000000 -0500 @@ -3,6 +3,8 @@ * * Copyright (C) 2000 Russell King * + * FUNCTION_TRACE/mcount support (C) 2005 Timesys john.cooper@timesys.com + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -44,7 +46,7 @@ ret_fast_syscall: fast_work_pending: str r0, [sp, #S_R0+S_OFF]! @ returned r0 work_pending: - tst r1, #_TIF_NEED_RESCHED + tst r1, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED bne work_resched tst r1, #_TIF_SIGPENDING beq no_work_pending @@ -54,7 +56,8 @@ work_pending: b ret_slow_syscall @ Check work again work_resched: - bl schedule + bl __schedule + /* * "slow" syscall return path. "why" tells us if this was a real syscall. */ @@ -394,6 +397,114 @@ ENTRY(sys_oabi_call_table) #include "calls.S" #undef ABI #undef OBSOLETE +#endif + +#ifdef CONFIG_FRAME_POINTER + +#ifdef CONFIG_MCOUNT +/* + * At the point where we are in mcount() we maintain the + * frame of the prologue code and keep the call to mcount() + * out of the stack frame list: + + saved pc <---\ caller of instrumented routine + saved lr | + ip/prev_sp | + fp -----^ | + : | + | + -> saved pc | instrumented routine + | saved lr | + | ip/prev_sp | + | fp ---------/ + | : + | + | mcount + | saved pc + | saved lr + | ip/prev sp + -- fp + r3 + r2 + r1 + sp-> r0 + : + */ + + .text + .align 0 + .type mcount %function + .global mcount + +/* gcc -pg generated FUNCTION_PROLOGUE references mcount() + * and has already created the stack frame invocation for + * the routine we have been called to instrument. We create + * a complete frame nevertheless, as we want to use the same + * call to mcount() from c code. + */ +mcount: + + ldr ip, =mcount_enabled @ leave early, if disabled + ldr ip, [ip] + cmp ip, #0 + moveq pc, lr + + mov ip, sp + stmdb sp!, {r0 - r3, fp, ip, lr, pc} @ create stack frame + + mov r2, =mcount_trace_function + + ldr r1, [fp, #-4] @ get lr (the return address + @ of the caller of the + @ instrumented function) + mov r0, lr @ get lr - (the return address + @ of the instrumented function) + + sub fp, ip, #4 @ point fp at this frame + + bl r2 +1: + ldmdb fp, {r0 - r3, fp, sp, pc} @ pop entry frame and return + +#endif + +/* ARM replacement for unsupported gcc __builtin_return_address(n) + * where 0 < n. n == 0 is supported here as well. + * + * Walk up the stack frame until the desired frame is found or a NULL + * fp is encountered, return NULL in the latter case. + * + * Note: it is possible under code optimization for the stack invocation + * of an ancestor function (level N) to be removed before calling a + * descendant function (level N+1). No easy means is available to deduce + * this scenario with the result being [for example] caller_addr(0) when + * called from level N+1 returning level N-1 rather than the expected + * level N. This optimization issue appears isolated to the case of + * a call to a level N+1 routine made at the tail end of a level N + * routine -- the level N frame is deleted and a simple branch is made + * to the level N+1 routine. + */ + + .text + .align 0 + .type arm_return_addr %function + .global arm_return_addr + +arm_return_addr: + mov ip, r0 + mov r0, fp +3: + cmp r0, #0 + beq 1f @ frame list hit end, bail + cmp ip, #0 + beq 2f @ reached desired frame + ldr r0, [r0, #-12] @ else continue, get next fp + sub ip, ip, #1 + b 3b +2: + ldr r0, [r0, #-4] @ get target return address +1: + mov pc, lr #endif Index: linux-2.6.24-rc8-rt1/arch/arm/kernel/fiq.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/kernel/fiq.c 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/kernel/fiq.c 2008-01-16 22:28:13.000000000 -0500 @@ -89,7 +89,7 @@ void set_fiq_handler(void *start, unsign * disable irqs for the duration. Note - these functions are almost * entirely coded in assembly. */ -void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( @@ -107,7 +107,7 @@ void __attribute__((naked)) set_fiq_regs : "r" (®s->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE)); } -void __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( Index: linux-2.6.24-rc8-rt1/arch/arm/kernel/irq.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/kernel/irq.c 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/kernel/irq.c 2008-01-16 22:28:39.000000000 -0500 @@ -100,7 +100,7 @@ unlock: /* Handle bad interrupts */ static struct irq_desc bad_irq_desc = { .handle_irq = handle_bad_irq, - .lock = SPIN_LOCK_UNLOCKED + .lock = RAW_SPIN_LOCK_UNLOCKED(bad_irq_desc.lock) }; /* @@ -108,11 +108,13 @@ static struct irq_desc bad_irq_desc = { * come via this function. Instead, they should provide their * own 'handler' */ -asmlinkage void __exception asm_do_IRQ(unsigned int irq, struct pt_regs *regs) +asmlinkage void __exception notrace asm_do_IRQ(unsigned int irq, struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc = irq_desc + irq; + trace_special(instruction_pointer(regs), irq, 0); + /* * Some hardware gives randomly wrong interrupts. Rather * than crashing, do something sensible. Index: linux-2.6.24-rc8-rt1/arch/arm/kernel/traps.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/kernel/traps.c 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/kernel/traps.c 2008-01-16 22:28:39.000000000 -0500 @@ -233,7 +233,7 @@ static void __die(const char *str, int e } } -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); /* * This function is protected against re-entrancy. @@ -276,7 +276,7 @@ void arm_notify_die(const char *str, str } static LIST_HEAD(undef_hook); -static DEFINE_SPINLOCK(undef_lock); +static DEFINE_RAW_SPINLOCK(undef_lock); void register_undef_hook(struct undef_hook *hook) { @@ -354,6 +354,7 @@ asmlinkage void do_unexp_fiq (struct pt_ { printk("Hmm. Unexpected FIQ received, but trying to continue\n"); printk("You may have a hardware problem...\n"); + print_traces(current); } /* Index: linux-2.6.24-rc8-rt1/arch/arm/mm/copypage-v4mc.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/mm/copypage-v4mc.c 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/mm/copypage-v4mc.c 2008-01-16 22:28:39.000000000 -0500 @@ -30,7 +30,7 @@ #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \ L_PTE_CACHEABLE) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * ARMv4 mini-dcache optimised copy_user_page @@ -44,7 +44,7 @@ static DEFINE_SPINLOCK(minicache_lock); * instruction. If your processor does not supply this, you have to write your * own copy_user_page that does the right thing. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { asm volatile( @@ -88,7 +88,7 @@ void v4_mc_copy_user_page(void *kto, con /* * ARMv4 optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) v4_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux-2.6.24-rc8-rt1/arch/arm/mm/copypage-xscale.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/mm/copypage-xscale.c 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/mm/copypage-xscale.c 2008-01-16 22:28:39.000000000 -0500 @@ -32,7 +32,7 @@ #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \ L_PTE_CACHEABLE) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * XScale mini-dcache optimised copy_user_page @@ -42,7 +42,7 @@ static DEFINE_SPINLOCK(minicache_lock); * Dcache aliasing issue. The writes will be forwarded to the write buffer, * and merged as appropriate. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { /* @@ -110,7 +110,7 @@ void xscale_mc_copy_user_page(void *kto, /* * XScale optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) xscale_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux-2.6.24-rc8-rt1/arch/arm/mm/fault.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/mm/fault.c 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/mm/fault.c 2008-01-16 22:29:04.000000000 -0500 @@ -215,7 +215,7 @@ out: return fault; } -static int +static notrace int do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk; @@ -229,7 +229,7 @@ do_page_fault(unsigned long addr, unsign * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; /* @@ -311,7 +311,7 @@ no_context: * interrupt or a critical region, and should only copy the information * from the master page table, nothing more. */ -static int +static notrace int do_translation_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { @@ -354,7 +354,7 @@ bad_area: * Some section permission faults need to be handled gracefully. * They can happen due to a __{get,put}_user during an oops. */ -static int +static notrace int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { do_bad_area(addr, fsr, regs); @@ -364,7 +364,7 @@ do_sect_fault(unsigned long addr, unsign /* * This abort handler always returns "fault". */ -static int +static notrace int do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { return 1; @@ -419,7 +419,7 @@ static struct fsr_info { { do_bad, SIGBUS, 0, "unknown 31" } }; -void __init +void __init notrace hook_fault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *), int sig, const char *name) { @@ -433,7 +433,7 @@ hook_fault_code(int nr, int (*fn)(unsign /* * Dispatch a data abort to the relevant handler. */ -asmlinkage void __exception +asmlinkage void __exception notrace do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { const struct fsr_info *inf = fsr_info + (fsr & 15) + ((fsr & (1 << 10)) >> 6); @@ -452,7 +452,7 @@ do_DataAbort(unsigned long addr, unsigne arm_notify_die("", regs, &info, fsr, 0); } -asmlinkage void __exception +asmlinkage void __exception notrace do_PrefetchAbort(unsigned long addr, struct pt_regs *regs) { do_translation_fault(addr, 0, regs); Index: linux-2.6.24-rc8-rt1/include/asm-arm/pgalloc.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-arm/pgalloc.h 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-arm/pgalloc.h 2008-01-16 22:28:13.000000000 -0500 @@ -109,7 +109,7 @@ static inline void __pmd_populate(pmd_t * * Ensure that we always set both PMD entries. */ -static inline void +static inline void notrace pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) { unsigned long pte_ptr = (unsigned long)ptep; @@ -122,7 +122,7 @@ pmd_populate_kernel(struct mm_struct *mm __pmd_populate(pmdp, __pa(pte_ptr) | _PAGE_KERNEL_TABLE); } -static inline void +static inline void notrace pmd_populate(struct mm_struct *mm, pmd_t *pmdp, struct page *ptep) { __pmd_populate(pmdp, page_to_pfn(ptep) << PAGE_SHIFT | _PAGE_USER_TABLE); Index: linux-2.6.24-rc8-rt1/include/asm-arm/timex.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-arm/timex.h 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-arm/timex.h 2008-01-16 22:29:26.000000000 -0500 @@ -16,9 +16,17 @@ typedef unsigned long cycles_t; +#ifndef mach_read_cycles + #define mach_read_cycles() (0) +#ifdef CONFIG_EVENT_TRACE + #define mach_cycles_to_usecs(d) (d) + #define mach_usecs_to_cycles(d) (d) +#endif +#endif + static inline cycles_t get_cycles (void) { - return 0; + return mach_read_cycles(); } #endif Index: linux-2.6.24-rc8-rt1/include/asm-arm/unistd.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-arm/unistd.h 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-arm/unistd.h 2008-01-16 22:28:13.000000000 -0500 @@ -380,6 +380,10 @@ #define __NR_eventfd (__NR_SYSCALL_BASE+351) #define __NR_fallocate (__NR_SYSCALL_BASE+352) +#ifndef __ASSEMBLY__ +#define NR_syscalls (__NR_fallocate + 1 - __NR_SYSCALL_BASE) +#endif + /* * The following SWIs are ARM private. */ Index: linux-2.6.24-rc8-rt1/include/linux/prctl.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/prctl.h 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/prctl.h 2008-01-16 22:28:14.000000000 -0500 @@ -3,6 +3,7 @@ /* Values to pass as first argument to prctl() */ +#define PR_SET_TRACING 0 /* Second arg is tracing on/off */ #define PR_SET_PDEATHSIG 1 /* Second arg is a signal */ #define PR_GET_PDEATHSIG 2 /* Second arg is a ptr to return the signal */ Index: linux-2.6.24-rc8-rt1/kernel/sys.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/sys.c 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/sys.c 2008-01-16 22:29:04.000000000 -0500 @@ -32,10 +32,12 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -264,6 +266,15 @@ out_unlock: */ void emergency_restart(void) { + /* + * Call the notifier chain if we are not in an + * atomic context: + */ +#ifdef CONFIG_PREEMPT + if (!in_atomic() && !irqs_disabled()) + blocking_notifier_call_chain(&reboot_notifier_list, + SYS_RESTART, NULL); +#endif machine_emergency_restart(); } EXPORT_SYMBOL_GPL(emergency_restart); @@ -1643,6 +1654,14 @@ asmlinkage long sys_prctl(int option, un { long error; +#ifdef CONFIG_EVENT_TRACE + if (option == PR_SET_TRACING) { + if (arg2) + return user_trace_start(); + return user_trace_stop(); + } +#endif + error = security_task_prctl(option, arg2, arg3, arg4, arg5); if (error) return error; Index: linux-2.6.24-rc8-rt1/arch/x86/lib/thunk_64.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/lib/thunk_64.S 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/lib/thunk_64.S 2008-01-16 22:28:33.000000000 -0500 @@ -40,15 +40,31 @@ thunk rwsem_wake_thunk,rwsem_wake thunk rwsem_downgrade_thunk,rwsem_downgrade_wake #endif - - thunk __down_failed,__down - thunk_retrax __down_failed_interruptible,__down_interruptible - thunk_retrax __down_failed_trylock,__down_trylock - thunk __up_wakeup,__up + +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK + thunk __compat_down_failed,__compat_down + thunk_retrax __compat_down_failed_interruptible,__compat_down_interruptible + thunk_retrax __compat_down_failed_trylock,__compat_down_trylock + thunk __compat_up_wakeup,__compat_up +#endif #ifdef CONFIG_TRACE_IRQFLAGS - thunk trace_hardirqs_on_thunk,trace_hardirqs_on - thunk trace_hardirqs_off_thunk,trace_hardirqs_off + /* put return address in rdi (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + CFI_STARTPROC + SAVE_ARGS + /* SAVE_ARGS pushs 9 elements */ + /* the next element would be the rip */ + movq 9*8(%rsp), %rdi + call \func + jmp restore + CFI_ENDPROC + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/process_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/process_32.c 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/process_32.c 2008-01-16 22:29:18.000000000 -0500 @@ -113,7 +113,7 @@ void default_idle(void) smp_mb(); local_irq_disable(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) safe_halt(); /* enables interrupts racelessly */ else local_irq_enable(); @@ -134,7 +134,9 @@ EXPORT_SYMBOL(default_idle); */ static void poll_idle (void) { - cpu_relax(); + do { + cpu_relax(); + } while (!need_resched() && !need_resched_delayed()); } #ifdef CONFIG_HOTPLUG_CPU @@ -178,13 +180,12 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { tick_nohz_stop_sched_tick(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; - check_pgt_cache(); rmb(); idle = pm_idle; @@ -194,13 +195,26 @@ void cpu_idle(void) if (cpu_is_offline(cpu)) play_dead(); + /* + * We have irqs disabled here, so stop latency tracing + * at this point and restart it after we return: + */ + stop_critical_timing(); + __get_cpu_var(irq_stat).idle_timestamp = jiffies; idle(); + + touch_critical_timing(); + } + local_irq_disable(); + trace_preempt_exit_idle(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + trace_preempt_enter_idle(); + local_irq_enable(); } } @@ -257,10 +271,10 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) __mwait(eax, ecx); } } @@ -339,9 +353,10 @@ void __show_registers(struct pt_regs *re regs->eax, regs->ebx, regs->ecx, regs->edx); printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", regs->esi, regs->edi, regs->ebp, esp); - printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", + printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x" + " preempt:%08x\n", regs->xds & 0xffff, regs->xes & 0xffff, - regs->xfs & 0xffff, gs, ss); + regs->xfs & 0xffff, gs, ss, preempt_count()); if (!all) return; @@ -413,15 +428,23 @@ void exit_thread(void) if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { struct task_struct *tsk = current; struct thread_struct *t = &tsk->thread; - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + void *io_bitmap_ptr = t->io_bitmap_ptr; + int cpu; + struct tss_struct *tss; - kfree(t->io_bitmap_ptr); + /* + * On PREEMPT_RT we must not call kfree() with + * preemption disabled, so we first zap the pointer: + */ t->io_bitmap_ptr = NULL; + kfree(io_bitmap_ptr); + clear_thread_flag(TIF_IO_BITMAP); /* * Careful, clear this in the TSS too: */ + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); t->io_bitmap_max = 0; tss->io_bitmap_owner = NULL; Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/process_64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/process_64.c 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/process_64.c 2008-01-16 22:29:18.000000000 -0500 @@ -115,7 +115,7 @@ static void default_idle(void) */ smp_mb(); local_irq_disable(); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { /* Enables interrupts one instruction before HLT. x86 special cases this so there is no race. */ safe_halt(); @@ -212,7 +212,7 @@ void cpu_idle (void) current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -232,8 +232,18 @@ void cpu_idle (void) * Otherwise, idle callbacks can misfire. */ local_irq_disable(); + + /* + * We have irqs disabled here, so stop latency tracing + * at this point and restart it after we return: + */ + stop_critical_timing(); + enter_idle(); idle(); + + touch_critical_timing(); + /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. */ @@ -241,9 +251,13 @@ void cpu_idle (void) } tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + trace_preempt_exit_idle(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + trace_preempt_enter_idle(); + local_irq_enable(); } } @@ -259,10 +273,10 @@ void cpu_idle (void) */ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) __mwait(eax, ecx); } } @@ -270,12 +284,13 @@ void mwait_idle_with_hints(unsigned long /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) { + trace_hardirqs_on(); __sti_mwait(0, 0); - else + } else local_irq_enable(); } else { local_irq_enable(); @@ -391,7 +406,7 @@ void exit_thread(void) struct thread_struct *t = &me->thread; if (me->thread.io_bitmap_ptr) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss; kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; @@ -399,6 +414,7 @@ void exit_thread(void) /* * Careful, clear this in the TSS too: */ + tss = &per_cpu(init_tss, get_cpu()); memset(tss->io_bitmap, 0xff, t->io_bitmap_max); t->io_bitmap_max = 0; put_cpu(); Index: linux-2.6.24-rc8-rt1/include/linux/irqflags.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/irqflags.h 2008-01-16 22:27:44.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/irqflags.h 2008-01-16 22:28:30.000000000 -0500 @@ -11,11 +11,24 @@ #ifndef _LINUX_TRACE_IRQFLAGS_H #define _LINUX_TRACE_IRQFLAGS_H +#define BUILD_CHECK_IRQ_FLAGS(flags) \ + do { \ + BUILD_BUG_ON(sizeof(flags) != sizeof(unsigned long)); \ + typecheck(unsigned long, flags); \ + } while (0) + #ifdef CONFIG_TRACE_IRQFLAGS extern void trace_hardirqs_on(void); extern void trace_hardirqs_off(void); extern void trace_softirqs_on(unsigned long ip); extern void trace_softirqs_off(unsigned long ip); +# ifdef CONFIG_CRITICAL_PREEMPT_TIMING + extern void trace_preempt_enter_idle(void); + extern void trace_preempt_exit_idle(void); +# else +# define trace_preempt_enter_idle() do { } while (0) +# define trace_preempt_exit_idle() do { } while (0) +# endif # define trace_hardirq_context(p) ((p)->hardirq_context) # define trace_softirq_context(p) ((p)->softirq_context) # define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled) @@ -26,6 +39,8 @@ # define trace_softirq_exit() do { current->softirq_context--; } while (0) # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, #else +# define trace_preempt_enter_idle() do { } while (0) +# define trace_preempt_exit_idle() do { } while (0) # define trace_hardirqs_on() do { } while (0) # define trace_hardirqs_off() do { } while (0) # define trace_softirqs_on(ip) do { } while (0) @@ -50,10 +65,15 @@ #define local_irq_disable() \ do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0) #define local_irq_save(flags) \ - do { raw_local_irq_save(flags); trace_hardirqs_off(); } while (0) + do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + raw_local_irq_save(flags); \ + trace_hardirqs_off(); \ + } while (0) #define local_irq_restore(flags) \ do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ if (raw_irqs_disabled_flags(flags)) { \ raw_local_irq_restore(flags); \ trace_hardirqs_off(); \ @@ -69,8 +89,16 @@ */ # define raw_local_irq_disable() local_irq_disable() # define raw_local_irq_enable() local_irq_enable() -# define raw_local_irq_save(flags) local_irq_save(flags) -# define raw_local_irq_restore(flags) local_irq_restore(flags) +# define raw_local_irq_save(flags) \ + do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + local_irq_save(flags); \ + } while (0) +# define raw_local_irq_restore(flags) \ + do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + local_irq_restore(flags); \ + } while (0) #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */ #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT @@ -80,7 +108,11 @@ raw_safe_halt(); \ } while (0) -#define local_save_flags(flags) raw_local_save_flags(flags) +#define local_save_flags(flags) \ + do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + raw_local_save_flags(flags); \ + } while (0) #define irqs_disabled() \ ({ \ @@ -90,7 +122,11 @@ raw_irqs_disabled_flags(flags); \ }) -#define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags) +#define irqs_disabled_flags(flags) \ +({ \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + raw_irqs_disabled_flags(flags); \ +}) #endif /* CONFIG_X86 */ #endif Index: linux-2.6.24-rc8-rt1/drivers/acpi/processor_idle.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/acpi/processor_idle.c 2008-01-16 22:27:43.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/acpi/processor_idle.c 2008-01-16 22:28:50.000000000 -0500 @@ -209,7 +209,7 @@ static void acpi_safe_halt(void) * test NEED_RESCHED: */ smp_mb(); - if (!need_resched()) + if (!need_resched() || !need_resched_delayed()) safe_halt(); current_thread_info()->status |= TS_POLLING; } @@ -1347,6 +1347,12 @@ static inline void acpi_idle_update_bm_r */ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) { + /* + * We have irqs disabled here, so stop latency tracing + * at this point and restart it after we return: + */ + stop_critical_timing(); + if (cx->space_id == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -1359,6 +1365,8 @@ static inline void acpi_idle_do_entry(st gets asserted in time to freeze execution properly. */ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } + + touch_critical_timing(); } /** @@ -1417,7 +1425,7 @@ static int acpi_idle_enter_simple(struct */ smp_mb(); - if (unlikely(need_resched())) { + if (unlikely(need_resched() || need_resched_delayed())) { current_thread_info()->status |= TS_POLLING; local_irq_enable(); return 0; @@ -1461,7 +1469,7 @@ static int acpi_idle_enter_simple(struct } static int c3_cpu_count; -static DEFINE_SPINLOCK(c3_lock); +static DEFINE_RAW_SPINLOCK(c3_lock); /** * acpi_idle_enter_bm - enters C3 with proper BM handling @@ -1503,7 +1511,7 @@ static int acpi_idle_enter_bm(struct cpu */ smp_mb(); - if (unlikely(need_resched())) { + if (unlikely(need_resched() || need_resched_delayed())) { current_thread_info()->status |= TS_POLLING; local_irq_enable(); return 0; Index: linux-2.6.24-rc8-rt1/arch/arm/Kconfig =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/Kconfig 2008-01-16 22:27:43.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/Kconfig 2008-01-16 22:28:24.000000000 -0500 @@ -33,6 +33,10 @@ config GENERIC_CLOCKEVENTS bool default n +config STACKTRACE_SUPPORT + bool + default y + config MMU bool default y @@ -618,18 +622,7 @@ config LOCAL_TIMERS accounting to be spread across the timer interval, preventing a "thundering herd" at every timer tick. -config PREEMPT - bool "Preemptible Kernel (EXPERIMENTAL)" - depends on EXPERIMENTAL - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. +source kernel/Kconfig.preempt config NO_IDLE_HZ bool "Dynamic tick timer" Index: linux-2.6.24-rc8-rt1/arch/arm/lib/Makefile =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/lib/Makefile 2008-01-16 22:27:43.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/lib/Makefile 2008-01-16 22:28:16.000000000 -0500 @@ -41,6 +41,7 @@ lib-$(CONFIG_ARCH_RPC) += ecard.o io-ac lib-$(CONFIG_ARCH_CLPS7500) += io-acorn.o lib-$(CONFIG_ARCH_L7200) += io-acorn.o lib-$(CONFIG_ARCH_SHARK) += io-shark.o +lib-$(CONFIG_STACKTRACE) += stacktrace.o $(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S $(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S Index: linux-2.6.24-rc8-rt1/arch/arm/lib/stacktrace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/arch/arm/lib/stacktrace.c 2008-01-16 22:28:16.000000000 -0500 @@ -0,0 +1,7 @@ +#include +#include + +void save_stack_trace(struct stack_trace *trace) +{ +} + Index: linux-2.6.24-rc8-rt1/include/asm-arm/arch-ep93xx/timex.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-arm/arch-ep93xx/timex.h 2008-01-16 22:27:43.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-arm/arch-ep93xx/timex.h 2008-01-16 22:28:16.000000000 -0500 @@ -1,5 +1,11 @@ /* * linux/include/asm-arm/arch-ep93xx/timex.h */ +#include +#include #define CLOCK_TICK_RATE 983040 + +#define mach_read_cycles() __raw_readl(EP93XX_TIMER4_VALUE_LOW) +#define mach_cycles_to_usecs(d) (((d) * ((1000000LL << 32) / CLOCK_TICK_RATE)) >> 32) +#define mach_usecs_to_cycles(d) (((d) * (((long long)CLOCK_TICK_RATE << 32) / 1000000)) >> 32) Index: linux-2.6.24-rc8-rt1/drivers/char/random.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/char/random.c 2008-01-16 22:27:43.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/char/random.c 2008-01-16 22:28:17.000000000 -0500 @@ -580,8 +580,11 @@ static void add_timer_randomness(struct preempt_disable(); /* if over the trickle threshold, use only 1 in 4096 samples */ if (input_pool.entropy_count > trickle_thresh && - (__get_cpu_var(trickle_count)++ & 0xfff)) - goto out; + (__get_cpu_var(trickle_count)++ & 0xfff)) { + preempt_enable(); + return; + } + preempt_enable(); sample.jiffies = jiffies; sample.cycles = get_cycles(); @@ -626,9 +629,6 @@ static void add_timer_randomness(struct if(input_pool.entropy_count >= random_read_wakeup_thresh) wake_up_interruptible(&random_read_wait); - -out: - preempt_enable(); } void add_input_randomness(unsigned int type, unsigned int code, Index: linux-2.6.24-rc8-rt1/drivers/char/Kconfig =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/char/Kconfig 2008-01-16 22:27:43.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/char/Kconfig 2008-01-16 22:29:07.000000000 -0500 @@ -753,6 +753,46 @@ config JS_RTC To compile this driver as a module, choose M here: the module will be called js-rtc. +config RTC_HISTOGRAM + bool "Real Time Clock Histogram Support" + default n + depends on RTC + ---help--- + If you say Y here then the kernel will track the delivery and + wakeup latency of /dev/rtc using tasks and will report a + histogram to the kernel log when the application closes /dev/rtc. + +config BLOCKER + tristate "Priority Inheritance Debugging (Blocker) Device Support" + depends on X86 + default y + ---help--- + If you say Y here then a device will be created that the userspace + pi_test suite uses to test and measure kernel locking primitives. + +config LPPTEST + tristate "Parallel Port Based Latency Measurement Device" + depends on !PARPORT && X86 + default y + ---help--- + If you say Y here then a device will be created that the userspace + testlpp utility uses to measure IRQ latencies of a target system + from an independent measurement system. + + NOTE: this code assumes x86 PCs and that the parallel port is + bidirectional and is on IRQ 7. + + to use the device, both the target and the source system needs to + run a kernel with CONFIG_LPPTEST enabled. To measure latencies, + use the scripts/testlpp utility in your kernel source directory, + and run it (as root) on the source system - it will start printing + out the latencies it took to get a response from the target system: + + Latency of response: 12.2 usecs (121265 cycles) + + then generate various workloads on the target system to see how + (worst-case-) latencies are impacted. + config SGI_DS1286 tristate "SGI DS1286 RTC support" depends on SGI_IP22 @@ -1032,6 +1072,24 @@ config TELCLOCK /sys/devices/platform/telco_clock, with a number of files for controlling the behavior of this hardware. +config RMEM + tristate "Access to physical memory via /dev/rmem" + default m + help + The /dev/mem device only allows mmap() memory available to + I/O mapped memory; it does not allow access to "real" + physical memory. The /dev/rmem device is a hack which does + allow access to physical memory. We use this instead of + patching /dev/mem because we don't expect this functionality + to ever be accepted into mainline. + +config ALLOC_RTSJ_MEM + tristate "RTSJ-specific hack to reserve memory" + default m + help + The RTSJ TCK conformance test requires reserving some physical + memory for testing /dev/rmem. + config DEVPORT bool depends on !M68K Index: linux-2.6.24-rc8-rt1/drivers/char/Makefile =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/char/Makefile 2008-01-16 22:27:43.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/char/Makefile 2008-01-16 22:29:07.000000000 -0500 @@ -85,6 +85,8 @@ obj-$(CONFIG_TOSHIBA) += toshiba.o obj-$(CONFIG_I8K) += i8k.o obj-$(CONFIG_DS1620) += ds1620.o obj-$(CONFIG_HW_RANDOM) += hw_random/ +obj-$(CONFIG_BLOCKER) += blocker.o +obj-$(CONFIG_LPPTEST) += lpptest.o obj-$(CONFIG_COBALT_LCD) += lcd.o obj-$(CONFIG_PPDEV) += ppdev.o obj-$(CONFIG_NWBUTTON) += nwbutton.o @@ -96,6 +98,7 @@ obj-$(CONFIG_CS5535_GPIO) += cs5535_gpio obj-$(CONFIG_GPIO_VR41XX) += vr41xx_giu.o obj-$(CONFIG_GPIO_TB0219) += tb0219.o obj-$(CONFIG_TELCLOCK) += tlclk.o +obj-$(CONFIG_RMEM) += rmem.o obj-$(CONFIG_MWAVE) += mwave/ obj-$(CONFIG_AGP) += agp/ @@ -111,6 +114,8 @@ obj-$(CONFIG_PS3_FLASH) += ps3flash.o obj-$(CONFIG_JS_RTC) += js-rtc.o js-rtc-y = rtc.o +obj-$(CONFIG_ALLOC_RTSJ_MEM) += alloc_rtsj_mem.o + # Files generated that shall be removed upon make clean clean-files := consolemap_deftbl.c defkeymap.c Index: linux-2.6.24-rc8-rt1/drivers/char/blocker.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/drivers/char/blocker.c 2008-01-16 22:28:17.000000000 -0500 @@ -0,0 +1,109 @@ +/* + * priority inheritance testing device + */ + +#include +#include +#include +#include + +#define BLOCKER_MINOR 221 + +#define BLOCK_IOCTL 4245 +#define BLOCK_SET_DEPTH 4246 + +#define BLOCKER_MAX_LOCK_DEPTH 10 + +void loop(int loops) +{ + int i; + + for (i = 0; i < loops; i++) + get_cycles(); +} + +static spinlock_t blocker_lock[BLOCKER_MAX_LOCK_DEPTH]; + +static unsigned int lock_depth = 1; + +void do_the_lock_and_loop(unsigned int args) +{ + int i, max; + + if (rt_task(current)) + max = lock_depth; + else if (lock_depth > 1) + max = (current->pid % lock_depth) + 1; + else + max = 1; + + /* Always lock from the top down */ + for (i = max-1; i >= 0; i--) + spin_lock(&blocker_lock[i]); + loop(args); + for (i = 0; i < max; i++) + spin_unlock(&blocker_lock[i]); +} + +static int blocker_open(struct inode *in, struct file *file) +{ + printk(KERN_INFO "blocker_open called\n"); + + return 0; +} + +static long blocker_ioctl(struct file *file, + unsigned int cmd, unsigned long args) +{ + switch(cmd) { + case BLOCK_IOCTL: + do_the_lock_and_loop(args); + return 0; + case BLOCK_SET_DEPTH: + if (args >= BLOCKER_MAX_LOCK_DEPTH) + return -EINVAL; + lock_depth = args; + return 0; + default: + return -EINVAL; + } +} + +static struct file_operations blocker_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .unlocked_ioctl = blocker_ioctl, + .open = blocker_open, +}; + +static struct miscdevice blocker_dev = +{ + BLOCKER_MINOR, + "blocker", + &blocker_fops +}; + +static int __init blocker_init(void) +{ + int i; + + if (misc_register(&blocker_dev)) + return -ENODEV; + + for (i = 0; i < BLOCKER_MAX_LOCK_DEPTH; i++) + spin_lock_init(blocker_lock + i); + + return 0; +} + +void __exit blocker_exit(void) +{ + printk(KERN_INFO "blocker device uninstalled\n"); + misc_deregister(&blocker_dev); +} + +module_init(blocker_init); +module_exit(blocker_exit); + +MODULE_LICENSE("GPL"); + Index: linux-2.6.24-rc8-rt1/drivers/char/lpptest.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/drivers/char/lpptest.c 2008-01-16 22:28:17.000000000 -0500 @@ -0,0 +1,178 @@ +/* + * /dev/lpptest device: test IRQ handling latencies over parallel port + * + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar + * + * licensed under the GPL + * + * You need to have CONFIG_PARPORT disabled for this device, it is a + * completely self-contained device that assumes sole ownership of the + * parallel port. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * API wrappers so that the code can be shared with the -rt tree: + */ +#ifndef local_irq_disable +# define local_irq_disable local_irq_disable +# define local_irq_enable local_irq_enable +#endif + +#ifndef IRQ_NODELAY +# define IRQ_NODELAY 0 +# define IRQF_NODELAY 0 +#endif + +/* + * Driver: + */ +#define LPPTEST_CHAR_MAJOR 245 +#define LPPTEST_DEVICE_NAME "lpptest" + +#define LPPTEST_IRQ 7 + +#define LPPTEST_TEST _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long) +#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long) +#define LPPTEST_ENABLE _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long) + +static char dev_id[] = "lpptest"; + +#define INIT_PORT() outb(0x04, 0x37a) +#define ENABLE_IRQ() outb(0x10, 0x37a) +#define DISABLE_IRQ() outb(0, 0x37a) + +static unsigned char out = 0x5a; + +/** + * Interrupt handler. Flip a bit in the reply. + */ +static int lpptest_irq (int irq, void *dev_id) +{ + out ^= 0xff; + outb(out, 0x378); + + return IRQ_HANDLED; +} + +static cycles_t test_response(void) +{ + cycles_t now, end; + unsigned char in; + int timeout = 0; + + local_irq_disable(); + in = inb(0x379); + inb(0x378); + outb(0x08, 0x378); + now = get_cycles(); + while(1) { + if (inb(0x379) != in) + break; + if (timeout++ > 1000000) { + outb(0x00, 0x378); + local_irq_enable(); + + return 0; + } + } + end = get_cycles(); + outb(0x00, 0x378); + local_irq_enable(); + + return end - now; +} + +static int lpptest_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static int lpptest_close(struct inode *inode, struct file *file) +{ + return 0; +} + +int lpptest_ioctl(struct inode *inode, struct file *file, unsigned int ioctl_num, unsigned long ioctl_param) +{ + int retval = 0; + + switch (ioctl_num) { + + case LPPTEST_DISABLE: + DISABLE_IRQ(); + break; + + case LPPTEST_ENABLE: + ENABLE_IRQ(); + break; + + case LPPTEST_TEST: { + + cycles_t diff = test_response(); + if (copy_to_user((void *)ioctl_param, (void*) &diff, sizeof(diff))) + goto errcpy; + break; + } + default: retval = -EINVAL; + } + + return retval; + + errcpy: + return -EFAULT; +} + +static struct file_operations lpptest_dev_fops = { + .ioctl = lpptest_ioctl, + .open = lpptest_open, + .release = lpptest_close, +}; + +static int __init lpptest_init (void) +{ + if (register_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME, &lpptest_dev_fops)) + { + printk(KERN_NOTICE "Can't allocate major number %d for lpptest.\n", + LPPTEST_CHAR_MAJOR); + return -EAGAIN; + } + + if (request_irq (LPPTEST_IRQ, lpptest_irq, 0, "lpptest", dev_id)) { + printk (KERN_WARNING "lpptest: irq %d in use. Unload parport module!\n", LPPTEST_IRQ); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); + return -EAGAIN; + } + irq_desc[LPPTEST_IRQ].status |= IRQ_NODELAY; + irq_desc[LPPTEST_IRQ].action->flags |= IRQF_NODELAY | IRQF_DISABLED; + + INIT_PORT(); + ENABLE_IRQ(); + + return 0; +} +module_init (lpptest_init); + +static void __exit lpptest_exit (void) +{ + DISABLE_IRQ(); + + free_irq(LPPTEST_IRQ, dev_id); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); +} +module_exit (lpptest_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("lpp test module"); + Index: linux-2.6.24-rc8-rt1/drivers/char/rtc.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/char/rtc.c 2008-01-16 22:27:43.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/char/rtc.c 2008-01-16 22:28:56.000000000 -0500 @@ -90,10 +90,35 @@ #include #include +#ifdef CONFIG_MIPS +# include +#endif static unsigned long rtc_port; static int rtc_irq = PCI_IRQ_NONE; #endif +#ifdef CONFIG_RTC_HISTOGRAM + +static cycles_t last_interrupt_time; + +#include + +#define CPU_MHZ (cpu_khz / 1000) + +#define HISTSIZE 10000 +static int histogram[HISTSIZE]; + +static int rtc_state; + +enum rtc_states { + S_STARTUP, /* First round - let the application start */ + S_IDLE, /* Waiting for an interrupt */ + S_WAITING_FOR_READ, /* Signal delivered. waiting for rtc_read() */ + S_READ_MISSED, /* Signal delivered, read() deadline missed */ +}; + +#endif + #ifdef CONFIG_HPET_RTC_IRQ #undef RTC_IRQ #endif @@ -222,7 +247,146 @@ static inline unsigned char rtc_is_updat return uip; } +#ifndef RTC_IRQ +# undef CONFIG_RTC_HISTOGRAM +#endif + +static inline void rtc_open_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i; + + last_interrupt_time = 0; + rtc_state = S_STARTUP; + rtc_irq_data = 0; + + for (i = 0; i < HISTSIZE; i++) + histogram[i] = 0; +#endif +} + +static inline void rtc_wake_event(void) +{ +#ifndef CONFIG_RTC_HISTOGRAM + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); +#else + if (!(rtc_status & RTC_IS_OPEN)) + return; + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + break; + /* Waiting for an interrupt */ + case S_IDLE: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + last_interrupt_time = get_cycles(); + rtc_state = S_WAITING_FOR_READ; + break; + + /* Signal has been delivered. waiting for rtc_read() */ + case S_WAITING_FOR_READ: + /* + * Well foo. The usermode application didn't + * schedule and read in time. + */ + last_interrupt_time = get_cycles(); + rtc_state = S_READ_MISSED; + printk("Read missed before next interrupt\n"); + break; + /* Signal has been delivered, read() deadline was missed */ + case S_READ_MISSED: + /* + * Not much we can do here. We're waiting for the usermode + * application to read the rtc + */ + last_interrupt_time = get_cycles(); + break; + } +#endif +} + +static inline void rtc_read_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + cycles_t now = get_cycles(); + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + rtc_state = S_IDLE; + break; + + /* Waiting for an interrupt */ + case S_IDLE: + printk("bug in rtc_read(): called in state S_IDLE!\n"); + break; + case S_WAITING_FOR_READ: /* + * Signal has been delivered. + * waiting for rtc_read() + */ + /* + * Well done + */ + case S_READ_MISSED: /* + * Signal has been delivered, read() + * deadline was missed + */ + /* + * So, you finally got here. + */ + if (!last_interrupt_time) + printk("bug in rtc_read(): last_interrupt_time = 0\n"); + rtc_state = S_IDLE; + { + cycles_t latency = now - last_interrupt_time; + unsigned long delta; /* Microseconds */ + + delta = latency; + delta /= CPU_MHZ; + + if (delta > 1000 * 1000) { + printk("rtc: eek\n"); + } else { + unsigned long slot = delta; + if (slot >= HISTSIZE) + slot = HISTSIZE - 1; + histogram[slot]++; + if (delta > 2000) + printk("wow! That was a " + "%ld millisec bump\n", + delta / 1000); + } + } + rtc_state = S_IDLE; + break; + } +#endif +} + +static inline void rtc_close_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i = 0; + unsigned long total = 0; + + for (i = 0; i < HISTSIZE; i++) + total += histogram[i]; + if (!total) + return; + + printk("\nrtc latency histogram of {%s/%d, %lu samples}:\n", + current->comm, current->pid, total); + for (i = 0; i < HISTSIZE; i++) { + if (histogram[i]) + printk("%d %d\n", i, histogram[i]); + } +#endif +} + #ifdef RTC_IRQ + /* * A very tiny interrupt handler. It runs with IRQF_DISABLED set, * but there is possibility of conflicting with the set_rtc_mmss() @@ -266,9 +430,9 @@ irqreturn_t rtc_interrupt(int irq, void if (rtc_callback) rtc_callback->func(rtc_callback->private_data); spin_unlock(&rtc_task_lock); - wake_up_interruptible(&rtc_wait); - kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + rtc_wake_event(); + wake_up_interruptible(&rtc_wait); return IRQ_HANDLED; } @@ -378,6 +542,8 @@ static ssize_t rtc_read(struct file *fil schedule(); } while (1); + rtc_read_event(); + if (count == sizeof(unsigned int)) retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int); else @@ -610,6 +776,11 @@ static int rtc_do_ioctl(unsigned int cmd save_freq_select = CMOS_READ(RTC_FREQ_SELECT); CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + /* + * Make CMOS date writes nonpreemptible even on PREEMPT_RT. + * There's a limit to everything! =B-) + */ + preempt_disable(); #ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR); #endif @@ -619,6 +790,7 @@ static int rtc_do_ioctl(unsigned int cmd CMOS_WRITE(hrs, RTC_HOURS); CMOS_WRITE(min, RTC_MINUTES); CMOS_WRITE(sec, RTC_SECONDS); + preempt_enable(); CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); @@ -717,6 +889,7 @@ static int rtc_open(struct inode *inode, if(rtc_status & RTC_IS_OPEN) goto out_busy; + rtc_open_event(); rtc_status |= RTC_IS_OPEN; rtc_irq_data = 0; @@ -772,6 +945,7 @@ no_irq: rtc_irq_data = 0; rtc_status &= ~RTC_IS_OPEN; spin_unlock_irq (&rtc_lock); + rtc_close_event(); return 0; } @@ -1167,8 +1341,10 @@ static void rtc_dropped_irq(unsigned lon spin_unlock_irq(&rtc_lock); +#ifndef CONFIG_PREEMPT_RT if (printk_ratelimit()) printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq); +#endif /* Now we have new data */ wake_up_interruptible(&rtc_wait); Index: linux-2.6.24-rc8-rt1/scripts/testlpp.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/scripts/testlpp.c 2008-01-16 22:28:17.000000000 -0500 @@ -0,0 +1,159 @@ +/* + * testlpp.c: use the /dev/lpptest device to test IRQ handling + * latencies over parallel port + * + * Copyright (C) 2005 Thomas Gleixner + * + * licensed under the GPL + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LPPTEST_CHAR_MAJOR 245 +#define LPPTEST_DEVICE_NAME "lpptest" + +#define LPPTEST_TEST _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long) +#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long) +#define LPPTEST_ENABLE _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long) + +#define HIST_SIZE 10000 + +static int hist_total; +static unsigned long hist[HIST_SIZE]; + +static void hist_hit(unsigned long usecs) +{ + hist_total++; + if (usecs >= HIST_SIZE-1) + hist[HIST_SIZE-1]++; + else + hist[usecs]++; +} + +static void print_hist(void) +{ + int i; + + printf("LPP latency histogram:\n"); + + for (i = 0; i < HIST_SIZE; i++) { + if (hist[i]) + printf("%3d usecs: %9ld\n", i, hist[i]); + } +} + +static inline unsigned long long int rdtsc(void) +{ + unsigned long long int x, y; + for (;;) { + __asm__ volatile ("rdtsc" : "=A" (x)); + __asm__ volatile ("rdtsc" : "=A" (y)); + if (y - x < 1000) + return y; + } +} + +static unsigned long long calibrate_loop(void) +{ + unsigned long long mytime1, mytime2; + + mytime1 = rdtsc(); + usleep(500000); + mytime2 = rdtsc(); + + return (mytime2 - mytime1) * 2; +} + +#define time_to_usecs(time) ((double)time*1000000.0/(double)cycles_per_sec) + +#define time_to_usecs_l(time) (long)(time*1000000/cycles_per_sec) + +int fd, total; +unsigned long long tim, sum_tim, min_tim = -1ULL, max_tim, cycles_per_sec; + +void cleanup(int sig) +{ + ioctl (fd, LPPTEST_ENABLE, &tim); + if (sig) + printf("[ interrupted - exiting ]\n"); + printf("\ntotal number of responses: %d\n", total); + printf("average reponse latency: %.2lf usecs\n", + time_to_usecs(sum_tim/total)); + printf("minimum latency: %.2lf usecs\n", + time_to_usecs(min_tim)); + printf("maximum latency: %.2lf usecs\n", + time_to_usecs(max_tim)); + print_hist(); + exit(0); +} + +#define HZ 3000 + +int main (int argc, char **argv) +{ + unsigned int nr_requests = 0; + + if (argc > 2) { + fprintf(stderr, "usage: testlpp []\n"); + exit(-1); + } + if (argc == 2) + nr_requests = atol(argv[1]); + + if (getuid() != 0) { + fprintf(stderr, "need to run as root!\n"); + exit(-1); + } + mknod("/dev/lpptest", S_IFCHR|0666, makedev(245, 1)); + + fd = open("/dev/lpptest", O_RDWR); + if (fd == -1) { + fprintf(stderr, "could not open /dev/lpptest, your kernel doesnt have CONFIG_LPPTEST enabled?\n"); + exit(-1); + } + + signal(SIGINT,&cleanup); + + ioctl (fd, LPPTEST_DISABLE, &tim); + + fprintf(stderr, "calibrating cycles to usecs: "); + cycles_per_sec = calibrate_loop(); + fprintf(stderr, "%lld cycles per usec\n", cycles_per_sec/1000000); + if (nr_requests) + fprintf(stderr, "[max # of requests: %u]\n", nr_requests); + fprintf(stderr, "starting %dHz test, hit Ctrl-C to stop:\n\n", HZ); + + while(1) { + ioctl (fd, LPPTEST_TEST, &tim); + if (tim == 0) + printf ("No response from target.\n"); + else { + hist_hit(time_to_usecs_l(tim)); + if (tim > max_tim) { + printf ("new max latency: %.2lf usecs (%Ld cycles)\n", time_to_usecs(tim), tim); + max_tim = tim; + } + if (tim < min_tim) + min_tim = tim; + total++; + if (total == nr_requests) + break; + sum_tim += tim; + } + usleep(1000000/HZ); + } + cleanup(0); + + return 0; +} + + Index: linux-2.6.24-rc8-rt1/include/linux/lockdep.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/lockdep.h 2008-01-16 22:27:42.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/lockdep.h 2008-01-16 22:29:10.000000000 -0500 @@ -304,6 +304,9 @@ extern void lock_acquire(struct lockdep_ extern void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); +extern void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass, + unsigned long ip); + # define INIT_LOCKDEP .lockdep_recursion = 0, #define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) @@ -320,6 +323,7 @@ static inline void lockdep_on(void) # define lock_acquire(l, s, t, r, c, i) do { } while (0) # define lock_release(l, n, i) do { } while (0) +# define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_info() do { } while (0) # define lockdep_init_map(lock, name, key, sub) do { (void)(key); } while (0) @@ -357,6 +361,28 @@ do { \ lock_acquired(&(_lock)->dep_map); \ } while (0) +#define LOCK_CONTENDED_RT(_lock, f_try, f_lock) \ +do { \ + if (!f_try(&(_lock)->lock)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + f_lock(&(_lock)->lock); \ + } \ + lock_acquired(&(_lock)->dep_map); \ +} while (0) + + +#define LOCK_CONTENDED_RT_RET(_lock, f_try, f_lock) \ +({ \ + int ret = 0; \ + if (!f_try(&(_lock)->lock)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + ret = f_lock(&(_lock)->lock); \ + } \ + if (!ret) \ + lock_acquired(&(_lock)->dep_map); \ + ret; \ +}) + #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) @@ -365,6 +391,12 @@ do { \ #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) +#define LOCK_CONTENDED_RT(_lock, f_try, f_lock) \ + f_lock(&(_lock)->lock) + +#define LOCK_CONTENDED_RT_RET(_lock, f_try, f_lock) \ + f_lock(&(_lock)->lock) + #endif /* CONFIG_LOCK_STAT */ #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS) Index: linux-2.6.24-rc8-rt1/kernel/lockdep_internals.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/lockdep_internals.h 2008-01-16 22:27:42.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/lockdep_internals.h 2008-01-16 22:28:19.000000000 -0500 @@ -15,12 +15,12 @@ * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. */ -#define MAX_LOCKDEP_ENTRIES 8192UL +#define MAX_LOCKDEP_ENTRIES 16384UL #define MAX_LOCKDEP_KEYS_BITS 11 #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) -#define MAX_LOCKDEP_CHAINS_BITS 14 +#define MAX_LOCKDEP_CHAINS_BITS 15 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) /* Index: linux-2.6.24-rc8-rt1/drivers/net/loopback.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/net/loopback.c 2008-01-16 22:27:42.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/net/loopback.c 2008-01-16 22:28:56.000000000 -0500 @@ -154,13 +154,13 @@ static int loopback_xmit(struct sk_buff #endif dev->last_rx = jiffies; - /* it's OK to use per_cpu_ptr() because BHs are off */ pcpu_lstats = netdev_priv(dev); - lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id()); + lb_stats = per_cpu_ptr(pcpu_lstats, get_cpu()); lb_stats->bytes += skb->len; lb_stats->packets++; + put_cpu(); - netif_rx(skb); + netif_rx_ni(skb); return 0; } Index: linux-2.6.24-rc8-rt1/kernel/time/clockevents.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/time/clockevents.c 2008-01-16 22:27:42.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/time/clockevents.c 2008-01-16 22:28:54.000000000 -0500 @@ -12,12 +12,14 @@ */ #include +#include #include #include #include #include #include #include +#include /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); @@ -27,7 +29,7 @@ static LIST_HEAD(clockevents_released); static RAW_NOTIFIER_HEAD(clockevents_chain); /* Protection for the above */ -static DEFINE_SPINLOCK(clockevents_lock); +static DEFINE_RAW_SPINLOCK(clockevents_lock); /** * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds @@ -85,6 +87,8 @@ int clockevents_program_event(struct clo delta = ktime_to_ns(ktime_sub(expires, now)); + hrtimer_trace(expires, (unsigned long) delta); + if (delta <= 0) return -ETIME; Index: linux-2.6.24-rc8-rt1/include/linux/rcuclassic.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/include/linux/rcuclassic.h 2008-01-16 22:29:21.000000000 -0500 @@ -0,0 +1,100 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (classic version) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2001 + * + * Author: Dipankar Sarma + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * Papers: + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + * + */ + +#ifndef __LINUX_RCUCLASSIC_H +#define __LINUX_RCUCLASSIC_H + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + +DECLARE_PER_CPU(int, rcu_data_bh_passed_quiesc); + +/* + * Increment the bottom-half quiescent state counter. + * The counter is a bit degenerated: We do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period. Thus just a flag. + */ +static inline void rcu_bh_qsctr_inc(int cpu) +{ + per_cpu(rcu_data_bh_passed_quiesc, cpu) = 1; +} + +#define __rcu_read_lock() \ + do { \ + preempt_disable(); \ + __acquire(RCU); \ + } while (0) +#define __rcu_read_unlock() \ + do { \ + __release(RCU); \ + preempt_enable(); \ + } while (0) +#define __rcu_read_lock_bh() \ + do { \ + local_bh_disable(); \ + __acquire(RCU_BH); \ + } while (0) +#define __rcu_read_unlock_bh() \ + do { \ + __release(RCU_BH); \ + local_bh_enable(); \ + } while (0) + +#define __synchronize_sched() synchronize_rcu() + +#define rcu_advance_callbacks_rt(cpu, user) do { } while (0) +#define rcu_check_callbacks_rt(cpu, user) do { } while (0) +#define rcu_init_rt() do { } while (0) +#define rcu_needs_cpu_rt(cpu) 0 +#define rcu_offline_cpu_rt(cpu) +#define rcu_online_cpu_rt(cpu) +#define rcu_pending_rt(cpu) 0 +#define rcu_process_callbacks_rt(unused) do { } while (0) +#define rcu_enter_nohz() do { } while (0) +#define rcu_exit_nohz() do { } while (0) +#define rcu_preempt_boost_init() do { } while (0) + +extern void FASTCALL(call_rcu_classic(struct rcu_head *head, + void (*func)(struct rcu_head *head))); + +struct softirq_action; +extern void rcu_process_callbacks(struct softirq_action *unused); + +#endif /* __KERNEL__ */ +#endif /* __LINUX_RCUCLASSIC_H */ Index: linux-2.6.24-rc8-rt1/include/linux/rcupdate.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/rcupdate.h 2008-01-16 22:27:42.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/rcupdate.h 2008-01-16 22:29:20.000000000 -0500 @@ -15,7 +15,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * Copyright (C) IBM Corporation, 2001 + * Copyright IBM Corporation, 2001 * * Author: Dipankar Sarma * @@ -53,6 +53,12 @@ struct rcu_head { void (*func)(struct rcu_head *head); }; +#ifdef CONFIG_CLASSIC_RCU +#include +#else /* #ifdef CONFIG_CLASSIC_RCU */ +#include +#endif /* #else #ifdef CONFIG_CLASSIC_RCU */ + #define RCU_HEAD_INIT { .next = NULL, .func = NULL } #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT #define INIT_RCU_HEAD(ptr) do { \ @@ -61,79 +67,6 @@ struct rcu_head { -/* Global control variables for rcupdate callback mechanism. */ -struct rcu_ctrlblk { - long cur; /* Current batch number. */ - long completed; /* Number of the last completed batch */ - int next_pending; /* Is the next batch already waiting? */ - - int signaled; - - spinlock_t lock ____cacheline_internodealigned_in_smp; - cpumask_t cpumask; /* CPUs that need to switch in order */ - /* for current batch to proceed. */ -} ____cacheline_internodealigned_in_smp; - -/* Is batch a before batch b ? */ -static inline int rcu_batch_before(long a, long b) -{ - return (a - b) < 0; -} - -/* Is batch a after batch b ? */ -static inline int rcu_batch_after(long a, long b) -{ - return (a - b) > 0; -} - -/* - * Per-CPU data for Read-Copy UPdate. - * nxtlist - new callbacks are added here - * curlist - current batch for which quiescent cycle started if any - */ -struct rcu_data { - /* 1) quiescent state handling : */ - long quiescbatch; /* Batch # for grace period */ - int passed_quiesc; /* User-mode/idle loop etc. */ - int qs_pending; /* core waits for quiesc state */ - - /* 2) batch handling */ - long batch; /* Batch # for current RCU batch */ - struct rcu_head *nxtlist; - struct rcu_head **nxttail; - long qlen; /* # of queued callbacks */ - struct rcu_head *curlist; - struct rcu_head **curtail; - struct rcu_head *donelist; - struct rcu_head **donetail; - long blimit; /* Upper limit on a processed batch */ - int cpu; - struct rcu_head barrier; -}; - -DECLARE_PER_CPU(struct rcu_data, rcu_data); -DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); - -/* - * Increment the quiescent state counter. - * The counter is a bit degenerated: We do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period. Thus just a flag. - */ -static inline void rcu_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - rdp->passed_quiesc = 1; -} -static inline void rcu_bh_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc = 1; -} - -extern int rcu_pending(int cpu); -extern int rcu_needs_cpu(int cpu); - #ifdef CONFIG_DEBUG_LOCK_ALLOC extern struct lockdep_map rcu_lock_map; # define rcu_read_acquire() lock_acquire(&rcu_lock_map, 0, 0, 2, 1, _THIS_IP_) @@ -172,24 +105,14 @@ extern struct lockdep_map rcu_lock_map; * * It is illegal to block while in an RCU read-side critical section. */ -#define rcu_read_lock() \ - do { \ - preempt_disable(); \ - __acquire(RCU); \ - rcu_read_acquire(); \ - } while(0) +#define rcu_read_lock() __rcu_read_lock() /** * rcu_read_unlock - marks the end of an RCU read-side critical section. * * See rcu_read_lock() for more information. */ -#define rcu_read_unlock() \ - do { \ - rcu_read_release(); \ - __release(RCU); \ - preempt_enable(); \ - } while(0) +#define rcu_read_unlock() __rcu_read_unlock() /* * So where is rcu_write_lock()? It does not exist, as there is no @@ -212,24 +135,14 @@ extern struct lockdep_map rcu_lock_map; * can use just rcu_read_lock(). * */ -#define rcu_read_lock_bh() \ - do { \ - local_bh_disable(); \ - __acquire(RCU_BH); \ - rcu_read_acquire(); \ - } while(0) +#define rcu_read_lock_bh() __rcu_read_lock_bh() /* * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section * * See rcu_read_lock_bh() for more information. */ -#define rcu_read_unlock_bh() \ - do { \ - rcu_read_release(); \ - __release(RCU_BH); \ - local_bh_enable(); \ - } while(0) +#define rcu_read_unlock_bh() __rcu_read_unlock_bh() /* * Prevent the compiler from merging or refetching accesses. The compiler @@ -293,21 +206,118 @@ extern struct lockdep_map rcu_lock_map; * In "classic RCU", these two guarantees happen to be one and * the same, but can differ in realtime RCU implementations. */ -#define synchronize_sched() synchronize_rcu() +#define synchronize_sched() __synchronize_sched() -extern void rcu_init(void); -extern void rcu_check_callbacks(int cpu, int user); -extern void rcu_restart_cpu(int cpu); -extern long rcu_batches_completed(void); -extern long rcu_batches_completed_bh(void); +/** + * call_rcu - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +#ifdef CONFIG_CLASSIC_RCU +#define call_rcu call_rcu_classic +#else /* #ifdef CONFIG_CLASSIC_RCU */ +#define call_rcu call_rcu_preempt +#endif /* #else #ifdef CONFIG_CLASSIC_RCU */ -/* Exported interfaces */ -extern void FASTCALL(call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *head))); +/** + * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_bh() assumes + * that the read-side critical sections end on completion of a softirq + * handler. This means that read-side critical sections in process + * context must not be interrupted by softirqs. This interface is to be + * used when most of the read-side critical sections are in softirq context. + * RCU read-side critical sections are delimited by rcu_read_lock() and + * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() + * and rcu_read_unlock_bh(), if in process context. These may be nested. + */ extern void FASTCALL(call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *head))); + +/* Exported common interfaces */ extern void synchronize_rcu(void); extern void rcu_barrier(void); +extern long rcu_batches_completed(void); +extern long rcu_batches_completed_bh(void); + +/* Internal to kernel */ +extern void rcu_check_callbacks(int cpu, int user); +extern long rcu_batches_completed(void); +extern long rcu_batches_completed_bh(void); +extern void rcu_check_callbacks(int cpu, int user); +extern void rcu_init(void); +extern int rcu_needs_cpu(int cpu); +extern int rcu_pending(int cpu); +struct softirq_action; +extern void rcu_restart_cpu(int cpu); + +DECLARE_PER_CPU(int, rcu_data_passed_quiesc); + +/* + * Increment the quiescent state counter. + * The counter is a bit degenerated: We do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period. Thus just a flag. + */ +static inline void rcu_qsctr_inc(int cpu) +{ + per_cpu(rcu_data_passed_quiesc, cpu) = 1; +} + +struct dentry; + +#ifdef CONFIG_PREEMPT_RCU_BOOST +extern void init_rcu_boost_late(void); +extern void rcu_boost_readers(void); +extern void rcu_unboost_readers(void); +extern void __rcu_preempt_boost(void); +#ifdef CONFIG_RCU_TRACE +extern int rcu_trace_boost_create(struct dentry *rcudir); +extern void rcu_trace_boost_destroy(void); +#endif /* CONFIG_RCU_TRACE */ +#define rcu_preempt_boost() /* cpp to avoid #include hell. */ \ + do { \ + if (unlikely(current->rcu_read_lock_nesting > 0)) \ + __rcu_preempt_boost(); \ + } while (0) +extern void __rcu_preempt_unboost(void); +#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */ +static inline void init_rcu_boost_late(void) +{ +} +static inline void rcu_preempt_boost(void) +{ +} +static inline void __rcu_preempt_unboost(void) +{ +} +static inline void rcu_boost_readers(void) +{ +} +static inline void rcu_unboost_readers(void) +{ +} +#ifdef CONFIG_RCU_TRACE +static inline int rcu_trace_boost_create(struct dentry *rcudir) +{ + return 0; +} +static inline void rcu_trace_boost_destroy(void) +{ +} +#endif /* CONFIG_RCU_TRACE */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */ #endif /* __KERNEL__ */ #endif /* __LINUX_RCUPDATE_H */ Index: linux-2.6.24-rc8-rt1/kernel/rcuclassic.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/kernel/rcuclassic.c 2008-01-16 22:28:53.000000000 -0500 @@ -0,0 +1,636 @@ +/* + * Read-Copy Update mechanism for mutual exclusion + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2001 + * + * Authors: Dipankar Sarma + * Manfred Spraul + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * Papers: + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + long cur; /* Current batch number. */ + long completed; /* Number of the last completed batch */ + int next_pending; /* Is the next batch already waiting? */ + + int signaled; + + raw_spinlock_t lock ____cacheline_internodealigned_in_smp; + cpumask_t cpumask; /* CPUs that need to switch in order */ + /* for current batch to proceed. */ +} ____cacheline_internodealigned_in_smp; + +/* Is batch a before batch b ? */ +static inline int rcu_batch_before(long a, long b) +{ + return (a - b) < 0; +} + +/* + * Per-CPU data for Read-Copy UPdate. + * nxtlist - new callbacks are added here + * curlist - current batch for which quiescent cycle started if any + */ +struct rcu_data { + /* 1) quiescent state handling : */ + long quiescbatch; /* Batch # for grace period */ + int *passed_quiesc; /* User-mode/idle loop etc. */ + int qs_pending; /* core waits for quiesc state */ + + /* 2) batch handling */ + long batch; /* Batch # for current RCU batch */ + struct rcu_head *nxtlist; + struct rcu_head **nxttail; + long qlen; /* # of queued callbacks */ + struct rcu_head *curlist; + struct rcu_head **curtail; + struct rcu_head *donelist; + struct rcu_head **donetail; + long blimit; /* Upper limit on a processed batch */ + int cpu; +}; + +/* Definition for rcupdate control block. */ +static struct rcu_ctrlblk rcu_ctrlblk = { + .cur = -300, + .completed = -300, + .lock = RAW_SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), + .cpumask = CPU_MASK_NONE, +}; +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .cur = -300, + .completed = -300, + .lock = RAW_SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), + .cpumask = CPU_MASK_NONE, +}; + +static DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; +static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; +DEFINE_PER_CPU(int, rcu_data_bh_passed_quiesc); + +/* Fake initialization required by compiler */ +static int blimit = 10; +static int qhimark = 10000; +static int qlowmark = 100; + +#ifdef CONFIG_SMP +static void force_quiescent_state(struct rcu_data *rdp, + struct rcu_ctrlblk *rcp) +{ + int cpu; + cpumask_t cpumask; + set_need_resched(); + if (unlikely(!rcp->signaled)) { + rcp->signaled = 1; + /* + * Don't send IPI to itself. With irqs disabled, + * rdp->cpu is the current cpu. + */ + cpumask = rcp->cpumask; + cpu_clear(rdp->cpu, cpumask); + for_each_cpu_mask(cpu, cpumask) + smp_send_reschedule(cpu); + } +} +#else +static inline void force_quiescent_state(struct rcu_data *rdp, + struct rcu_ctrlblk *rcp) +{ + set_need_resched(); +} +#endif + +/** + * call_rcu - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +void fastcall call_rcu_classic(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = &__get_cpu_var(rcu_data); + *rdp->nxttail = head; + rdp->nxttail = &head->next; + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(call_rcu_classic); + +#ifdef CONFIG_CLASSIC_RCU + +/** + * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_bh() assumes + * that the read-side critical sections end on completion of a softirq + * handler. This means that read-side critical sections in process + * context must not be interrupted by softirqs. This interface is to be + * used when most of the read-side critical sections are in softirq context. + * RCU read-side critical sections are delimited by rcu_read_lock() and + * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() + * and rcu_read_unlock_bh(), if in process context. These may be nested. + */ +void fastcall call_rcu_bh(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = &__get_cpu_var(rcu_bh_data); + *rdp->nxttail = head; + rdp->nxttail = &head->next; + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_bh_ctrlblk); + } + + local_irq_restore(flags); +} +#ifdef CONFIG_CLASSIC_RCU +EXPORT_SYMBOL_GPL(call_rcu_bh); +#endif /* #ifdef CONFIG_CLASSIC_RCU */ + +/* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. + */ +long rcu_batches_completed(void) +{ + return rcu_ctrlblk.completed; +} +#ifdef CONFIG_CLASSIC_RCU +EXPORT_SYMBOL_GPL(rcu_batches_completed); +#endif /* #ifdef CONFIG_CLASSIC_RCU */ + +/* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. + */ +long rcu_batches_completed_bh(void) +{ + return rcu_bh_ctrlblk.completed; +} +#ifdef CONFIG_CLASSIC_RCU +EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); +#endif /* #ifdef CONFIG_CLASSIC_RCU */ + +#endif /* #ifdef CONFIG_CLASSIC_RCU */ + +/* + * Invoke the completed RCU callbacks. They are expected to be in + * a per-cpu list. + */ +static void rcu_do_batch(struct rcu_data *rdp) +{ + struct rcu_head *next, *list; + int count = 0; + + list = rdp->donelist; + while (list) { + next = list->next; + prefetch(next); + list->func(list); + list = next; + if (++count >= rdp->blimit) + break; + } + rdp->donelist = list; + + local_irq_disable(); + rdp->qlen -= count; + local_irq_enable(); + if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) + rdp->blimit = blimit; + + if (!rdp->donelist) + rdp->donetail = &rdp->donelist; + else + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Grace period handling: + * The grace period handling consists out of two steps: + * - A new grace period is started. + * This is done by rcu_start_batch. The start is not broadcasted to + * all cpus, they must pick this up by comparing rcp->cur with + * rdp->quiescbatch. All cpus are recorded in the + * rcu_ctrlblk.cpumask bitmap. + * - All cpus must go through a quiescent state. + * Since the start of the grace period is not broadcasted, at least two + * calls to rcu_check_quiescent_state are required: + * The first call just notices that a new grace period is running. The + * following calls check if there was a quiescent state since the beginning + * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If + * the bitmap is empty, then the grace period is completed. + * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace + * period (if necessary). + */ +/* + * Register a new batch of callbacks, and start it up if there is currently no + * active batch and the batch to be registered has not already occurred. + * Caller must hold rcu_ctrlblk.lock. + */ +static void rcu_start_batch(struct rcu_ctrlblk *rcp) +{ + if (rcp->next_pending && + rcp->completed == rcp->cur) { + rcp->next_pending = 0; + /* + * next_pending == 0 must be visible in + * __rcu_process_callbacks() before it can see new value of cur. + */ + smp_wmb(); + rcp->cur++; + + /* + * Accessing nohz_cpu_mask before incrementing rcp->cur needs a + * Barrier Otherwise it can cause tickless idle CPUs to be + * included in rcp->cpumask, which will extend graceperiods + * unnecessarily. + */ + smp_mb(); + cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); + + rcp->signaled = 0; + } +} + +/* + * cpu went through a quiescent state since the beginning of the grace period. + * Clear it from the cpu mask and complete the grace period if it was the last + * cpu. Start another grace period if someone has further entries pending + */ +static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) +{ + cpu_clear(cpu, rcp->cpumask); + if (cpus_empty(rcp->cpumask)) { + /* batch completed ! */ + rcp->completed = rcp->cur; + rcu_start_batch(rcp); + } +} + +/* + * Check if the cpu has gone through a quiescent state (say context + * switch). If so and if it already hasn't done so in this RCU + * quiescent cycle, then indicate that it has done so. + */ +static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + if (rdp->quiescbatch != rcp->cur) { + /* start new grace period: */ + rdp->qs_pending = 1; + *rdp->passed_quiesc = 0; + rdp->quiescbatch = rcp->cur; + return; + } + + /* Grace period already completed for this cpu? + * qs_pending is checked instead of the actual bitmap to avoid + * cacheline trashing. + */ + if (!rdp->qs_pending) + return; + + /* + * Was there a quiescent state since the beginning of the grace + * period? If no, then exit and wait for the next call. + */ + if (!*rdp->passed_quiesc) + return; + rdp->qs_pending = 0; + + spin_lock(&rcp->lock); + /* + * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync + * during cpu startup. Ignore the quiescent state. + */ + if (likely(rdp->quiescbatch == rcp->cur)) + cpu_quiet(rdp->cpu, rcp); + + spin_unlock(&rcp->lock); +} + + +#ifdef CONFIG_HOTPLUG_CPU + +/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing + * locking requirements, the list it's pulling from has to belong to a cpu + * which is dead and hence not processing interrupts. + */ +static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, + struct rcu_head **tail) +{ + local_irq_disable(); + *this_rdp->nxttail = list; + if (list) + this_rdp->nxttail = tail; + local_irq_enable(); +} + +static void __rcu_offline_cpu(struct rcu_data *this_rdp, + struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +{ + /* if the cpu going offline owns the grace period + * we can block indefinitely waiting for it, so flush + * it here + */ + spin_lock_bh(&rcp->lock); + if (rcp->cur != rcp->completed) + cpu_quiet(rdp->cpu, rcp); + spin_unlock_bh(&rcp->lock); + rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); + rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); + rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); +} + +static void rcu_offline_cpu(int cpu) +{ + struct rcu_data *this_rdp = &get_cpu_var(rcu_data); +#ifdef CONFIG_CLASSIC_RCU + struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); +#endif /* #ifdef CONFIG_CLASSIC_RCU */ + + __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, + &per_cpu(rcu_data, cpu)); +#ifdef CONFIG_CLASSIC_RCU + __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, + &per_cpu(rcu_bh_data, cpu)); + put_cpu_var(rcu_bh_data); +#endif /* #ifdef CONFIG_CLASSIC_RCU */ + put_cpu_var(rcu_data); + rcu_offline_cpu_rt(cpu); +} + +#else + +static void rcu_offline_cpu(int cpu) +{ +} + +#endif + +/* + * This does the RCU processing work from softirq context. + */ +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + unsigned long flags; + + if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { + *rdp->donetail = rdp->curlist; + rdp->donetail = rdp->curtail; + rdp->curlist = NULL; + rdp->curtail = &rdp->curlist; + } + + if (rdp->nxtlist && !rdp->curlist) { + local_irq_save(flags); + rdp->curlist = rdp->nxtlist; + rdp->curtail = rdp->nxttail; + rdp->nxtlist = NULL; + rdp->nxttail = &rdp->nxtlist; + local_irq_restore(flags); + + /* + * start the next batch of callbacks + */ + + /* determine batch number */ + rdp->batch = rcp->cur + 1; + /* see the comment and corresponding wmb() in + * the rcu_start_batch() + */ + smp_rmb(); + + if (!rcp->next_pending) { + /* and start it/schedule start if it's a new batch */ + spin_lock(&rcp->lock); + rcp->next_pending = 1; + rcu_start_batch(rcp); + spin_unlock(&rcp->lock); + } + } + + rcu_check_quiescent_state(rcp, rdp); + if (rdp->donelist) + rcu_do_batch(rdp); +} + +void rcu_process_callbacks(struct softirq_action *unused) +{ + __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); + __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); + rcu_process_callbacks_rt(unused); +} + +static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +{ + /* This cpu has pending rcu entries and the grace period + * for them has completed. + */ + if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) + return 1; + + /* This cpu has no pending entries, but there are new entries */ + if (!rdp->curlist && rdp->nxtlist) + return 1; + + /* This cpu has finished callbacks to invoke */ + if (rdp->donelist) + return 1; + + /* The rcu core waits for a quiescent state from the cpu */ + if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) + return 1; + + /* nothing to do */ + return 0; +} + +/* + * Check to see if there is any immediate RCU-related work to be done + * by the current CPU, returning 1 if so. This function is part of the + * RCU implementation; it is -not- an exported member of the RCU API. + */ +int notrace rcu_pending(int cpu) +{ + return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || + __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)) || + rcu_pending_rt(cpu); +} + +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. + */ +int rcu_needs_cpu(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); + + return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu) || + rcu_needs_cpu_rt(cpu)); +} + +void rcu_advance_callbacks(int cpu, int user) +{ + if (user || + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + rcu_qsctr_inc(cpu); + rcu_bh_qsctr_inc(cpu); + } else if (!in_softirq()) + rcu_bh_qsctr_inc(cpu); +} + +void rcu_check_callbacks(int cpu, int user) +{ + if (user || + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + rcu_qsctr_inc(cpu); + rcu_bh_qsctr_inc(cpu); + } else if (!in_softirq()) + rcu_bh_qsctr_inc(cpu); + rcu_check_callbacks_rt(cpu, user); + raise_softirq(RCU_SOFTIRQ); +} + +static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + memset(rdp, 0, sizeof(*rdp)); + rdp->curtail = &rdp->curlist; + rdp->nxttail = &rdp->nxtlist; + rdp->donetail = &rdp->donelist; + rdp->quiescbatch = rcp->completed; + rdp->qs_pending = 0; + rdp->cpu = cpu; + rdp->blimit = blimit; +} + +static void __devinit rcu_online_cpu(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); + + rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); + rdp->passed_quiesc = &per_cpu(rcu_data_passed_quiesc, cpu); + rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); + bh_rdp->passed_quiesc = &per_cpu(rcu_data_bh_passed_quiesc, cpu); + rcu_online_cpu_rt(cpu); +} + +static int __cpuinit rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + rcu_online_cpu(cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + rcu_offline_cpu(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata rcu_nb = { + .notifier_call = rcu_cpu_notify, +}; + +/* + * Initializes rcu mechanism. Assumed to be called early. + * That is before local timer(SMP) or jiffie timer (uniproc) is setup. + * Note that rcu_qsctr and friends are implicitly + * initialized due to the choice of ``0'' for RCU_CTR_INVALID. + */ +void __init rcu_init(void) +{ + rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + /* Register notifier for non-boot CPUs */ + register_cpu_notifier(&rcu_nb); + rcu_init_rt(); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); +} + +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); Index: linux-2.6.24-rc8-rt1/include/linux/interrupt.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/interrupt.h 2008-01-16 22:27:42.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/interrupt.h 2008-01-16 22:28:36.000000000 -0500 @@ -50,10 +50,12 @@ #define IRQF_SAMPLE_RANDOM 0x00000040 #define IRQF_SHARED 0x00000080 #define IRQF_PROBE_SHARED 0x00000100 -#define IRQF_TIMER 0x00000200 +#define __IRQF_TIMER 0x00000200 #define IRQF_PERCPU 0x00000400 #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 +#define IRQF_NODELAY 0x00002000 +#define IRQF_TIMER (__IRQF_TIMER | IRQF_NODELAY) typedef irqreturn_t (*irq_handler_t)(int, void *); @@ -65,7 +67,7 @@ struct irqaction { void *dev_id; struct irqaction *next; int irq; - struct proc_dir_entry *dir; + struct proc_dir_entry *dir, *threaded; }; extern irqreturn_t no_action(int cpl, void *dev_id); @@ -95,7 +97,7 @@ extern void devm_free_irq(struct device #ifdef CONFIG_LOCKDEP # define local_irq_enable_in_hardirq() do { } while (0) #else -# define local_irq_enable_in_hardirq() local_irq_enable() +# define local_irq_enable_in_hardirq() local_irq_enable_nort() #endif extern void disable_irq_nosync(unsigned int irq); @@ -196,6 +198,7 @@ static inline int disable_irq_wake(unsig #ifndef __ARCH_SET_SOFTIRQ_PENDING #define set_softirq_pending(x) (local_softirq_pending() = (x)) +// FIXME: PREEMPT_RT: set_bit()? #define or_softirq_pending(x) (local_softirq_pending() |= (x)) #endif @@ -256,6 +259,9 @@ enum #ifdef CONFIG_HIGH_RES_TIMERS HRTIMER_SOFTIRQ, #endif + RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ + /* Entries after this are ignored in split softirq mode */ + MAX_SOFTIRQ, }; /* softirq mask and active fields moved to irq_cpustat_t in @@ -271,10 +277,25 @@ struct softirq_action asmlinkage void do_softirq(void); extern void open_softirq(int nr, void (*action)(struct softirq_action*), void *data); extern void softirq_init(void); -#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) +extern void do_softirq_from_hardirq(void); + +#ifdef CONFIG_PREEMPT_HARDIRQS +# define __raise_softirq_irqoff(nr) raise_softirq_irqoff(nr) +# define __do_raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) +#else +# define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) +# define __do_raise_softirq_irqoff(nr) __raise_softirq_irqoff(nr) +#endif + extern void FASTCALL(raise_softirq_irqoff(unsigned int nr)); extern void FASTCALL(raise_softirq(unsigned int nr)); +extern void wakeup_irqd(void); +#ifdef CONFIG_PREEMPT_SOFTIRQS +extern void wait_for_softirq(int softirq); +#else +# define wait_for_softirq(x) do {} while(0) +#endif /* Tasklets --- multithreaded analogue of BHs. @@ -289,8 +310,9 @@ extern void FASTCALL(raise_softirq(unsig to be executed on some cpu at least once after this. * If the tasklet is already scheduled, but its excecution is still not started, it will be executed only once. - * If this tasklet is already running on another CPU (or schedule is called - from tasklet itself), it is rescheduled for later. + * If this tasklet is already running on another CPU, it is rescheduled + for later. + * Schedule must not be called from the tasklet itself (a lockup occurs) * Tasklet is strictly serialized wrt itself, but not wrt another tasklets. If client needs some intertask synchronization, he makes it with spinlocks. @@ -315,29 +337,38 @@ struct tasklet_struct name = { NULL, 0, enum { TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ - TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ + TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */ + TASKLET_STATE_PENDING /* Tasklet is pending */ }; -#ifdef CONFIG_SMP +#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED) +#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN) +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING) + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) static inline int tasklet_trylock(struct tasklet_struct *t) { return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); } +static inline int tasklet_tryunlock(struct tasklet_struct *t) +{ + return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN; +} + static inline void tasklet_unlock(struct tasklet_struct *t) { smp_mb__before_clear_bit(); clear_bit(TASKLET_STATE_RUN, &(t)->state); } -static inline void tasklet_unlock_wait(struct tasklet_struct *t) -{ - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } -} +extern void tasklet_unlock_wait(struct tasklet_struct *t); + #else -#define tasklet_trylock(t) 1 -#define tasklet_unlock_wait(t) do { } while (0) -#define tasklet_unlock(t) do { } while (0) +# define tasklet_trylock(t) 1 +# define tasklet_tryunlock(t) 1 +# define tasklet_unlock_wait(t) do { } while (0) +# define tasklet_unlock(t) do { } while (0) #endif extern void FASTCALL(__tasklet_schedule(struct tasklet_struct *t)); @@ -370,22 +401,14 @@ static inline void tasklet_disable(struc smp_mb(); } -static inline void tasklet_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic_dec(); - atomic_dec(&t->count); -} - -static inline void tasklet_hi_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic_dec(); - atomic_dec(&t->count); -} +extern fastcall void tasklet_enable(struct tasklet_struct *t); +extern fastcall void tasklet_hi_enable(struct tasklet_struct *t); extern void tasklet_kill(struct tasklet_struct *t); extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu); extern void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data); +void takeover_tasklets(unsigned int cpu); /* * Autoprobing for irqs: @@ -443,4 +466,37 @@ static inline void init_irq_proc(void) } #endif +#ifdef CONFIG_PREEMPT_RT +# define local_irq_disable_nort() do { } while (0) +# define local_irq_enable_nort() do { } while (0) +# define local_irq_enable_rt() local_irq_enable() +# define local_irq_save_nort(flags) do { local_save_flags(flags); } while (0) +# define local_irq_restore_nort(flags) do { (void)(flags); } while (0) +# define spin_lock_nort(lock) do { } while (0) +# define spin_unlock_nort(lock) do { } while (0) +# define spin_lock_bh_nort(lock) do { } while (0) +# define spin_unlock_bh_nort(lock) do { } while (0) +# define spin_lock_rt(lock) spin_lock(lock) +# define spin_unlock_rt(lock) spin_unlock(lock) +# define smp_processor_id_rt(cpu) (cpu) +# define in_atomic_rt() (!oops_in_progress && \ + (in_atomic() || irqs_disabled())) +# define read_trylock_rt(lock) ({read_lock(lock); 1; }) +#else +# define local_irq_disable_nort() local_irq_disable() +# define local_irq_enable_nort() local_irq_enable() +# define local_irq_enable_rt() do { } while (0) +# define local_irq_save_nort(flags) local_irq_save(flags) +# define local_irq_restore_nort(flags) local_irq_restore(flags) +# define spin_lock_rt(lock) do { } while (0) +# define spin_unlock_rt(lock) do { } while (0) +# define spin_lock_nort(lock) spin_lock(lock) +# define spin_unlock_nort(lock) spin_unlock(lock) +# define spin_lock_bh_nort(lock) spin_lock_bh(lock) +# define spin_unlock_bh_nort(lock) spin_unlock_bh(lock) +# define smp_processor_id_rt(cpu) smp_processor_id() +# define in_atomic_rt() 0 +# define read_trylock_rt(lock) read_trylock(lock) +#endif + #endif Index: linux-2.6.24-rc8-rt1/include/linux/rcupreempt.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/include/linux/rcupreempt.h 2008-01-16 22:29:29.000000000 -0500 @@ -0,0 +1,138 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (RT implementation) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2006 + * + * Author: Paul McKenney + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * Papers: + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + * + */ + +#ifndef __LINUX_RCUPREEMPT_H +#define __LINUX_RCUPREEMPT_H + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PREEMPT_RCU_BOOST +/* + * Task state with respect to being RCU-boosted. This state is changed + * by the task itself in response to the following three events: + * 1. Preemption (or block on lock) while in RCU read-side critical section. + * 2. Outermost rcu_read_unlock() for blocked RCU read-side critical section. + * + * The RCU-boost task also updates the state when boosting priority. + */ +enum rcu_boost_state { + RCU_BOOST_IDLE = 0, /* Not yet blocked if in RCU read-side. */ + RCU_BOOST_BLOCKED = 1, /* Blocked from RCU read-side. */ + RCU_BOOSTED = 2, /* Boosting complete. */ + RCU_BOOST_INVALID = 3, /* For bogus state sightings. */ +}; + +#define N_RCU_BOOST_STATE (RCU_BOOST_INVALID + 1) + +int __init rcu_preempt_boost_init(void); + +#else /* CONFIG_PREEPMT_RCU_BOOST */ + +#define rcu_preempt_boost_init() do { } while (0) + +#endif /* CONFIG_PREEMPT_RCU_BOOST */ + +/* + * Someone might want to pass call_rcu_bh as a function pointer. + * So this needs to just be a rename and not a macro function. + * (no parentheses) + */ +#define call_rcu_bh call_rcu_preempt +#define rcu_bh_qsctr_inc(cpu) do { } while (0) +#define __rcu_read_lock_bh() { rcu_read_lock(); local_bh_disable(); } +#define __rcu_read_unlock_bh() { local_bh_enable(); rcu_read_unlock(); } +#define __rcu_read_lock_nesting() (current->rcu_read_lock_nesting) + +extern void FASTCALL(call_rcu_classic(struct rcu_head *head, + void (*func)(struct rcu_head *head))); +extern void FASTCALL(call_rcu_preempt(struct rcu_head *head, + void (*func)(struct rcu_head *head))); +extern void __rcu_read_lock(void); +extern void __rcu_read_unlock(void); +extern void __synchronize_sched(void); +extern void rcu_advance_callbacks_rt(int cpu, int user); +extern void rcu_check_callbacks_rt(int cpu, int user); +extern void rcu_init_rt(void); +extern int rcu_needs_cpu_rt(int cpu); +extern void rcu_offline_cpu_rt(int cpu); +extern void rcu_online_cpu_rt(int cpu); +extern int rcu_pending_rt(int cpu); +struct softirq_action; +extern void rcu_process_callbacks_rt(struct softirq_action *unused); + +struct softirq_action; + +#ifdef CONFIG_NO_HZ +DECLARE_PER_CPU(long, dynticks_progress_counter); + +static inline void rcu_enter_nohz(void) +{ + __get_cpu_var(dynticks_progress_counter)++; + if (unlikely(__get_cpu_var(dynticks_progress_counter) & 0x1)) { + printk("BUG: bad accounting of dynamic ticks\n"); + printk(" will try to fix, but it is best to reboot\n"); + WARN_ON(1); + /* try to fix it */ + __get_cpu_var(dynticks_progress_counter)++; + } + mb(); +} + +static inline void rcu_exit_nohz(void) +{ + mb(); + __get_cpu_var(dynticks_progress_counter)++; + if (unlikely(!(__get_cpu_var(dynticks_progress_counter) & 0x1))) { + printk("BUG: bad accounting of dynamic ticks\n"); + printk(" will try to fix, but it is best to reboot\n"); + WARN_ON(1); + /* try to fix it */ + __get_cpu_var(dynticks_progress_counter)++; + } +} + +#else /* CONFIG_NO_HZ */ +#define rcu_enter_nohz() do { } while (0) +#define rcu_exit_nohz() do { } while (0) +#endif /* CONFIG_NO_HZ */ + +extern void rcu_process_callbacks(struct softirq_action *unused); + +#endif /* __KERNEL__ */ +#endif /* __LINUX_RCUPREEMPT_H */ Index: linux-2.6.24-rc8-rt1/include/linux/rcupreempt_trace.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/include/linux/rcupreempt_trace.h 2008-01-16 22:29:29.000000000 -0500 @@ -0,0 +1,140 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (RT implementation) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2006 + * + * Author: Paul McKenney + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * Papers: + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) + * + * For detailed explanation of Read-Copy Update mechanism see - + * http://lse.sourceforge.net/locking/rcupdate.html + * + */ + +#ifndef __LINUX_RCUPREEMPT_TRACE_H +#define __LINUX_RCUPREEMPT_TRACE_H + +#ifdef __KERNEL__ +#include +#include + +#include + +/* + * PREEMPT_RCU data structures. + */ + +struct rcupreempt_trace { + long next_length; + long next_add; + long wait_length; + long wait_add; + long done_length; + long done_add; + long done_remove; + atomic_t done_invoked; + long rcu_check_callbacks; + atomic_t rcu_try_flip_1; + atomic_t rcu_try_flip_e1; + long rcu_try_flip_i1; + long rcu_try_flip_ie1; + long rcu_try_flip_g1; + long rcu_try_flip_a1; + long rcu_try_flip_ae1; + long rcu_try_flip_a2; + long rcu_try_flip_z1; + long rcu_try_flip_ze1; + long rcu_try_flip_z2; + long rcu_try_flip_m1; + long rcu_try_flip_me1; + long rcu_try_flip_m2; +}; + +struct rcupreempt_probe_data { + const char *name; + const char *format; + marker_probe_func *probe_func; +}; + +#define DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_worker) \ +void rcupreempt_trace_worker##_callback(const struct marker *mdata, \ + void *private_data, const char *format, ...) \ +{ \ + struct rcupreempt_trace *trace; \ + trace = (&per_cpu(trace_data, smp_processor_id())); \ + rcupreempt_trace_worker(trace); \ +} + +#define INIT_RCUPREEMPT_PROBE(rcupreempt_trace_worker) \ +{ \ + .name = __stringify(rcupreempt_trace_worker), \ + .probe_func = rcupreempt_trace_worker##_callback \ +} + +extern int *rcupreempt_flipctr(int cpu); +extern long rcupreempt_data_completed(void); +extern int rcupreempt_flip_flag(int cpu); +extern int rcupreempt_mb_flag(int cpu); +extern char *rcupreempt_try_flip_state_name(void); + +#ifdef CONFIG_PREEMPT_RCU_BOOST +struct preempt_rcu_boost_trace { + unsigned long rbs_stat_task_boost_called; + unsigned long rbs_stat_task_boosted; + unsigned long rbs_stat_boost_called; + unsigned long rbs_stat_try_boost; + unsigned long rbs_stat_boosted; + unsigned long rbs_stat_unboost_called; + unsigned long rbs_stat_unboosted; + unsigned long rbs_stat_try_boost_readers; + unsigned long rbs_stat_boost_readers; + unsigned long rbs_stat_try_unboost_readers; + unsigned long rbs_stat_unboost_readers; + unsigned long rbs_stat_over_taken; +}; + +#define DEFINE_PREEMPT_RCU_BOOST_MARKER_HANDLER(preempt_rcu_boost_var) \ +void preempt_rcu_boost_var##_callback(const struct marker *mdata, \ + void *private_data, const char *format, ...) \ +{ \ + struct preempt_rcu_boost_trace *boost_trace; \ + boost_trace = (&per_cpu(boost_trace_data, smp_processor_id())); \ + boost_trace->rbs_stat_##preempt_rcu_boost_var++; \ +} + +struct preempt_rcu_boost_probe { + const char *name; + const char *format; + marker_probe_func *probe_func; +}; + +#define INIT_PREEMPT_RCU_BOOST_PROBE(preempt_rcu_boost_probe_worker) \ +{ \ + .name = __stringify(preempt_rcu_boost_probe_worker), \ + .probe_func = preempt_rcu_boost_probe_worker##_callback \ +} + +extern int read_rcu_boost_prio(void); +#endif /* CONFIG_PREEMPT_RCU_BOOST */ + +#endif /* __KERNEL__ */ +#endif /* __LINUX_RCUPREEMPT_TRACE_H */ Index: linux-2.6.24-rc8-rt1/kernel/Kconfig.preempt =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/Kconfig.preempt 2008-01-16 22:27:42.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/Kconfig.preempt 2008-01-16 22:29:29.000000000 -0500 @@ -1,14 +1,13 @@ - choice - prompt "Preemption Model" - default PREEMPT_NONE + prompt "Preemption Mode" + default PREEMPT_RT config PREEMPT_NONE bool "No Forced Preemption (Server)" help - This is the traditional Linux preemption model, geared towards + This is the traditional Linux preemption model geared towards throughput. It will still provide good latencies most of the - time, but there are no guarantees and occasional longer delays + time but there are no guarantees and occasional long delays are possible. Select this option if you are building a kernel for a server or @@ -21,7 +20,7 @@ config PREEMPT_VOLUNTARY help This option reduces the latency of the kernel by adding more "explicit preemption points" to the kernel code. These new - preemption points have been selected to reduce the maximum + preemption points have been selected to minimize the maximum latency of rescheduling, providing faster application reactions, at the cost of slightly lower throughput. @@ -33,32 +32,166 @@ config PREEMPT_VOLUNTARY Select this if you are building a kernel for a desktop system. -config PREEMPT +config PREEMPT_DESKTOP bool "Preemptible Kernel (Low-Latency Desktop)" help This option reduces the latency of the kernel by making - all kernel code (that is not executing in a critical section) + all kernel code that is not executing in a critical section preemptible. This allows reaction to interactive events by permitting a low priority process to be preempted involuntarily even if it is in kernel mode executing a system call and would - otherwise not be about to reach a natural preemption point. - This allows applications to run more 'smoothly' even when the - system is under load, at the cost of slightly lower throughput - and a slight runtime overhead to kernel code. + otherwise not about to reach a preemption point. This allows + applications to run more 'smoothly' even when the system is + under load, at the cost of slighly lower throughput and a + slight runtime overhead to kernel code. + + (According to profiles, when this mode is selected then even + during kernel-intense workloads the system is in an immediately + preemptible state more than 50% of the time.) Select this if you are building a kernel for a desktop or embedded system with latency requirements in the milliseconds range. +config PREEMPT_RT + bool "Complete Preemption (Real-Time)" + select PREEMPT_SOFTIRQS + select PREEMPT_HARDIRQS + select PREEMPT_RCU + select RT_MUTEXES + help + This option further reduces the scheduling latency of the + kernel by replacing almost every spinlock used by the kernel + with preemptible mutexes and thus making all but the most + critical kernel code involuntarily preemptible. The remaining + handful of lowlevel non-preemptible codepaths are short and + have a deterministic latency of a couple of tens of + microseconds (depending on the hardware). This also allows + applications to run more 'smoothly' even when the system is + under load, at the cost of lower throughput and runtime + overhead to kernel code. + + (According to profiles, when this mode is selected then even + during kernel-intense workloads the system is in an immediately + preemptible state more than 95% of the time.) + + Select this if you are building a kernel for a desktop, + embedded or real-time system with guaranteed latency + requirements of 100 usecs or lower. + endchoice +config PREEMPT + bool + default y + depends on PREEMPT_DESKTOP || PREEMPT_RT + +config PREEMPT_SOFTIRQS + bool "Thread Softirqs" + default n +# depends on PREEMPT + help + This option reduces the latency of the kernel by 'threading' + soft interrupts. This means that all softirqs will execute + in softirqd's context. While this helps latency, it can also + reduce performance. + + The threading of softirqs can also be controlled via + /proc/sys/kernel/softirq_preemption runtime flag and the + sofirq-preempt=0/1 boot-time option. + + Say N if you are unsure. + +config PREEMPT_HARDIRQS + bool "Thread Hardirqs" + default n + depends on !GENERIC_HARDIRQS_NO__DO_IRQ + select PREEMPT_SOFTIRQS + help + This option reduces the latency of the kernel by 'threading' + hardirqs. This means that all (or selected) hardirqs will run + in their own kernel thread context. While this helps latency, + this feature can also reduce performance. + + The threading of hardirqs can also be controlled via the + /proc/sys/kernel/hardirq_preemption runtime flag and the + hardirq-preempt=0/1 boot-time option. Per-irq threading can + be enabled/disable via the /proc/irq///threaded + runtime flags. + + Say N if you are unsure. + config PREEMPT_BKL - bool "Preempt The Big Kernel Lock" - depends on SMP || PREEMPT + bool + depends on PREEMPT_RT || !SPINLOCK_BKL + default n if !PREEMPT default y + +choice + prompt "RCU implementation type:" + default PREEMPT_RCU if PREEMPT_RT + default CLASSIC_RCU + +config CLASSIC_RCU + bool "Classic RCU" + depends on !PREEMPT_RT + help + This option selects the classic RCU implementation that is + designed for best read-side performance on non-realtime + systems. + + Say Y if you are unsure. + +config PREEMPT_RCU + bool "Preemptible RCU" + depends on PREEMPT + help + This option reduces the latency of the kernel by making certain + RCU sections preemptible. Normally RCU code is non-preemptible, if + this option is selected then read-only RCU sections become + preemptible. This helps latency, but may expose bugs due to + now-naive assumptions about each RCU read-side critical section + remaining on a given CPU through its execution. + + Say N if you are unsure. + +endchoice + +config PREEMPT_RCU_BOOST + bool "Enable priority boosting of RCU read-side critical sections" + depends on PREEMPT_RCU + default y if PREEMPT_RT + help + This option permits priority boosting of RCU read-side critical + sections tat have been preempted and a RT process is waiting + on a synchronize_rcu. + + An RCU thread is also created that periodically wakes up and + performs a synchronize_rcu to make sure that all readers eventually + do complete to prevent an indefinite delay of grace periods and + possible OOM problems. + +config RCU_TRACE + tristate "Enable tracing for RCU - currently stats in debugfs" + select DEBUG_FS + select MARKERS + default m + help + This option provides tracing in RCU which presents stats + in debugfs for debugging RCU implementation. + + Say Y/M here if you want to enable RCU tracing in-kernel/module. + Say N if you are unsure. + +config SPINLOCK_BKL + bool "Old-Style Big Kernel Lock" + depends on (PREEMPT || SMP) && !PREEMPT_RT + default n help - This option reduces the latency of the kernel by making the - big kernel lock preemptible. + This option increases the latency of the kernel by making the + big kernel lock spinlock-based (which is bad for latency). + However, enable this option if you see any problems to revert + back to the traditional spinlock BKL design. Say Y here if you are building a kernel for a desktop system. Say N if you are unsure. Index: linux-2.6.24-rc8-rt1/kernel/rcupreempt.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/kernel/rcupreempt.c 2008-01-16 22:29:29.000000000 -0500 @@ -0,0 +1,1008 @@ +/* + * Read-Copy Update mechanism for mutual exclusion, realtime implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2006 + * + * Authors: Paul E. McKenney + * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar + * for pushing me away from locks and towards counters, and + * to Suparna Bhattacharya for pushing me completely away + * from atomic instructions on the read side. + * + * - Added handling of Dynamic Ticks + * Copyright 2007 - Paul E. Mckenney + * - Steven Rostedt + * + * Papers: http://www.rdrop.com/users/paulmck/RCU + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * PREEMPT_RCU data structures. + */ + +#define GP_STAGES 2 +struct rcu_data { + raw_spinlock_t lock; /* Protect rcu_data fields. */ + long completed; /* Number of last completed batch. */ + int waitlistcount; + struct rcu_head *nextlist; + struct rcu_head **nexttail; + struct rcu_head *waitlist[GP_STAGES]; + struct rcu_head **waittail[GP_STAGES]; + struct rcu_head *donelist; + struct rcu_head **donetail; +}; +struct rcu_ctrlblk { + raw_spinlock_t fliplock; /* Protect state-machine transitions. */ + long completed; /* Number of last completed batch. */ +}; +static DEFINE_PER_CPU(struct rcu_data, rcu_data); +static struct rcu_ctrlblk rcu_ctrlblk = { + .fliplock = RAW_SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), + .completed = 0, +}; +static DEFINE_PER_CPU(int [2], rcu_flipctr) = { 0, 0 }; + +/* + * States for rcu_try_flip() and friends. + */ + +enum rcu_try_flip_states { + rcu_try_flip_idle_state, /* "I" */ + rcu_try_flip_waitack_state, /* "A" */ + rcu_try_flip_waitzero_state, /* "Z" */ + rcu_try_flip_waitmb_state /* "M" */ +}; +static enum rcu_try_flip_states rcu_try_flip_state = rcu_try_flip_idle_state; +static char *rcu_try_flip_state_names[] = + { "idle", "waitack", "waitzero", "waitmb" }; + +/* + * Enum and per-CPU flag to determine when each CPU has seen + * the most recent counter flip. + */ + +enum rcu_flip_flag_values { + rcu_flip_seen, /* Steady/initial state, last flip seen. */ + /* Only GP detector can update. */ + rcu_flipped /* Flip just completed, need confirmation. */ + /* Only corresponding CPU can update. */ +}; +static DEFINE_PER_CPU(enum rcu_flip_flag_values, rcu_flip_flag) = rcu_flip_seen; + +/* + * Enum and per-CPU flag to determine when each CPU has executed the + * needed memory barrier to fence in memory references from its last RCU + * read-side critical section in the just-completed grace period. + */ + +enum rcu_mb_flag_values { + rcu_mb_done, /* Steady/initial state, no mb()s required. */ + /* Only GP detector can update. */ + rcu_mb_needed /* Flip just completed, need an mb(). */ + /* Only corresponding CPU can update. */ +}; +static DEFINE_PER_CPU(enum rcu_mb_flag_values, rcu_mb_flag) = rcu_mb_done; + +static cpumask_t rcu_cpu_online_map = CPU_MASK_NONE; + +/* + * Macro that prevents the compiler from reordering accesses, but does + * absolutely -nothing- to prevent CPUs from reordering. This is used + * only to mediate communication between mainline code and hardware + * interrupt and NMI handlers. + */ +#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) + +/* + * RCU_DATA_ME: find the current CPU's rcu_data structure. + * RCU_DATA_CPU: find the specified CPU's rcu_data structure. + */ +#define RCU_DATA_ME() (&__get_cpu_var(rcu_data)) +#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu)) + +/* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. + */ +long rcu_batches_completed(void) +{ + return rcu_ctrlblk.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Return the number of RCU batches processed thus far. Useful for debug + * and statistics. The _bh variant is identical to straight RCU. + */ +long rcu_batches_completed_bh(void) +{ + return rcu_ctrlblk.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); + +void __rcu_read_lock(void) +{ + int idx; + struct task_struct *me = current; + int nesting; + + nesting = ACCESS_ONCE(me->rcu_read_lock_nesting); + if (nesting != 0) { + + /* An earlier rcu_read_lock() covers us, just count it. */ + + me->rcu_read_lock_nesting = nesting + 1; + + } else { + unsigned long oldirq; + + /* + * Disable local interrupts to prevent the grace-period + * detection state machine from seeing us half-done. + * NMIs can still occur, of course, and might themselves + * contain rcu_read_lock(). + */ + + local_irq_save(oldirq); + + /* + * Outermost nesting of rcu_read_lock(), so increment + * the current counter for the current CPU. Use volatile + * casts to prevent the compiler from reordering. + */ + + idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1; + smp_read_barrier_depends(); /* @@@@ might be unneeded */ + ACCESS_ONCE(__get_cpu_var(rcu_flipctr)[idx])++; + + /* + * Now that the per-CPU counter has been incremented, we + * are protected from races with rcu_read_lock() invoked + * from NMI handlers on this CPU. We can therefore safely + * increment the nesting counter, relieving further NMIs + * of the need to increment the per-CPU counter. + */ + + ACCESS_ONCE(me->rcu_read_lock_nesting) = nesting + 1; + + /* + * Now that we have preventing any NMIs from storing + * to the ->rcu_flipctr_idx, we can safely use it to + * remember which counter to decrement in the matching + * rcu_read_unlock(). + */ + + ACCESS_ONCE(me->rcu_flipctr_idx) = idx; + local_irq_restore(oldirq); + } +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +void __rcu_read_unlock(void) +{ + int idx; + struct task_struct *me = current; + int nesting; + + nesting = ACCESS_ONCE(me->rcu_read_lock_nesting); + if (nesting > 1) { + + /* + * We are still protected by the enclosing rcu_read_lock(), + * so simply decrement the counter. + */ + + me->rcu_read_lock_nesting = nesting - 1; + + } else { + unsigned long oldirq; + + /* + * Disable local interrupts to prevent the grace-period + * detection state machine from seeing us half-done. + * NMIs can still occur, of course, and might themselves + * contain rcu_read_lock() and rcu_read_unlock(). + */ + + local_irq_save(oldirq); + + /* + * Outermost nesting of rcu_read_unlock(), so we must + * decrement the current counter for the current CPU. + * This must be done carefully, because NMIs can + * occur at any point in this code, and any rcu_read_lock() + * and rcu_read_unlock() pairs in the NMI handlers + * must interact non-destructively with this code. + * Lots of volatile casts, and -very- careful ordering. + * + * Changes to this code, including this one, must be + * inspected, validated, and tested extremely carefully!!! + */ + + /* + * First, pick up the index. Enforce ordering for + * DEC Alpha. + */ + + idx = ACCESS_ONCE(me->rcu_flipctr_idx); + smp_read_barrier_depends(); /* @@@ Needed??? */ + + /* + * Now that we have fetched the counter index, it is + * safe to decrement the per-task RCU nesting counter. + * After this, any interrupts or NMIs will increment and + * decrement the per-CPU counters. + */ + ACCESS_ONCE(me->rcu_read_lock_nesting) = nesting - 1; + + /* + * It is now safe to decrement this task's nesting count. + * NMIs that occur after this statement will route their + * rcu_read_lock() calls through this "else" clause, and + * will thus start incrementing the per-CPU coutner on + * their own. They will also clobber ->rcu_flipctr_idx, + * but that is OK, since we have already fetched it. + */ + + ACCESS_ONCE(__get_cpu_var(rcu_flipctr)[idx])--; + + local_irq_restore(oldirq); + + __rcu_preempt_unboost(); + } +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +/* + * If a global counter flip has occurred since the last time that we + * advanced callbacks, advance them. Hardware interrupts must be + * disabled when calling this function. + */ +static void __rcu_advance_callbacks(struct rcu_data *rdp) +{ + int cpu; + int i; + int wlc = 0; + + if (rdp->completed != rcu_ctrlblk.completed) { + if (rdp->waitlist[GP_STAGES - 1] != NULL) { + *rdp->donetail = rdp->waitlist[GP_STAGES - 1]; + rdp->donetail = rdp->waittail[GP_STAGES - 1]; + trace_mark(rcupreempt_trace_move2done, "NULL"); + } + for (i = GP_STAGES - 2; i >= 0; i--) { + if (rdp->waitlist[i] != NULL) { + rdp->waitlist[i + 1] = rdp->waitlist[i]; + rdp->waittail[i + 1] = rdp->waittail[i]; + wlc++; + } else { + rdp->waitlist[i + 1] = NULL; + rdp->waittail[i + 1] = + &rdp->waitlist[i + 1]; + } + } + if (rdp->nextlist != NULL) { + rdp->waitlist[0] = rdp->nextlist; + rdp->waittail[0] = rdp->nexttail; + wlc++; + rdp->nextlist = NULL; + rdp->nexttail = &rdp->nextlist; + trace_mark(rcupreempt_trace_move2wait, "NULL"); + } else { + rdp->waitlist[0] = NULL; + rdp->waittail[0] = &rdp->waitlist[0]; + } + rdp->waitlistcount = wlc; + rdp->completed = rcu_ctrlblk.completed; + } + + /* + * Check to see if this CPU needs to report that it has seen + * the most recent counter flip, thereby declaring that all + * subsequent rcu_read_lock() invocations will respect this flip. + */ + + cpu = raw_smp_processor_id(); + if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { + smp_mb(); /* Subsequent counter accesses must see new value */ + per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; + smp_mb(); /* Subsequent RCU read-side critical sections */ + /* seen -after- acknowledgement. */ + } +} + +#ifdef CONFIG_NO_HZ + +DEFINE_PER_CPU(long, dynticks_progress_counter) = 1; +static DEFINE_PER_CPU(long, rcu_dyntick_snapshot); +static DEFINE_PER_CPU(int, rcu_update_flag); + +/** + * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. + * + * If the CPU was idle with dynamic ticks active, this updates the + * dynticks_progress_counter to let the RCU handling know that the + * CPU is active. + */ +void rcu_irq_enter(void) +{ + int cpu = smp_processor_id(); + + if (per_cpu(rcu_update_flag, cpu)) + per_cpu(rcu_update_flag, cpu)++; + + /* + * Only update if we are coming from a stopped ticks mode + * (dynticks_progress_counter is even). + */ + if (!in_interrupt() && (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) { + /* + * The following might seem like we could have a race + * with NMI/SMIs. But this really isn't a problem. + * Here we do a read/modify/write, and the race happens + * when an NMI/SMI comes in after the read and before + * the write. But NMI/SMIs will increment this counter + * twice before returning, so the zero bit will not + * be corrupted by the NMI/SMI which is the most important + * part. + * + * The only thing is that we would bring back the counter + * to a postion that it was in during the NMI/SMI. + * But the zero bit would be set, so the rest of the + * counter would again be ignored. + * + * On return from the IRQ, the counter may have the zero + * bit be 0 and the counter the same as the return from + * the NMI/SMI. If the state machine was so unlucky to + * see that, it still doesn't matter, since all + * RCU read-side critical sections on this CPU would + * have already completed. + */ + per_cpu(dynticks_progress_counter, cpu)++; + /* + * The following memory barrier ensures that any + * rcu_read_lock() primitives in the irq handler + * are seen by other CPUs to follow the above + * increment to dynticks_progress_counter. This is + * required in order for other CPUs to correctly + * determine when it is safe to advance the RCU + * grace-period state machine. + */ + smp_mb(); /* see above block comment. */ + /* + * Since we can't determine the dynamic tick mode from + * the dynticks_progress_counter after this routine, + * we use a second flag to acknowledge that we came + * from an idle state with ticks stopped. + */ + per_cpu(rcu_update_flag, cpu)++; + /* + * If we take an NMI/SMI now, they will also increment + * the rcu_update_flag, and will not update the + * dynticks_progress_counter on exit. That is for + * this IRQ to do. + */ + } +} + +/** + * rcu_irq_exit - Called from exiting Hard irq context. + * + * If the CPU was idle with dynamic ticks active, update the + * dynticks_progress_counter to put let the RCU handling be + * aware that the CPU is going back to idle with no ticks. + */ +void rcu_irq_exit(void) +{ + int cpu = smp_processor_id(); + + /* + * rcu_update_flag is set if we interrupted the CPU + * when it was idle with ticks stopped. + * Once this occurs, we keep track of interrupt nesting + * because a NMI/SMI could also come in, and we still + * only want the IRQ that started the increment of the + * dynticks_progress_counter to be the one that modifies + * it on exit. + */ + if (per_cpu(rcu_update_flag, cpu)) { + if (--per_cpu(rcu_update_flag, cpu)) + return; + + /* This must match the interrupt nesting */ + WARN_ON(in_interrupt()); + + /* + * If an NMI/SMI happens now we are still + * protected by the dynticks_progress_counter being odd. + */ + + /* + * The following memory barrier ensures that any + * rcu_read_unlock() primitives in the irq handler + * are seen by other CPUs to preceed the following + * increment to dynticks_progress_counter. This + * is required in order for other CPUs to determine + * when it is safe to advance the RCU grace-period + * state machine. + */ + smp_mb(); /* see above block comment. */ + per_cpu(dynticks_progress_counter, cpu)++; + WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1); + } +} + +static void dyntick_save_progress_counter(int cpu) +{ + per_cpu(rcu_dyntick_snapshot, cpu) = + per_cpu(dynticks_progress_counter, cpu); +} + +static inline int +rcu_try_flip_waitack_needed(int cpu) +{ + long curr; + long snap; + + curr = per_cpu(dynticks_progress_counter, cpu); + snap = per_cpu(rcu_dyntick_snapshot, cpu); + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU remained in dynticks mode for the entire time + * and didn't take any interrupts, NMIs, SMIs, or whatever, + * then it cannot be in the middle of an rcu_read_lock(), so + * the next rcu_read_lock() it executes must use the new value + * of the counter. So we can safely pretend that this CPU + * already acknowledged the counter. + */ + + if ((curr == snap) && ((curr & 0x1) == 0)) + return 0; + + /* + * If the CPU passed through or entered a dynticks idle phase with + * no active irq handlers, then, as above, we can safely pretend + * that this CPU already acknowledged the counter. + */ + + if ((curr - snap) > 2 || (snap & 0x1) == 0) + return 0; + + /* We need this CPU to explicitly acknowledge the counter flip. */ + + return 1; +} + +static inline int +rcu_try_flip_waitmb_needed(int cpu) +{ + long curr; + long snap; + + curr = per_cpu(dynticks_progress_counter, cpu); + snap = per_cpu(rcu_dyntick_snapshot, cpu); + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU remained in dynticks mode for the entire time + * and didn't take any interrupts, NMIs, SMIs, or whatever, + * then it cannot have executed an RCU read-side critical section + * during that time, so there is no need for it to execute a + * memory barrier. + */ + + if ((curr == snap) && ((curr & 0x1) == 0)) + return 0; + + /* + * If the CPU either entered or exited an outermost interrupt, + * SMI, NMI, or whatever handler, then we know that it executed + * a memory barrier when doing so. So we don't need another one. + */ + if (curr != snap) + return 0; + + /* We need the CPU to execute a memory barrier. */ + + return 1; +} + +#else /* !CONFIG_NO_HZ */ + +# define dyntick_save_progress_counter(cpu) do { } while (0) +# define rcu_try_flip_waitack_needed(cpu) (1) +# define rcu_try_flip_waitmb_needed(cpu) (1) + +#endif /* CONFIG_NO_HZ */ + +/* + * Get here when RCU is idle. Decide whether we need to + * move out of idle state, and return non-zero if so. + * "Straightforward" approach for the moment, might later + * use callback-list lengths, grace-period duration, or + * some such to determine when to exit idle state. + * Might also need a pre-idle test that does not acquire + * the lock, but let's get the simple case working first... + */ + +static int +rcu_try_flip_idle(void) +{ + int cpu; + + trace_mark(rcupreempt_trace_try_flip_i1, "NULL"); + if (!rcu_pending(smp_processor_id())) { + trace_mark(rcupreempt_trace_try_flip_ie1, "NULL"); + return 0; + } + + /* + * Do the flip. + */ + + trace_mark(rcupreempt_trace_try_flip_g1, "NULL"); + rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */ + + /* + * Need a memory barrier so that other CPUs see the new + * counter value before they see the subsequent change of all + * the rcu_flip_flag instances to rcu_flipped. + */ + + smp_mb(); /* see above block comment. */ + + /* Now ask each CPU for acknowledgement of the flip. */ + + for_each_cpu_mask(cpu, rcu_cpu_online_map) { + per_cpu(rcu_flip_flag, cpu) = rcu_flipped; + dyntick_save_progress_counter(cpu); + } + + return 1; +} + +/* + * Wait for CPUs to acknowledge the flip. + */ + +static int +rcu_try_flip_waitack(void) +{ + int cpu; + + trace_mark(rcupreempt_trace_try_flip_a1, "NULL"); + for_each_cpu_mask(cpu, rcu_cpu_online_map) + if (rcu_try_flip_waitack_needed(cpu) && + per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { + trace_mark(rcupreempt_trace_try_flip_ae1, "NULL"); + return 0; + } + + /* + * Make sure our checks above don't bleed into subsequent + * waiting for the sum of the counters to reach zero. + */ + + smp_mb(); /* see above block comment. */ + trace_mark(rcupreempt_trace_try_flip_a2, "NULL"); + return 1; +} + +/* + * Wait for collective ``last'' counter to reach zero, + * then tell all CPUs to do an end-of-grace-period memory barrier. + */ + +static int +rcu_try_flip_waitzero(void) +{ + int cpu; + int lastidx = !(rcu_ctrlblk.completed & 0x1); + int sum = 0; + + /* Check to see if the sum of the "last" counters is zero. */ + + trace_mark(rcupreempt_trace_try_flip_z1, "NULL"); + for_each_possible_cpu(cpu) + sum += per_cpu(rcu_flipctr, cpu)[lastidx]; + if (sum != 0) { + trace_mark(rcupreempt_trace_try_flip_ze1, "NULL"); + return 0; + } + + smp_mb(); /* Don't call for memory barriers before we see zero. */ + + /* Call for a memory barrier from each CPU. */ + + for_each_cpu_mask(cpu, rcu_cpu_online_map) { + per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; + dyntick_save_progress_counter(cpu); + } + + trace_mark(rcupreempt_trace_try_flip_z2, "NULL"); + return 1; +} + +/* + * Wait for all CPUs to do their end-of-grace-period memory barrier. + * Return 0 once all CPUs have done so. + */ + +static int +rcu_try_flip_waitmb(void) +{ + int cpu; + + trace_mark(rcupreempt_trace_try_flip_m1, "NULL"); + for_each_cpu_mask(cpu, rcu_cpu_online_map) + if (rcu_try_flip_waitmb_needed(cpu) && + per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { + trace_mark(rcupreempt_trace_try_flip_me1, "NULL"); + return 0; + } + + smp_mb(); /* Ensure that the above checks precede any following flip. */ + trace_mark(rcupreempt_trace_try_flip_m2, "NULL"); + return 1; +} + +/* + * Attempt a single flip of the counters. Remember, a single flip does + * -not- constitute a grace period. Instead, the interval between + * at least GP_STAGES+2 consecutive flips is a grace period. + * + * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation + * on a large SMP, they might want to use a hierarchical organization of + * the per-CPU-counter pairs. + */ +static void rcu_try_flip(void) +{ + unsigned long oldirq; + + trace_mark(rcupreempt_trace_try_flip_1, "NULL"); + if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, oldirq))) { + trace_mark(rcupreempt_trace_try_flip_e1, "NULL"); + return; + } + + /* + * Take the next transition(s) through the RCU grace-period + * flip-counter state machine. + */ + + switch (rcu_try_flip_state) { + case rcu_try_flip_idle_state: + if (rcu_try_flip_idle()) + rcu_try_flip_state = rcu_try_flip_waitack_state; + break; + case rcu_try_flip_waitack_state: + if (rcu_try_flip_waitack()) + rcu_try_flip_state = rcu_try_flip_waitzero_state; + break; + case rcu_try_flip_waitzero_state: + if (rcu_try_flip_waitzero()) + rcu_try_flip_state = rcu_try_flip_waitmb_state; + break; + case rcu_try_flip_waitmb_state: + if (rcu_try_flip_waitmb()) + rcu_try_flip_state = rcu_try_flip_idle_state; + } + spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq); +} + +/* + * Check to see if this CPU needs to do a memory barrier in order to + * ensure that any prior RCU read-side critical sections have committed + * their counter manipulations and critical-section memory references + * before declaring the grace period to be completed. + */ +static void rcu_check_mb(int cpu) +{ + if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) { + smp_mb(); /* Ensure RCU read-side accesses are visible. */ + per_cpu(rcu_mb_flag, cpu) = rcu_mb_done; + } +} + +void rcu_check_callbacks_rt(int cpu, int user) +{ + unsigned long oldirq; + struct rcu_data *rdp = RCU_DATA_CPU(cpu); + + rcu_check_mb(cpu); + if (rcu_ctrlblk.completed == rdp->completed) + rcu_try_flip(); + spin_lock_irqsave(&rdp->lock, oldirq); + trace_mark(rcupreempt_trace_check_callbacks, "NULL"); + __rcu_advance_callbacks(rdp); + spin_unlock_irqrestore(&rdp->lock, oldirq); +} + +/* + * Needed by dynticks, to make sure all RCU processing has finished + * when we go idle. (Currently unused, needed?) + */ +void rcu_advance_callbacks_rt(int cpu, int user) +{ + unsigned long oldirq; + struct rcu_data *rdp = RCU_DATA_CPU(cpu); + + if (rcu_ctrlblk.completed == rdp->completed) { + rcu_try_flip(); + if (rcu_ctrlblk.completed == rdp->completed) + return; + } + spin_lock_irqsave(&rdp->lock, oldirq); + trace_mark(rcupreempt_trace_check_callbacks, "NULL"); + __rcu_advance_callbacks(rdp); + spin_unlock_irqrestore(&rdp->lock, oldirq); +} + +#ifdef CONFIG_HOTPLUG_CPU + +#define rcu_offline_cpu_rt_enqueue(srclist, srctail, dstlist, dsttail) do { \ + *dsttail = srclist; \ + if (srclist != NULL) { \ + dsttail = srctail; \ + srclist = NULL; \ + srctail = &srclist;\ + } \ + } while (0) + + +void rcu_offline_cpu_rt(int cpu) +{ + int i; + struct rcu_head *list = NULL; + unsigned long oldirq; + struct rcu_data *rdp = RCU_DATA_CPU(cpu); + struct rcu_head **tail = &list; + + /* Remove all callbacks from the newly dead CPU, retaining order. */ + + spin_lock_irqsave(&rdp->lock, oldirq); + rcu_offline_cpu_rt_enqueue(rdp->donelist, rdp->donetail, list, tail); + for (i = GP_STAGES - 1; i >= 0; i--) + rcu_offline_cpu_rt_enqueue(rdp->waitlist[i], rdp->waittail[i], + list, tail); + rcu_offline_cpu_rt_enqueue(rdp->nextlist, rdp->nexttail, list, tail); + spin_unlock_irqrestore(&rdp->lock, oldirq); + rdp->waitlistcount = 0; + + /* Disengage the newly dead CPU from grace-period computation. */ + + spin_lock_irqsave(&rcu_ctrlblk.fliplock, oldirq); + rcu_check_mb(cpu); + if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { + smp_mb(); /* Subsequent counter accesses must see new value */ + per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; + smp_mb(); /* Subsequent RCU read-side critical sections */ + /* seen -after- acknowledgement. */ + } + cpu_clear(cpu, rcu_cpu_online_map); + spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq); + + /* + * Place the removed callbacks on the current CPU's queue. + * Make them all start a new grace period: simple approach, + * in theory could starve a given set of callbacks, but + * you would need to be doing some serious CPU hotplugging + * to make this happen. If this becomes a problem, adding + * a synchronize_rcu() to the hotplug path would be a simple + * fix. + */ + + rdp = RCU_DATA_ME(); + spin_lock_irqsave(&rdp->lock, oldirq); + *rdp->nexttail = list; + if (list) + rdp->nexttail = tail; + spin_unlock_irqrestore(&rdp->lock, oldirq); +} + +void __devinit rcu_online_cpu_rt(int cpu) +{ + unsigned long oldirq; + + spin_lock_irqsave(&rcu_ctrlblk.fliplock, oldirq); + cpu_set(cpu, rcu_cpu_online_map); + spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq); +} + +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + +void rcu_offline_cpu(int cpu) +{ +} + +void __devinit rcu_online_cpu_rt(int cpu) +{ +} + +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ + +void rcu_process_callbacks_rt(struct softirq_action *unused) +{ + unsigned long flags; + struct rcu_head *next, *list; + struct rcu_data *rdp = RCU_DATA_ME(); + + spin_lock_irqsave(&rdp->lock, flags); + list = rdp->donelist; + if (list == NULL) { + spin_unlock_irqrestore(&rdp->lock, flags); + return; + } + rdp->donelist = NULL; + rdp->donetail = &rdp->donelist; + trace_mark(rcupreempt_trace_done_remove, "NULL"); + spin_unlock_irqrestore(&rdp->lock, flags); + while (list) { + next = list->next; + list->func(list); + list = next; + trace_mark(rcupreempt_trace_invoke, "NULL"); + } +} + +void fastcall call_rcu_preempt(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + unsigned long oldirq; + struct rcu_data *rdp; + + head->func = func; + head->next = NULL; + local_irq_save(oldirq); + rdp = RCU_DATA_ME(); + spin_lock(&rdp->lock); + __rcu_advance_callbacks(rdp); + *rdp->nexttail = head; + rdp->nexttail = &head->next; + trace_mark(rcupreempt_trace_next_add, "NULL"); + spin_unlock(&rdp->lock); + local_irq_restore(oldirq); +} +EXPORT_SYMBOL_GPL(call_rcu_preempt); + +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. Assumes that notifiers would take care of handling any + * outstanding requests from the RCU core. + * + * This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. + */ +int rcu_needs_cpu_rt(int cpu) +{ + struct rcu_data *rdp = RCU_DATA_CPU(cpu); + + return (rdp->donelist != NULL || + !!rdp->waitlistcount || + rdp->nextlist != NULL); +} + +int rcu_pending_rt(int cpu) +{ + struct rcu_data *rdp = RCU_DATA_CPU(cpu); + + /* The CPU has at least one callback queued somewhere. */ + + if (rdp->donelist != NULL || + !!rdp->waitlistcount || + rdp->nextlist != NULL) + return 1; + + /* The RCU core needs an acknowledgement from this CPU. */ + + if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) || + (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed)) + return 1; + + /* This CPU has fallen behind the global grace-period number. */ + + if (rdp->completed != rcu_ctrlblk.completed) + return 1; + + /* Nothing needed from this CPU. */ + + return 0; +} + +void __init rcu_init_rt(void) +{ + int cpu; + int i; + struct rcu_data *rdp; + +/*&&&&*/printk(KERN_NOTICE "WARNING: experimental RCU implementation.\n"); + for_each_possible_cpu(cpu) { + rdp = RCU_DATA_CPU(cpu); + spin_lock_init(&rdp->lock); + rdp->completed = 0; + rdp->waitlistcount = 0; + rdp->nextlist = NULL; + rdp->nexttail = &rdp->nextlist; + for (i = 0; i < GP_STAGES; i++) { + rdp->waitlist[i] = NULL; + rdp->waittail[i] = &rdp->waitlist[i]; + } + rdp->donelist = NULL; + rdp->donetail = &rdp->donelist; + } + rcu_preempt_boost_init(); +} + +/* + * Deprecated, use synchronize_rcu() or synchronize_sched() instead. + */ +void synchronize_kernel(void) +{ + synchronize_rcu(); +} + +int *rcupreempt_flipctr(int cpu) +{ + return &per_cpu(rcu_flipctr, cpu)[0]; +} +EXPORT_SYMBOL_GPL(rcupreempt_flipctr); + +int rcupreempt_flip_flag(int cpu) +{ + return per_cpu(rcu_flip_flag, cpu); +} +EXPORT_SYMBOL_GPL(rcupreempt_flip_flag); + +int rcupreempt_mb_flag(int cpu) +{ + return per_cpu(rcu_mb_flag, cpu); +} +EXPORT_SYMBOL_GPL(rcupreempt_mb_flag); + +char *rcupreempt_try_flip_state_name(void) +{ + return rcu_try_flip_state_names[rcu_try_flip_state]; +} +EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name); Index: linux-2.6.24-rc8-rt1/kernel/rcupreempt_trace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/kernel/rcupreempt_trace.c 2008-01-16 22:29:29.000000000 -0500 @@ -0,0 +1,596 @@ +/* + * Read-Copy Update tracing for realtime implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2006 + * + * Papers: http://www.rdrop.com/users/paulmck/RCU + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct mutex rcupreempt_trace_mutex; +static char *rcupreempt_trace_buf; +#define RCUPREEMPT_TRACE_BUF_SIZE 4096 + +static DEFINE_PER_CPU(struct rcupreempt_trace, trace_data); + +#ifdef CONFIG_PREEMPT_RCU_BOOST +#define RCUPREEMPT_BOOST_TRACE_BUF_SIZE 4096 +static char rcupreempt_boost_trace_buf[RCUPREEMPT_BOOST_TRACE_BUF_SIZE]; +static DEFINE_PER_CPU(struct preempt_rcu_boost_trace, boost_trace_data); + +DEFINE_PREEMPT_RCU_BOOST_MARKER_HANDLER(task_boost_called); +DEFINE_PREEMPT_RCU_BOOST_MARKER_HANDLER(task_boosted); +DEFINE_PREEMPT_RCU_BOOST_MARKER_HANDLER(boost_called); +DEFINE_PREEMPT_RCU_BOOST_MARKER_HANDLER(try_boost); +DEFINE_PREEMPT_RCU_BOOST_MARKER_HANDLER(boosted); +DEFINE_PREEMPT_RCU_BOOST_MARKER_HANDLER(unboost_called); +DEFINE_PREEMPT_RCU_BOOST_MARKER_HANDLER(unboosted); +DEFINE_PREEMPT_RCU_BOOST_MARKER_HANDLER(try_boost_readers); +DEFINE_PREEMPT_RCU_BOOST_MARKER_HANDLER(boost_readers); +DEFINE_PREEMPT_RCU_BOOST_MARKER_HANDLER(try_unboost_readers); +DEFINE_PREEMPT_RCU_BOOST_MARKER_HANDLER(unboost_readers); +DEFINE_PREEMPT_RCU_BOOST_MARKER_HANDLER(over_taken); + +static struct preempt_rcu_boost_probe preempt_rcu_boost_probe_array[] = +{ + INIT_PREEMPT_RCU_BOOST_PROBE(task_boost_called), + INIT_PREEMPT_RCU_BOOST_PROBE(task_boosted), + INIT_PREEMPT_RCU_BOOST_PROBE(boost_called), + INIT_PREEMPT_RCU_BOOST_PROBE(try_boost), + INIT_PREEMPT_RCU_BOOST_PROBE(boosted), + INIT_PREEMPT_RCU_BOOST_PROBE(unboost_called), + INIT_PREEMPT_RCU_BOOST_PROBE(unboosted), + INIT_PREEMPT_RCU_BOOST_PROBE(try_boost_readers), + INIT_PREEMPT_RCU_BOOST_PROBE(boost_readers), + INIT_PREEMPT_RCU_BOOST_PROBE(try_unboost_readers), + INIT_PREEMPT_RCU_BOOST_PROBE(unboost_readers), + INIT_PREEMPT_RCU_BOOST_PROBE(over_taken) +}; + +static ssize_t rcuboost_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + static DEFINE_MUTEX(mutex); + int cnt = 0; + int cpu; + struct preempt_rcu_boost_trace *prbt; + ssize_t bcount; + unsigned long task_boost_called = 0; + unsigned long task_boosted = 0; + unsigned long boost_called = 0; + unsigned long try_boost = 0; + unsigned long boosted = 0; + unsigned long unboost_called = 0; + unsigned long unboosted = 0; + unsigned long try_boost_readers = 0; + unsigned long boost_readers = 0; + unsigned long try_unboost_readers = 0; + unsigned long unboost_readers = 0; + unsigned long over_taken = 0; + + mutex_lock(&mutex); + + for_each_online_cpu(cpu) { + prbt = &per_cpu(boost_trace_data, cpu); + + task_boost_called += prbt->rbs_stat_task_boost_called; + task_boosted += prbt->rbs_stat_task_boosted; + boost_called += prbt->rbs_stat_boost_called; + try_boost += prbt->rbs_stat_try_boost; + boosted += prbt->rbs_stat_boosted; + unboost_called += prbt->rbs_stat_unboost_called; + unboosted += prbt->rbs_stat_unboosted; + try_boost_readers += prbt->rbs_stat_try_boost_readers; + boost_readers += prbt->rbs_stat_boost_readers; + try_unboost_readers += prbt->rbs_stat_try_boost_readers; + unboost_readers += prbt->rbs_stat_boost_readers; + over_taken += prbt->rbs_stat_over_taken; + } + + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "task_boost_called = %ld\n", + task_boost_called); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "task_boosted = %ld\n", + task_boosted); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "boost_called = %ld\n", + boost_called); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "try_boost = %ld\n", + try_boost); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "boosted = %ld\n", + boosted); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "unboost_called = %ld\n", + unboost_called); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "unboosted = %ld\n", + unboosted); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "try_boost_readers = %ld\n", + try_boost_readers); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "boost_readers = %ld\n", + boost_readers); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "try_unboost_readers = %ld\n", + try_unboost_readers); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "unboost_readers = %ld\n", + unboost_readers); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "over_taken = %ld\n", + over_taken); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "rcu_boost_prio = %d\n", + read_rcu_boost_prio()); + bcount = simple_read_from_buffer(buffer, count, ppos, + rcupreempt_boost_trace_buf, strlen(rcupreempt_boost_trace_buf)); + mutex_unlock(&mutex); + + return bcount; +} + +static struct file_operations rcuboost_fops = { + .read = rcuboost_read, +}; + +static struct dentry *rcuboostdir; +int rcu_trace_boost_create(struct dentry *rcudir) +{ + rcuboostdir = debugfs_create_file("rcuboost", 0444, rcudir, + NULL, &rcuboost_fops); + if (!rcuboostdir) + return 0; + + return 1; +} + +void rcu_trace_boost_destroy(void) +{ + if (rcuboostdir) + debugfs_remove(rcuboostdir); + rcuboostdir = NULL; +} + +#endif /* CONFIG_PREEMPT_RCU_BOOST */ + +struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu) +{ + return &per_cpu(trace_data, cpu); +} + +void rcupreempt_trace_move2done(struct rcupreempt_trace *trace) +{ + trace->done_length += trace->wait_length; + trace->done_add += trace->wait_length; + trace->wait_length = 0; +} +void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace) +{ + trace->wait_length += trace->next_length; + trace->wait_add += trace->next_length; + trace->next_length = 0; +} +void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace) +{ + atomic_inc(&trace->rcu_try_flip_1); +} +void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace) +{ + atomic_inc(&trace->rcu_try_flip_e1); +} +void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_i1++; +} +void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_ie1++; +} +void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_g1++; +} +void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_a1++; +} +void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_ae1++; +} +void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_a2++; +} +void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_z1++; +} +void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_ze1++; +} +void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_z2++; +} +void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_m1++; +} +void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_me1++; +} +void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_m2++; +} +void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace) +{ + trace->rcu_check_callbacks++; +} +void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace) +{ + trace->done_remove += trace->done_length; + trace->done_length = 0; +} +void rcupreempt_trace_invoke(struct rcupreempt_trace *trace) +{ + atomic_inc(&trace->done_invoked); +} +void rcupreempt_trace_next_add(struct rcupreempt_trace *trace) +{ + trace->next_add++; + trace->next_length++; +} + +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_move2done); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_move2wait); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_try_flip_1); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_try_flip_e1); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_try_flip_i1); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_try_flip_ie1); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_try_flip_g1); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_try_flip_a1); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_try_flip_ae1); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_try_flip_a2); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_try_flip_z1); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_try_flip_ze1); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_try_flip_z2); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_try_flip_m1); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_try_flip_me1); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_try_flip_m2); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_check_callbacks); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_done_remove); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_invoke); +DEFINE_RCUPREEMPT_MARKER_HANDLER(rcupreempt_trace_next_add); + +static struct rcupreempt_probe_data rcupreempt_probe_array[] = +{ + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_move2done), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_move2wait), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_try_flip_1), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_try_flip_e1), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_try_flip_i1), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_try_flip_ie1), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_try_flip_g1), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_try_flip_a1), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_try_flip_ae1), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_try_flip_a2), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_try_flip_z1), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_try_flip_ze1), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_try_flip_z2), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_try_flip_m1), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_try_flip_me1), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_try_flip_m2), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_check_callbacks), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_done_remove), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_invoke), + INIT_RCUPREEMPT_PROBE(rcupreempt_trace_next_add) +}; + +static void rcupreempt_trace_sum(struct rcupreempt_trace *sp) +{ + struct rcupreempt_trace *cp; + int cpu; + + memset(sp, 0, sizeof(*sp)); + for_each_possible_cpu(cpu) { + cp = rcupreempt_trace_cpu(cpu); + sp->next_length += cp->next_length; + sp->next_add += cp->next_add; + sp->wait_length += cp->wait_length; + sp->wait_add += cp->wait_add; + sp->done_length += cp->done_length; + sp->done_add += cp->done_add; + sp->done_remove += cp->done_remove; + atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked)); + sp->rcu_check_callbacks += cp->rcu_check_callbacks; + atomic_set(&sp->rcu_try_flip_1, + atomic_read(&cp->rcu_try_flip_1)); + atomic_set(&sp->rcu_try_flip_e1, + atomic_read(&cp->rcu_try_flip_e1)); + sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; + sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; + sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; + sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1; + sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1; + sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2; + sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1; + sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1; + sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2; + sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1; + sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1; + sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2; + } +} + +static ssize_t rcustats_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct rcupreempt_trace trace; + ssize_t bcount; + int cnt = 0; + + rcupreempt_trace_sum(&trace); + mutex_lock(&rcupreempt_trace_mutex); + snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, + "ggp=%ld rcc=%ld\n", + rcu_batches_completed(), + trace.rcu_check_callbacks); + snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, + "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n" + "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n" + "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n", + + trace.next_add, trace.next_length, + trace.wait_add, trace.wait_length, + trace.done_add, trace.done_length, + trace.done_remove, atomic_read(&trace.done_invoked), + atomic_read(&trace.rcu_try_flip_1), + atomic_read(&trace.rcu_try_flip_e1), + trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1, + trace.rcu_try_flip_g1, + trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1, + trace.rcu_try_flip_a2, + trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1, + trace.rcu_try_flip_z2, + trace.rcu_try_flip_m1, trace.rcu_try_flip_me1, + trace.rcu_try_flip_m2); + bcount = simple_read_from_buffer(buffer, count, ppos, + rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); + mutex_unlock(&rcupreempt_trace_mutex); + return bcount; +} + +static ssize_t rcugp_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + long oldgp = rcu_batches_completed(); + ssize_t bcount; + + mutex_lock(&rcupreempt_trace_mutex); + synchronize_rcu(); + snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE, + "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed()); + bcount = simple_read_from_buffer(buffer, count, ppos, + rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); + mutex_unlock(&rcupreempt_trace_mutex); + return bcount; +} + +static ssize_t rcuctrs_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + int cnt = 0; + int cpu; + int f = rcu_batches_completed() & 0x1; + ssize_t bcount; + + mutex_lock(&rcupreempt_trace_mutex); + + cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE, + "CPU last cur F M\n"); + for_each_online_cpu(cpu) { + int *flipctr = rcupreempt_flipctr(cpu); + cnt += snprintf(&rcupreempt_trace_buf[cnt], + RCUPREEMPT_TRACE_BUF_SIZE - cnt, + "%3d %4d %3d %d %d\n", + cpu, + flipctr[!f], + flipctr[f], + rcupreempt_flip_flag(cpu), + rcupreempt_mb_flag(cpu)); + } + cnt += snprintf(&rcupreempt_trace_buf[cnt], + RCUPREEMPT_TRACE_BUF_SIZE - cnt, + "ggp = %ld, state = %s\n", + rcu_batches_completed(), + rcupreempt_try_flip_state_name()); + cnt += snprintf(&rcupreempt_trace_buf[cnt], + RCUPREEMPT_TRACE_BUF_SIZE - cnt, + "\n"); + bcount = simple_read_from_buffer(buffer, count, ppos, + rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); + mutex_unlock(&rcupreempt_trace_mutex); + return bcount; +} + +static struct file_operations rcustats_fops = { + .owner = THIS_MODULE, + .read = rcustats_read, +}; + +static struct file_operations rcugp_fops = { + .owner = THIS_MODULE, + .read = rcugp_read, +}; + +static struct file_operations rcuctrs_fops = { + .owner = THIS_MODULE, + .read = rcuctrs_read, +}; + +static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir; +static int rcupreempt_debugfs_init(void) +{ + rcudir = debugfs_create_dir("rcu", NULL); + if (!rcudir) + goto out; + statdir = debugfs_create_file("rcustats", 0444, rcudir, + NULL, &rcustats_fops); + if (!statdir) + goto free_out; + + gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); + if (!gpdir) + goto free_out; + + ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir, + NULL, &rcuctrs_fops); + if (!ctrsdir) + goto free_out; + +#ifdef CONFIG_PREEMPT_RCU_BOOST + if (!rcu_trace_boost_create(rcudir)) + goto free_out; +#endif /* CONFIG_PREEMPT_RCU_BOOST */ + return 0; +free_out: + if (ctrsdir) + debugfs_remove(ctrsdir); + if (statdir) + debugfs_remove(statdir); + if (gpdir) + debugfs_remove(gpdir); + debugfs_remove(rcudir); +out: + return 1; +} + +static int __init rcupreempt_trace_init(void) +{ + int ret; + int i; + + for (i = 0; i < ARRAY_SIZE(rcupreempt_probe_array); i++) { + struct rcupreempt_probe_data *p = &rcupreempt_probe_array[i]; + ret = marker_probe_register(p->name, p->format, + p->probe_func, p); + if (ret) + printk(KERN_INFO "Unable to register rcupreempt \ + probe %s\n", rcupreempt_probe_array[i].name); + ret = marker_arm(p->name); + if (ret) + printk(KERN_INFO "Unable to arm rcupreempt probe %s\n", + p->name); + } + printk(KERN_INFO "RCU Preempt markers registered\n"); + +#ifdef CONFIG_PREEMPT_RCU_BOOST + for (i = 0; i < ARRAY_SIZE(preempt_rcu_boost_probe_array); i++) { + struct preempt_rcu_boost_probe *p = \ + &preempt_rcu_boost_probe_array[i]; + ret = marker_probe_register(p->name, p->format, + p->probe_func, p); + if (ret) + printk(KERN_INFO "Unable to register Preempt RCU Boost \ + probe %s\n", preempt_rcu_boost_probe_array[i].name); + ret = marker_arm(p->name); + if (ret) + printk(KERN_INFO "Unable to arm Preempt RCU Boost \ + markers %s\n", p->name); +} +#endif /* CONFIG_PREEMPT_RCU_BOOST */ + + mutex_init(&rcupreempt_trace_mutex); + rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); + if (!rcupreempt_trace_buf) + return 1; + ret = rcupreempt_debugfs_init(); + if (ret) + kfree(rcupreempt_trace_buf); + return ret; +} + +static void __exit rcupreempt_trace_cleanup(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(rcupreempt_probe_array); i++) + marker_probe_unregister(rcupreempt_probe_array[i].name); + printk(KERN_INFO "RCU Preempt markers unregistered\n"); + +#ifdef CONFIG_PREEMPT_RCU_BOOST + rcu_trace_boost_destroy(); + for (i = 0; i < ARRAY_SIZE(preempt_rcu_boost_probe_array); i++) + marker_probe_unregister(preempt_rcu_boost_probe_array[i].name); + printk(KERN_INFO "Preempt RCU Boost markers unregistered\n"); +#endif /* CONFIG_PREEMPT_RCU_BOOST */ + debugfs_remove(statdir); + debugfs_remove(gpdir); + debugfs_remove(ctrsdir); + debugfs_remove(rcudir); + kfree(rcupreempt_trace_buf); +} + +MODULE_LICENSE("GPL"); + +module_init(rcupreempt_trace_init); +module_exit(rcupreempt_trace_cleanup); Index: linux-2.6.24-rc8-rt1/kernel/rcutorture.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/rcutorture.c 2008-01-16 22:27:41.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/rcutorture.c 2008-01-16 22:29:20.000000000 -0500 @@ -52,11 +52,13 @@ MODULE_AUTHOR("Paul E. McKenney rtort_rcu, rcu_torture_cb); } +static unsigned long rcu_torture_preempt_errors; + +static int rcu_torture_preempt(void *arg) +{ + int completedstart; + int err; + time_t gcstart; + struct sched_param sp; + + sp.sched_priority = 1; + err = sched_setscheduler(current, SCHED_RR, &sp); + if (err != 0) + printk(KERN_ALERT "rcu_torture_preempt() priority err: %d\n", + err); + current->flags |= PF_NOFREEZE; + + do { + completedstart = rcu_torture_completed(); + gcstart = xtime.tv_sec; + while ((xtime.tv_sec - gcstart < 10) && + (rcu_torture_completed() == completedstart)) + cond_resched(); + if (rcu_torture_completed() == completedstart) + rcu_torture_preempt_errors++; + schedule_timeout_interruptible(1); + } while (!kthread_should_stop()); + return 0; +} + +static long rcu_preempt_start(void) +{ + long retval = 0; + int i; + + rcu_preempt_tasks = kzalloc(nrealpreempthogs * sizeof(rcu_preempt_tasks[0]), + GFP_KERNEL); + if (rcu_preempt_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + retval = -ENOMEM; + goto out; + } + + for (i=0; i < nrealpreempthogs; i++) { + rcu_preempt_tasks[i] = kthread_run(rcu_torture_preempt, NULL, + "rcu_torture_preempt"); + if (IS_ERR(rcu_preempt_tasks[i])) { + VERBOSE_PRINTK_ERRSTRING("Failed to create preempter"); + retval = PTR_ERR(rcu_preempt_tasks[i]); + rcu_preempt_tasks[i] = NULL; + break; + } + } + out: + return retval; +} + +static void rcu_preempt_end(void) +{ + int i; + if (rcu_preempt_tasks) { + for (i=0; i < nrealpreempthogs; i++) { + if (rcu_preempt_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_preempt task"); + kthread_stop(rcu_preempt_tasks[i]); + } + rcu_preempt_tasks[i] = NULL; + } + kfree(rcu_preempt_tasks); + } +} + +static int rcu_preempt_stats(char *page) +{ + return sprintf(page, + "Preemption stalls: %lu\n", rcu_torture_preempt_errors); +} + static struct rcu_torture_ops rcu_ops = { - .init = NULL, - .cleanup = NULL, .readlock = rcu_torture_read_lock, .readdelay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .completed = rcu_torture_completed, .deferredfree = rcu_torture_deferred_free, .sync = synchronize_rcu, - .stats = NULL, + .preemptstart = rcu_preempt_start, + .preemptend = rcu_preempt_end, + .stats = rcu_preempt_stats, .name = "rcu" }; @@ -296,14 +381,12 @@ static void rcu_sync_torture_init(void) static struct rcu_torture_ops rcu_sync_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = rcu_torture_read_lock, .readdelay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .completed = rcu_torture_completed, .deferredfree = rcu_sync_torture_deferred_free, .sync = synchronize_rcu, - .stats = NULL, .name = "rcu_sync" }; @@ -355,28 +438,23 @@ static void rcu_bh_torture_synchronize(v } static struct rcu_torture_ops rcu_bh_ops = { - .init = NULL, - .cleanup = NULL, .readlock = rcu_bh_torture_read_lock, .readdelay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferredfree = rcu_bh_torture_deferred_free, .sync = rcu_bh_torture_synchronize, - .stats = NULL, .name = "rcu_bh" }; static struct rcu_torture_ops rcu_bh_sync_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = rcu_bh_torture_read_lock, .readdelay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferredfree = rcu_sync_torture_deferred_free, .sync = rcu_bh_torture_synchronize, - .stats = NULL, .name = "rcu_bh_sync" }; @@ -488,14 +566,12 @@ static void sched_torture_synchronize(vo static struct rcu_torture_ops sched_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = sched_torture_read_lock, .readdelay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, .completed = sched_torture_completed, .deferredfree = rcu_sync_torture_deferred_free, .sync = sched_torture_synchronize, - .stats = NULL, .name = "sched" }; @@ -550,10 +626,20 @@ rcu_torture_writer(void *arg) static int rcu_torture_fakewriter(void *arg) { + struct sched_param sp; + long id = (long) arg; + int err; DEFINE_RCU_RANDOM(rand); VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); - set_user_nice(current, 19); + /* + * Set up at a higher prio than the readers. + */ + sp.sched_priority = 1 + id; + err = sched_setscheduler(current, SCHED_RR, &sp); + if (err != 0) + printk(KERN_ALERT "rcu_torture_writer() priority err: %d\n", + err); do { schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); @@ -592,7 +678,7 @@ rcu_torture_reader(void *arg) if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); - schedule_timeout_interruptible(HZ); + schedule_timeout_interruptible(round_jiffies_relative(HZ)); continue; } if (p->rtort_mbtest == 0) @@ -786,10 +872,13 @@ rcu_torture_print_module_parms(char *tag { printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " + "npreempthogs=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval = %d\n", + "shuffle_interval=%d preempt_torture=%d\n", torture_type, tag, nrealreaders, nfakewriters, - stat_interval, verbose, test_no_idle_hz, shuffle_interval); + nrealpreempthogs, + stat_interval, verbose, test_no_idle_hz, shuffle_interval, + preempt_torture); } static void @@ -842,6 +931,8 @@ rcu_torture_cleanup(void) kthread_stop(stats_task); } stats_task = NULL; + if (preempt_torture && (cur_ops->preemptend != NULL)) + cur_ops->preemptend(); /* Wait for all RCU callbacks to fire. */ rcu_barrier(); @@ -859,7 +950,7 @@ rcu_torture_cleanup(void) static int __init rcu_torture_init(void) { - int i; + long i; int cpu; int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = @@ -887,6 +978,12 @@ rcu_torture_init(void) rcu_torture_print_module_parms("Start of test"); fullstop = 0; + if (npreempthogs >= 0) + nrealpreempthogs = npreempthogs; + else + nrealpreempthogs = num_online_cpus() == 1 ? 1 : + num_online_cpus() - 1; + /* Set up the freelist. */ INIT_LIST_HEAD(&rcu_torture_freelist); @@ -934,7 +1031,7 @@ rcu_torture_init(void) } for (i = 0; i < nfakewriters; i++) { VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); - fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, + fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, (void*)i, "rcu_torture_fakewriter"); if (IS_ERR(fakewriter_tasks[i])) { firsterr = PTR_ERR(fakewriter_tasks[i]); @@ -984,6 +1081,11 @@ rcu_torture_init(void) goto unwind; } } + if (preempt_torture && (cur_ops->preemptstart != NULL)) { + firsterr = cur_ops->preemptstart(); + if (firsterr != 0) + goto unwind; + } return 0; unwind: Index: linux-2.6.24-rc8-rt1/Documentation/RCU/rcu.txt =================================================================== --- linux-2.6.24-rc8-rt1.orig/Documentation/RCU/rcu.txt 2008-01-16 22:27:41.000000000 -0500 +++ linux-2.6.24-rc8-rt1/Documentation/RCU/rcu.txt 2008-01-16 22:28:22.000000000 -0500 @@ -36,6 +36,14 @@ o How can the updater tell when a grace executed in user mode, or executed in the idle loop, we can safely free up that item. + Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the + same effect, but require that the readers manipulate CPU-local + counters. These counters allow limited types of blocking + within RCU read-side critical sections. SRCU also uses + CPU-local counters, and permits general blocking within + RCU read-side critical sections. These two variants of + RCU detect grace periods by sampling these counters. + o If I am running on a uniprocessor kernel, which can only do one thing at a time, why should I wait for a grace period? @@ -46,7 +54,10 @@ o How can I see where RCU is currently u Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu", "rcu_read_lock_bh", "rcu_read_unlock_bh", "call_rcu_bh", "srcu_read_lock", "srcu_read_unlock", "synchronize_rcu", - "synchronize_net", and "synchronize_srcu". + "synchronize_net", "synchronize_srcu", and the other RCU + primitives. Or grab one of the cscope databases from: + + http://www.rdrop.com/users/paulmck/RCU/linuxusage/rculocktab.html o What guidelines should I follow when writing code that uses RCU? @@ -67,7 +78,12 @@ o I hear that RCU is patented? What is o I hear that RCU needs work in order to support realtime kernels? - Yes, work in progress. + This work is largely completed. Realtime-friendly RCU can be + enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter. + In addition, the CONFIG_PREEMPT_RCU_BOOST kernel configuration + parameter enables priority boosting of preempted RCU read-side + critical sections, though this is only needed if you have + CPU-bound realtime threads. o Where can I find more information on RCU? Index: linux-2.6.24-rc8-rt1/Documentation/RCU/RTFP.txt =================================================================== --- linux-2.6.24-rc8-rt1.orig/Documentation/RCU/RTFP.txt 2008-01-16 22:27:41.000000000 -0500 +++ linux-2.6.24-rc8-rt1/Documentation/RCU/RTFP.txt 2008-01-16 22:28:22.000000000 -0500 @@ -9,8 +9,8 @@ The first thing resembling RCU was publi [Kung80] recommended use of a garbage collector to defer destruction of nodes in a parallel binary search tree in order to simplify its implementation. This works well in environments that have garbage -collectors, but current production garbage collectors incur significant -read-side overhead. +collectors, but most production garbage collectors incur significant +overhead. In 1982, Manber and Ladner [Manber82,Manber84] recommended deferring destruction until all threads running at that time have terminated, again @@ -99,16 +99,25 @@ locking, reduces contention, reduces mem parallelizes pipeline stalls and memory latency for writers. However, these techniques still impose significant read-side overhead in the form of memory barriers. Researchers at Sun worked along similar lines -in the same timeframe [HerlihyLM02,HerlihyLMS03]. These techniques -can be thought of as inside-out reference counts, where the count is -represented by the number of hazard pointers referencing a given data -structure (rather than the more conventional counter field within the -data structure itself). +in the same timeframe [HerlihyLM02]. These techniques can be thought +of as inside-out reference counts, where the count is represented by the +number of hazard pointers referencing a given data structure (rather than +the more conventional counter field within the data structure itself). + +By the same token, RCU can be thought of as a "bulk reference count", +where some form of reference counter covers all reference by a given CPU +or thread during a set timeframe. This timeframe is related to, but +not necessarily exactly the same as, an RCU grace period. In classic +RCU, the reference counter is the per-CPU bit in the "bitmask" field, +and each such bit covers all references that might have been made by +the corresponding CPU during the prior grace period. Of course, RCU +can be thought of in other terms as well. In 2003, the K42 group described how RCU could be used to create -hot-pluggable implementations of operating-system functions. Later that -year saw a paper describing an RCU implementation of System V IPC -[Arcangeli03], and an introduction to RCU in Linux Journal [McKenney03a]. +hot-pluggable implementations of operating-system functions [Appavoo03a]. +Later that year saw a paper describing an RCU implementation of System +V IPC [Arcangeli03], and an introduction to RCU in Linux Journal +[McKenney03a]. 2004 has seen a Linux-Journal article on use of RCU in dcache [McKenney04a], a performance comparison of locking to RCU on several @@ -117,10 +126,27 @@ number of operating-system kernels [Paul describing how to make RCU safe for soft-realtime applications [Sarma04c], and a paper describing SELinux performance with RCU [JamesMorris04b]. -2005 has seen further adaptation of RCU to realtime use, permitting +2005 brought further adaptation of RCU to realtime use, permitting preemption of RCU realtime critical sections [PaulMcKenney05a, PaulMcKenney05b]. +2006 saw the first best-paper award for an RCU paper [ThomasEHart2006a], +as well as further work on efficient implementations of preemptible +RCU [PaulEMcKenney2006b], but priority-boosting of RCU read-side critical +sections proved elusive. An RCU implementation permitting general +blocking in read-side critical sections appeared [PaulEMcKenney2006c], +Robert Olsson described an RCU-protected trie-hash combination +[RobertOlsson2006a]. + +In 2007, the RCU priority-boosting problem finally was solved +[PaulEMcKenney2007BoostRCU], and an RCU paper was first accepted into +an academic journal [ThomasEHart2007a]. An LWN article on the use of +Promela and spin to validate parallel algorithms [PaulEMcKenney2007QRCUspin] +also described Oleg Nesterov's QRCU, the first RCU implementation that +can boast deep sub-microsecond grace periods (in absence of readers, +and read-side overhead is roughly that of a global reference count). + + Bibtex Entries @article{Kung80 @@ -203,6 +229,41 @@ Bibtex Entries ,Address="New Orleans, LA" } +@conference{Pu95a, +Author = "Calton Pu and Tito Autrey and Andrew Black and Charles Consel and +Crispin Cowan and Jon Inouye and Lakshmi Kethana and Jonathan Walpole and +Ke Zhang", +Title = "Optimistic Incremental Specialization: Streamlining a Commercial +Operating System", +Booktitle = "15\textsuperscript{th} ACM Symposium on +Operating Systems Principles (SOSP'95)", +address = "Copper Mountain, CO", +month="December", +year="1995", +pages="314-321", +annotation=" + Uses a replugger, but with a flag to signal when people are + using the resource at hand. Only one reader at a time. +" +} + +@conference{Cowan96a, +Author = "Crispin Cowan and Tito Autrey and Charles Krasic and +Calton Pu and Jonathan Walpole", +Title = "Fast Concurrent Dynamic Linking for an Adaptive Operating System", +Booktitle = "International Conference on Configurable Distributed Systems +(ICCDS'96)", +address = "Annapolis, MD", +month="May", +year="1996", +pages="108", +isbn="0-8186-7395-8", +annotation=" + Uses a replugger, but with a counter to signal when people are + using the resource at hand. Allows multiple readers. +" +} + @techreport{Slingwine95 ,author="John D. Slingwine and Paul E. McKenney" ,title="Apparatus and Method for Achieving Reduced Overhead Mutual @@ -312,6 +373,49 @@ Andrea Arcangeli and Andi Kleen and Orra [Viewed June 23, 2004]" } +@conference{Michael02a +,author="Maged M. Michael" +,title="Safe Memory Reclamation for Dynamic Lock-Free Objects Using Atomic +Reads and Writes" +,Year="2002" +,Month="August" +,booktitle="{Proceedings of the 21\textsuperscript{st} Annual ACM +Symposium on Principles of Distributed Computing}" +,pages="21-30" +,annotation=" + Each thread keeps an array of pointers to items that it is + currently referencing. Sort of an inside-out garbage collection + mechanism, but one that requires the accessing code to explicitly + state its needs. Also requires read-side memory barriers on + most architectures. +" +} + +@conference{Michael02b +,author="Maged M. Michael" +,title="High Performance Dynamic Lock-Free Hash Tables and List-Based Sets" +,Year="2002" +,Month="August" +,booktitle="{Proceedings of the 14\textsuperscript{th} Annual ACM +Symposium on Parallel +Algorithms and Architecture}" +,pages="73-82" +,annotation=" + Like the title says... +" +} + +@InProceedings{HerlihyLM02 +,author={Maurice Herlihy and Victor Luchangco and Mark Moir} +,title="The Repeat Offender Problem: A Mechanism for Supporting Dynamic-Sized, +Lock-Free Data Structures" +,booktitle={Proceedings of 16\textsuperscript{th} International +Symposium on Distributed Computing} +,year=2002 +,month="October" +,pages="339-353" +} + @article{Appavoo03a ,author="J. Appavoo and K. Hui and C. A. N. Soules and R. W. Wisniewski and D. M. {Da Silva} and O. Krieger and M. A. Auslander and D. J. Edelsohn and @@ -447,3 +551,111 @@ Oregon Health and Sciences University" Realtime turns into making RCU yet more realtime friendly. " } + +@conference{ThomasEHart2006a +,Author="Thomas E. Hart and Paul E. McKenney and Angela Demke Brown" +,Title="Making Lockless Synchronization Fast: Performance Implications +of Memory Reclamation" +,Booktitle="20\textsuperscript{th} {IEEE} International Parallel and +Distributed Processing Symposium" +,month="April" +,year="2006" +,day="25-29" +,address="Rhodes, Greece" +,annotation=" + Compares QSBR (AKA "classic RCU"), HPBR, EBR, and lock-free + reference counting. +" +} + +@Conference{PaulEMcKenney2006b +,Author="Paul E. McKenney and Dipankar Sarma and Ingo Molnar and +Suparna Bhattacharya" +,Title="Extending RCU for Realtime and Embedded Workloads" +,Booktitle="{Ottawa Linux Symposium}" +,Month="July" +,Year="2006" +,pages="v2 123-138" +,note="Available: +\url{http://www.linuxsymposium.org/2006/view_abstract.php?content_key=184} +\url{http://www.rdrop.com/users/paulmck/RCU/OLSrtRCU.2006.08.11a.pdf} +[Viewed January 1, 2007]" +,annotation=" + Described how to improve the -rt implementation of realtime RCU. +" +} + +@unpublished{PaulEMcKenney2006c +,Author="Paul E. McKenney" +,Title="Sleepable {RCU}" +,month="October" +,day="9" +,year="2006" +,note="Available: +\url{http://lwn.net/Articles/202847/} +Revised: +\url{http://www.rdrop.com/users/paulmck/RCU/srcu.2007.01.14a.pdf} +[Viewed August 21, 2006]" +,annotation=" + LWN article introducing SRCU. +" +} + +@unpublished{RobertOlsson2006a +,Author="Robert Olsson and Stefan Nilsson" +,Title="{TRASH}: A dynamic {LC}-trie and hash data structure" +,month="August" +,day="18" +,year="2006" +,note="Available: +\url{http://www.nada.kth.se/~snilsson/public/papers/trash/trash.pdf} +[Viewed February 24, 2007]" +,annotation=" + RCU-protected dynamic trie-hash combination. +" +} + +@unpublished{PaulEMcKenney2007BoostRCU +,Author="Paul E. McKenney" +,Title="Priority-Boosting {RCU} Read-Side Critical Sections" +,month="February" +,day="5" +,year="2007" +,note="Available: +\url{http://lwn.net/Articles/220677/} +Revised: +\url{http://www.rdrop.com/users/paulmck/RCU/RCUbooststate.2007.04.16a.pdf} +[Viewed September 7, 2007]" +,annotation=" + LWN article introducing RCU priority boosting. +" +} + +@unpublished{ThomasEHart2007a +,Author="Thomas E. Hart and Paul E. McKenney and Angela Demke Brown and Jonathan Walpole" +,Title="Performance of memory reclamation for lockless synchronization" +,journal="J. Parallel Distrib. Comput." +,year="2007" +,note="To appear in J. Parallel Distrib. Comput. + \url{doi=10.1016/j.jpdc.2007.04.010}" +,annotation={ + Compares QSBR (AKA "classic RCU"), HPBR, EBR, and lock-free + reference counting. Journal version of ThomasEHart2006a. +} +} + +@unpublished{PaulEMcKenney2007QRCUspin +,Author="Paul E. McKenney" +,Title="Using Promela and Spin to verify parallel algorithms" +,month="August" +,day="1" +,year="2007" +,note="Available: +\url{http://lwn.net/Articles/243851/} +[Viewed September 8, 2007]" +,annotation=" + LWN article describing Promela and spin, and also using Oleg + Nesterov's QRCU as an example (with Paul McKenney's fastpath). +" +} + Index: linux-2.6.24-rc8-rt1/Documentation/RCU/torture.txt =================================================================== --- linux-2.6.24-rc8-rt1.orig/Documentation/RCU/torture.txt 2008-01-16 22:27:41.000000000 -0500 +++ linux-2.6.24-rc8-rt1/Documentation/RCU/torture.txt 2008-01-16 22:28:22.000000000 -0500 @@ -37,6 +37,24 @@ nfakewriters This is the number of RCU f to trigger special cases caused by multiple writers, such as the synchronize_srcu() early return optimization. +preempt_torture Specifies that torturing of preemptible RCU is to be + undertaken, defaults to no such testing. This test + creates a kernel thread that runs at the lowest possible + realtime priority, alternating between ten seconds + of spinning and a short sleep period. The goal is + to preempt lower-priority RCU readers. Note that this + currently does not fail the full test, but instead simply + counts the number of times that a ten-second CPU burst + coincides with a stall in grace-period detection. + + Of course, if the grace period advances during a CPU burst, + that indicates that no RCU reader was preempted, so the + burst ends early in that case. + + Note that such stalls are expected behavior in preemptible + RCU implementations when RCU priority boosting is not + enabled (PREEMPT_RCU_BOOST=n). + stat_interval The number of seconds between output of torture statistics (via printk()). Regardless of the interval, statistics are printed when the module is unloaded. @@ -46,12 +64,13 @@ stat_interval The number of seconds betw shuffle_interval The number of seconds to keep the test threads affinitied - to a particular subset of the CPUs. Used in conjunction - with test_no_idle_hz. + to a particular subset of the CPUs, defaults to 5 seconds. + Used in conjunction with test_no_idle_hz. test_no_idle_hz Whether or not to test the ability of RCU to operate in a kernel that disables the scheduling-clock interrupt to idle CPUs. Boolean parameter, "1" to test, "0" otherwise. + Defaults to omitting this test. torture_type The type of RCU to test: "rcu" for the rcu_read_lock() API, "rcu_sync" for rcu_read_lock() with synchronous reclamation, @@ -82,8 +101,6 @@ be evident. ;-) The entries are as follows: -o "ggp": The number of counter flips (or batches) since boot. - o "rtc": The hexadecimal address of the structure currently visible to readers. @@ -117,8 +134,8 @@ o "Reader Pipe": Histogram of "ages" of o "Reader Batch": Another histogram of "ages" of structures seen by readers, but in terms of counter flips (or batches) rather than in terms of grace periods. The legal number of non-zero - entries is again two. The reason for this separate view is - that it is easier to get the third entry to show up in the + entries is again two. The reason for this separate view is that + it is sometimes easier to get the third entry to show up in the "Reader Batch" list than in the "Reader Pipe" list. o "Free-Block Circulation": Shows the number of torture structures @@ -145,6 +162,21 @@ of the "old" and "current" counters for "idx" value maps the "old" and "current" values to the underlying array, and is useful for debugging. +In addition, preemptible RCU rcutorture runs will report preemption +stalls: + +rcu-torture: rtc: ffffffff88005a40 ver: 17041 tfle: 1 rta: 17041 rtaf: 7904 rtf: 16941 rtmbe: 0 +rcu-torture: Reader Pipe: 975332139 34406 0 0 0 0 0 0 0 0 0 +rcu-torture: Reader Batch: 975349310 17234 0 0 0 0 0 0 0 0 0 +rcu-torture: Free-Block Circulation: 17040 17030 17028 17022 17009 16994 16982 16969 16955 16941 0 +Preemption stalls: 0 + +The first four lines are as before, and the last line records the number +of times that grace-period processing stalled during a realtime CPU burst. +Note that a non-zero value does not -prove- that RCU priority boosting is +broken, because there are other things that can stall RCU grace-period +processing. Here is hoping that someone comes up with a better test! + USAGE Index: linux-2.6.24-rc8-rt1/kernel/softirq.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/softirq.c 2008-01-16 22:27:41.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/softirq.c 2008-01-16 22:29:30.000000000 -0500 @@ -4,15 +4,23 @@ * Copyright (C) 1992 Linus Torvalds * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * + * Softirq-split implemetation by + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar */ #include +#include +#include +#include #include #include #include +#include #include #include #include +#include #include #include #include @@ -46,7 +54,41 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +struct softirqdata { + int nr; + unsigned long cpu; + struct task_struct *tsk; +#ifdef CONFIG_PREEMPT_SOFTIRQS + wait_queue_head_t wait; + int running; +#endif +}; + +static DEFINE_PER_CPU(struct softirqdata [MAX_SOFTIRQ], ksoftirqd); + +#ifdef CONFIG_PREEMPT_SOFTIRQS +/* + * Preempting the softirq causes cases that would not be a + * problem when the softirq is not preempted. That is a + * process may have code to spin while waiting for a softirq + * to finish on another CPU. But if it happens that the + * process has preempted the softirq, this could cause a + * deadlock. + */ +void wait_for_softirq(int softirq) +{ + struct softirqdata *data = &__get_cpu_var(ksoftirqd)[softirq]; + if (data->running) { + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&data->wait, &wait); + if (data->running) + schedule(); + remove_wait_queue(&data->wait, &wait); + __set_current_state(TASK_RUNNING); + } +} +#endif /* * we cannot loop indefinitely here to avoid userspace starvation, @@ -54,15 +96,37 @@ static DEFINE_PER_CPU(struct task_struct * to the pending events, so lets the scheduler to balance * the softirq load for us. */ -static inline void wakeup_softirqd(void) +static void wakeup_softirqd(int softirq) { /* Interrupts are disabled: no need to stop preemption */ - struct task_struct *tsk = __get_cpu_var(ksoftirqd); + struct task_struct *tsk = __get_cpu_var(ksoftirqd)[softirq].tsk; - if (tsk && tsk->state != TASK_RUNNING) - wake_up_process(tsk); + if (unlikely(!tsk)) + return; + /* + * Wake up the softirq task: + */ + wake_up_process(tsk); +} + +/* + * Wake up the softirq threads which have work + */ +static void trigger_softirqs(void) +{ + u32 pending = local_softirq_pending(); + int curr = 0; + + while (pending) { + if (pending & 1) + wakeup_softirqd(curr); + pending >>= 1; + curr++; + } } +#ifndef CONFIG_PREEMPT_HARDIRQS + /* * This one is for softirq.c-internal use, * where hardirqs are disabled legitimately: @@ -98,20 +162,6 @@ void local_bh_disable(void) EXPORT_SYMBOL(local_bh_disable); -void __local_bh_enable(void) -{ - WARN_ON_ONCE(in_irq()); - - /* - * softirqs should never be enabled by __local_bh_enable(), - * it always nests inside local_bh_enable() sections: - */ - WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET); - - sub_preempt_count(SOFTIRQ_OFFSET); -} -EXPORT_SYMBOL_GPL(__local_bh_enable); - /* * Special-case - softirqs can safely be enabled in * cond_resched_softirq(), or by __do_softirq(), @@ -136,7 +186,6 @@ void local_bh_enable(void) WARN_ON_ONCE(in_irq()); #endif - WARN_ON_ONCE(irqs_disabled()); #ifdef CONFIG_TRACE_IRQFLAGS local_irq_save(flags); @@ -194,6 +243,8 @@ void local_bh_enable_ip(unsigned long ip } EXPORT_SYMBOL(local_bh_enable_ip); +#endif + /* * We restart softirq processing MAX_SOFTIRQ_RESTART times, * and we fall back to softirqd after that. @@ -203,52 +254,175 @@ EXPORT_SYMBOL(local_bh_enable_ip); * we want to handle softirqs as soon as possible, but they * should not be able to lock up the box. */ -#define MAX_SOFTIRQ_RESTART 10 +#define MAX_SOFTIRQ_RESTART 20 -asmlinkage void __do_softirq(void) +static DEFINE_PER_CPU(u32, softirq_running); + +static void ___do_softirq(const int same_prio_only) { + int max_restart = MAX_SOFTIRQ_RESTART, max_loops = MAX_SOFTIRQ_RESTART; + __u32 pending, available_mask, same_prio_skipped; struct softirq_action *h; - __u32 pending; - int max_restart = MAX_SOFTIRQ_RESTART; - int cpu; + struct task_struct *tsk; + int cpu, softirq; pending = local_softirq_pending(); account_system_vtime(current); - __local_bh_disable((unsigned long)__builtin_return_address(0)); - trace_softirq_enter(); - cpu = smp_processor_id(); restart: + available_mask = -1; + softirq = 0; + same_prio_skipped = 0; /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); - local_irq_enable(); - h = softirq_vec; do { + u32 softirq_mask = 1 << softirq; + if (pending & 1) { + u32 preempt_count = preempt_count(); + +#if defined(CONFIG_PREEMPT_SOFTIRQS) && defined(CONFIG_PREEMPT_HARDIRQS) + /* + * If executed by a same-prio hardirq thread + * then skip pending softirqs that belong + * to softirq threads with different priority: + */ + if (same_prio_only) { + tsk = __get_cpu_var(ksoftirqd)[softirq].tsk; + if (tsk && tsk->normal_prio != + current->normal_prio) { + same_prio_skipped |= softirq_mask; + available_mask &= ~softirq_mask; + goto next; + } + } +#endif + /* + * Is this softirq already being processed? + */ + if (per_cpu(softirq_running, cpu) & softirq_mask) { + available_mask &= ~softirq_mask; + goto next; + } + per_cpu(softirq_running, cpu) |= softirq_mask; + local_irq_enable(); + h->action(h); + if (preempt_count != preempt_count()) { + print_symbol("BUG: softirq exited %s with wrong preemption count!\n", (unsigned long) h->action); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + preempt_count() = preempt_count; + } rcu_bh_qsctr_inc(cpu); + cond_resched_softirq_context(); + local_irq_disable(); + per_cpu(softirq_running, cpu) &= ~softirq_mask; } +next: h++; + softirq++; pending >>= 1; } while (pending); - local_irq_disable(); - + or_softirq_pending(same_prio_skipped); pending = local_softirq_pending(); - if (pending && --max_restart) - goto restart; + if (pending & available_mask) { + if (--max_restart) + goto restart; + /* + * With softirq threading there's no reason not to + * finish the workload we have: + */ +#ifdef CONFIG_PREEMPT_SOFTIRQS + if (--max_loops) { + if (printk_ratelimit()) + printk("INFO: softirq overload: %08x\n", pending); + max_restart = MAX_SOFTIRQ_RESTART; + goto restart; + } + if (printk_ratelimit()) + printk("BUG: softirq loop! %08x\n", pending); +#endif + } if (pending) - wakeup_softirqd(); + trigger_softirqs(); +} + +asmlinkage void __do_softirq(void) +{ + unsigned long p_flags; + +#ifdef CONFIG_PREEMPT_SOFTIRQS + /* + * 'preempt harder'. Push all softirq processing off to ksoftirqd. + */ + if (softirq_preemption) { + if (local_softirq_pending()) + trigger_softirqs(); + return; + } +#endif + /* + * 'immediate' softirq execution: + */ + __local_bh_disable((unsigned long)__builtin_return_address(0)); + trace_softirq_enter(); + p_flags = current->flags & PF_HARDIRQ; + current->flags &= ~PF_HARDIRQ; + + ___do_softirq(0); trace_softirq_exit(); account_system_vtime(current); _local_bh_enable(); + + current->flags |= p_flags; +} + +/* + * Process softirqs straight from hardirq context, + * without having to switch to a softirq thread. + * This can reduce the context-switch rate. + * + * NOTE: this is unused right now. + */ +void do_softirq_from_hardirq(void) +{ + unsigned long p_flags; + + /* + * 'immediate' softirq execution, from hardirq context: + */ + local_irq_disable(); + if (!local_softirq_pending()) + goto out; + __local_bh_disable((unsigned long)__builtin_return_address(0)); +#ifndef CONFIG_PREEMPT_SOFTIRQS + trace_softirq_enter(); +#endif + p_flags = current->flags & PF_HARDIRQ; + current->flags &= ~PF_HARDIRQ; + current->flags |= PF_SOFTIRQ; + + ___do_softirq(1); + +#ifndef CONFIG_PREEMPT_SOFTIRQS + trace_softirq_exit(); +#endif + account_system_vtime(current); + + current->flags |= p_flags; + current->flags &= ~PF_SOFTIRQ; + + _local_bh_enable(); +out: + local_irq_enable(); } #ifndef __ARCH_HAS_DO_SOFTIRQ @@ -306,8 +480,9 @@ void irq_exit(void) /* Make sure that timer wheel updates are propagated */ if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) tick_nohz_stop_sched_tick(); + rcu_irq_exit(); #endif - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } /* @@ -315,19 +490,11 @@ void irq_exit(void) */ inline fastcall void raise_softirq_irqoff(unsigned int nr) { - __raise_softirq_irqoff(nr); + __do_raise_softirq_irqoff(nr); - /* - * If we're in an interrupt or softirq, we're done - * (this also catches softirq-disabled code). We will - * actually run the softirq once we return from - * the irq or softirq. - * - * Otherwise we wake up ksoftirqd to make sure we - * schedule the softirq soon. - */ - if (!in_interrupt()) - wakeup_softirqd(); +#ifdef CONFIG_PREEMPT_SOFTIRQS + wakeup_softirqd(nr); +#endif } void fastcall raise_softirq(unsigned int nr) @@ -356,14 +523,45 @@ struct tasklet_head static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL }; static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL }; +static void inline +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr) +{ + if (tasklet_trylock(t)) { +again: + /* We may have been preempted before tasklet_trylock + * and __tasklet_action may have already run. + * So double check the sched bit while the takslet + * is locked before adding it to the list. + */ + if (test_bit(TASKLET_STATE_SCHED, &t->state)) { + WARN_ON(t->next != NULL); + t->next = head->list; + head->list = t; + raise_softirq_irqoff(nr); + tasklet_unlock(t); + } else { + /* This is subtle. If we hit the corner case above + * It is possible that we get preempted right here, + * and another task has successfully called + * tasklet_schedule(), then this function, and + * failed on the trylock. Thus we must be sure + * before releasing the tasklet lock, that the + * SCHED_BIT is clear. Otherwise the tasklet + * may get its SCHED_BIT set, but not added to the + * list + */ + if (!tasklet_tryunlock(t)) + goto again; + } + } +} + void fastcall __tasklet_schedule(struct tasklet_struct *t) { unsigned long flags; local_irq_save(flags); - t->next = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = t; - raise_softirq_irqoff(TASKLET_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_vec), TASKLET_SOFTIRQ); local_irq_restore(flags); } @@ -374,81 +572,130 @@ void fastcall __tasklet_hi_schedule(stru unsigned long flags; local_irq_save(flags); - t->next = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = t; - raise_softirq_irqoff(HI_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_hi_vec), HI_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(__tasklet_hi_schedule); -static void tasklet_action(struct softirq_action *a) +void fastcall tasklet_enable(struct tasklet_struct *t) { - struct tasklet_struct *list; + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_schedule(t); +} - local_irq_disable(); - list = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = NULL; - local_irq_enable(); +EXPORT_SYMBOL(tasklet_enable); + +void fastcall tasklet_hi_enable(struct tasklet_struct *t) +{ + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_hi_schedule(t); +} + +EXPORT_SYMBOL(tasklet_hi_enable); + +static void +__tasklet_action(struct softirq_action *a, struct tasklet_struct *list) +{ + int loops = 1000000; while (list) { struct tasklet_struct *t = list; list = list->next; + /* + * Should always succeed - after a tasklist got on the + * list (after getting the SCHED bit set from 0 to 1), + * nothing but the tasklet softirq it got queued to can + * lock it: + */ + if (!tasklet_trylock(t)) { + WARN_ON(1); + continue; + } + + t->next = NULL; - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); + /* + * If we cannot handle the tasklet because it's disabled, + * mark it as pending. tasklet_enable() will later + * re-schedule the tasklet. + */ + if (unlikely(atomic_read(&t->count))) { +out_disabled: + /* implicit unlock: */ + wmb(); + t->state = TASKLET_STATEF_PENDING; + continue; + } + + /* + * After this point on the tasklet might be rescheduled + * on another CPU, but it can only be added to another + * CPU's tasklet list if we unlock the tasklet (which we + * dont do yet). + */ + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + WARN_ON(1); + +again: + t->func(t->data); + + /* + * Try to unlock the tasklet. We must use cmpxchg, because + * another CPU might have scheduled or disabled the tasklet. + * We only allow the STATE_RUN -> 0 transition here. + */ + while (!tasklet_tryunlock(t)) { + /* + * If it got disabled meanwhile, bail out: + */ + if (atomic_read(&t->count)) + goto out_disabled; + /* + * If it got scheduled meanwhile, re-execute + * the tasklet function: + */ + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + goto again; + if (!--loops) { + printk("hm, tasklet state: %08lx\n", t->state); + WARN_ON(1); tasklet_unlock(t); - continue; + break; } - tasklet_unlock(t); } - - local_irq_disable(); - t->next = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = t; - __raise_softirq_irqoff(TASKLET_SOFTIRQ); - local_irq_enable(); } } -static void tasklet_hi_action(struct softirq_action *a) +static void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; local_irq_disable(); - list = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = NULL; + list = __get_cpu_var(tasklet_vec).list; + __get_cpu_var(tasklet_vec).list = NULL; local_irq_enable(); - while (list) { - struct tasklet_struct *t = list; + __tasklet_action(a, list); +} - list = list->next; +static void tasklet_hi_action(struct softirq_action *a) +{ + struct tasklet_struct *list; - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } + local_irq_disable(); + list = __get_cpu_var(tasklet_hi_vec).list; + __get_cpu_var(tasklet_hi_vec).list = NULL; + local_irq_enable(); - local_irq_disable(); - t->next = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = t; - __raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_enable(); - } + __tasklet_action(a, list); } - void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data) { @@ -468,7 +715,7 @@ void tasklet_kill(struct tasklet_struct while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { do - yield(); + msleep(1); while (test_bit(TASKLET_STATE_SCHED, &t->state)); } tasklet_unlock_wait(t); @@ -483,33 +730,98 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); } -static int ksoftirqd(void * __bind_cpu) +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) + +void tasklet_unlock_wait(struct tasklet_struct *t) { + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { + /* + * Hack for now to avoid this busy-loop: + */ +#ifdef CONFIG_PREEMPT_RT + msleep(1); +#else + barrier(); +#endif + } +} +EXPORT_SYMBOL(tasklet_unlock_wait); + +#endif + +static int ksoftirqd(void * __data) +{ + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2 }; + struct softirqdata *data = __data; + u32 softirq_mask = (1 << data->nr); + struct softirq_action *h; + int cpu = data->cpu; + +#ifdef CONFIG_PREEMPT_SOFTIRQS + init_waitqueue_head(&data->wait); +#endif + + sys_sched_setscheduler(current->pid, SCHED_FIFO, ¶m); + current->flags |= PF_SOFTIRQ; set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { preempt_disable(); - if (!local_softirq_pending()) { - preempt_enable_no_resched(); + if (!(local_softirq_pending() & softirq_mask)) { +sleep_more: + __preempt_enable_no_resched(); schedule(); preempt_disable(); } __set_current_state(TASK_RUNNING); - while (local_softirq_pending()) { +#ifdef CONFIG_PREEMPT_SOFTIRQS + data->running = 1; +#endif + + while (local_softirq_pending() & softirq_mask) { /* Preempt disable stops cpu going offline. If already offline, we'll be on wrong CPU: don't process */ - if (cpu_is_offline((long)__bind_cpu)) + if (cpu_is_offline(cpu)) goto wait_to_die; - do_softirq(); - preempt_enable_no_resched(); + + local_irq_disable(); + /* + * Is the softirq already being executed by + * a hardirq context? + */ + if (per_cpu(softirq_running, cpu) & softirq_mask) { + local_irq_enable(); + set_current_state(TASK_INTERRUPTIBLE); + goto sleep_more; + } + per_cpu(softirq_running, cpu) |= softirq_mask; + __preempt_enable_no_resched(); + set_softirq_pending(local_softirq_pending() & ~softirq_mask); + local_bh_disable(); + local_irq_enable(); + + h = &softirq_vec[data->nr]; + if (h) + h->action(h); + rcu_bh_qsctr_inc(data->cpu); + + local_irq_disable(); + per_cpu(softirq_running, cpu) &= ~softirq_mask; + _local_bh_enable(); + local_irq_enable(); + cond_resched(); preempt_disable(); } preempt_enable(); set_current_state(TASK_INTERRUPTIBLE); +#ifdef CONFIG_PREEMPT_SOFTIRQS + data->running = 0; + wake_up(&data->wait); +#endif } __set_current_state(TASK_RUNNING); return 0; @@ -556,7 +868,7 @@ void tasklet_kill_immediate(struct taskl BUG(); } -static void takeover_tasklets(unsigned int cpu) +void takeover_tasklets(unsigned int cpu) { struct tasklet_struct **i; @@ -578,49 +890,82 @@ static void takeover_tasklets(unsigned i } #endif /* CONFIG_HOTPLUG_CPU */ +static const char *softirq_names [] = +{ + [HI_SOFTIRQ] = "high", + [SCHED_SOFTIRQ] = "sched", + [TIMER_SOFTIRQ] = "timer", + [NET_TX_SOFTIRQ] = "net-tx", + [NET_RX_SOFTIRQ] = "net-rx", + [BLOCK_SOFTIRQ] = "block", + [TASKLET_SOFTIRQ] = "tasklet", +#ifdef CONFIG_HIGH_RES_TIMERS + [HRTIMER_SOFTIRQ] = "hrtimer", +#endif + [RCU_SOFTIRQ] = "rcu", +}; + static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int hotcpu = (unsigned long)hcpu; + int hotcpu = (unsigned long)hcpu, i; struct task_struct *p; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); - if (IS_ERR(p)) { - printk("ksoftirqd for %i failed\n", hotcpu); - return NOTIFY_BAD; - } - kthread_bind(p, hotcpu); - per_cpu(ksoftirqd, hotcpu) = p; - break; + for (i = 0; i < MAX_SOFTIRQ; i++) { + per_cpu(ksoftirqd, hotcpu)[i].nr = i; + per_cpu(ksoftirqd, hotcpu)[i].cpu = hotcpu; + per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; + } + for (i = 0; i < MAX_SOFTIRQ; i++) { + p = kthread_create(ksoftirqd, + &per_cpu(ksoftirqd, hotcpu)[i], + "softirq-%s/%d", softirq_names[i], + hotcpu); + if (IS_ERR(p)) { + printk("ksoftirqd %d for %i failed\n", i, + hotcpu); + return NOTIFY_BAD; + } + kthread_bind(p, hotcpu); + per_cpu(ksoftirqd, hotcpu)[i].tsk = p; + } + break; + break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(ksoftirqd, hotcpu)); + for (i = 0; i < MAX_SOFTIRQ; i++) + wake_up_process(per_cpu(ksoftirqd, hotcpu)[i].tsk); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(ksoftirqd, hotcpu)) - break; - /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(ksoftirqd, hotcpu), - any_online_cpu(cpu_online_map)); +#if 0 + for (i = 0; i < MAX_SOFTIRQ; i++) { + if (!per_cpu(ksoftirqd, hotcpu)[i].tsk) + continue; + kthread_bind(per_cpu(ksoftirqd, hotcpu)[i].tsk, + any_online_cpu(cpu_online_map)); + } +#endif case CPU_DEAD: - case CPU_DEAD_FROZEN: { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - - p = per_cpu(ksoftirqd, hotcpu); - per_cpu(ksoftirqd, hotcpu) = NULL; - sched_setscheduler(p, SCHED_FIFO, ¶m); - kthread_stop(p); + case CPU_DEAD_FROZEN: + for (i = 0; i < MAX_SOFTIRQ; i++) { + struct sched_param param; + + param.sched_priority = MAX_RT_PRIO-1; + p = per_cpu(ksoftirqd, hotcpu)[i].tsk; + sched_setscheduler(p, SCHED_FIFO, ¶m); + per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; + kthread_stop(p); + } takeover_tasklets(hotcpu); break; - } #endif /* CONFIG_HOTPLUG_CPU */ - } + } return NOTIFY_OK; } @@ -639,6 +984,34 @@ __init int spawn_ksoftirqd(void) return 0; } + +#ifdef CONFIG_PREEMPT_SOFTIRQS + +int softirq_preemption = 1; + +EXPORT_SYMBOL(softirq_preemption); + +/* + * Real-Time Preemption depends on softirq threading: + */ +#ifndef CONFIG_PREEMPT_RT + +static int __init softirq_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + softirq_preemption = 0; + else + get_option(&str, &softirq_preemption); + if (!softirq_preemption) + printk("turning off softirq preemption!\n"); + + return 1; +} + +__setup("softirq-preempt=", softirq_preempt_setup); +#endif +#endif + #ifdef CONFIG_SMP /* * Call a function on all processors Index: linux-2.6.24-rc8-rt1/include/linux/hardirq.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/hardirq.h 2008-01-16 22:27:41.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/hardirq.h 2008-01-16 22:28:49.000000000 -0500 @@ -41,23 +41,25 @@ # error HARDIRQ_BITS is too low! #endif #endif +#define PREEMPT_ACTIVE_BITS 1 -#define PREEMPT_SHIFT 0 -#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) -#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) - -#define __IRQ_MASK(x) ((1UL << (x))-1) - -#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) -#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) -#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) - -#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) -#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) -#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) +#define PREEMPT_SHIFT 0 +#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) +#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) +#define PREEMPT_ACTIVE_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) + +#define __IRQ_MASK(x) ((1UL << (x))-1) + +#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) +#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) +#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) + +#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) +#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) +#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS)) -#error PREEMPT_ACTIVE is too low! +# error PREEMPT_ACTIVE is too low! #endif #define hardirq_count() (preempt_count() & HARDIRQ_MASK) @@ -68,11 +70,13 @@ * Are we doing bottom half or hardware interrupt processing? * Are we in a softirq context? Interrupt context? */ -#define in_irq() (hardirq_count()) -#define in_softirq() (softirq_count()) -#define in_interrupt() (irq_count()) - -#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) +#define in_irq() (hardirq_count() || (current->flags & PF_HARDIRQ)) +#define in_softirq() (softirq_count() || (current->flags & PF_SOFTIRQ)) +#define in_interrupt() (irq_count()) + +#if defined(CONFIG_PREEMPT) && \ + !defined(CONFIG_PREEMPT_BKL) && \ + !defined(CONFIG_PREEMPT_RT) # define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != kernel_locked()) #else # define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) @@ -113,6 +117,14 @@ static inline void account_system_vtime( } #endif +#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ) +extern void rcu_irq_enter(void); +extern void rcu_irq_exit(void); +#else +# define rcu_irq_enter() do { } while (0) +# define rcu_irq_exit() do { } while (0) +#endif /* CONFIG_PREEMPT_RCU */ + /* * It is safe to do non-atomic ops on ->hardirq_context, * because NMI handlers may not preempt and the ops are @@ -121,6 +133,7 @@ static inline void account_system_vtime( */ #define __irq_enter() \ do { \ + rcu_irq_enter(); \ account_system_vtime(current); \ add_preempt_count(HARDIRQ_OFFSET); \ trace_hardirq_enter(); \ @@ -139,6 +152,7 @@ extern void irq_enter(void); trace_hardirq_exit(); \ account_system_vtime(current); \ sub_preempt_count(HARDIRQ_OFFSET); \ + rcu_irq_exit(); \ } while (0) /* Index: linux-2.6.24-rc8-rt1/include/asm-arm/atomic.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-arm/atomic.h 2008-01-16 22:27:41.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-arm/atomic.h 2008-01-16 22:28:23.000000000 -0500 @@ -114,6 +114,46 @@ static inline void atomic_clear_mask(uns : "cc"); } +/* + * Atomic compare and exchange. + */ +#define __HAVE_ARCH_CMPXCHG 1 + +extern unsigned long wrong_size_cmpxchg(volatile void *ptr); + +static inline unsigned long __cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + volatile unsigned long *p = ptr; + + if (size == 4) { + unsigned long oldval, res; + + do { + __asm__ __volatile__("@ atomic_cmpxchg\n" + "ldrex %1, [%2]\n" + "mov %0, #0\n" + "teq %1, %3\n" + "strexeq %0, %4, [%2]\n" + : "=&r" (res), "=&r" (oldval) + : "r" (p), "Ir" (old), "r" (new) + : "cc"); + } while (res); + + return oldval; + } else + return wrong_size_cmpxchg(ptr); +} + +#define cmpxchg(ptr,o,n) \ +({ \ + __typeof__(*(ptr)) _o_ = (o); \ + __typeof__(*(ptr)) _n_ = (n); \ + (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ + (unsigned long)_n_, sizeof(*(ptr))); \ +}) + #else /* ARM_ARCH_6 */ #include @@ -173,6 +213,41 @@ static inline void atomic_clear_mask(uns raw_local_irq_restore(flags); } +#ifndef CONFIG_SMP +/* + * Atomic compare and exchange. + */ +#define __HAVE_ARCH_CMPXCHG 1 + +extern unsigned long wrong_size_cmpxchg(volatile void *ptr); + +static inline unsigned long __cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long flags, prev; + volatile unsigned long *p = ptr; + + if (size == 4) { + raw_local_irq_save(flags); + if ((prev = *p) == old) + *p = new; + raw_local_irq_restore(flags); + return(prev); + } else + return wrong_size_cmpxchg(ptr); +} + +#define cmpxchg(ptr,o,n) \ +({ \ + __typeof__(*(ptr)) _o_ = (o); \ + __typeof__(*(ptr)) _n_ = (n); \ + (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ + (unsigned long)_n_, sizeof(*(ptr))); \ +}) + +#endif + #endif /* __LINUX_ARM_ARCH__ */ #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) Index: linux-2.6.24-rc8-rt1/include/asm-arm/futex.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-arm/futex.h 2008-01-16 22:27:41.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-arm/futex.h 2008-01-16 22:28:39.000000000 -0500 @@ -1,6 +1,125 @@ -#ifndef _ASM_FUTEX_H -#define _ASM_FUTEX_H +#ifndef _ASM_ARM_FUTEX_H +#define _ASM_ARM_FUTEX_H -#include +#ifdef __KERNEL__ +#include +#include +#include + +extern raw_spinlock_t futex_atomic_lock; + +#define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \ + __asm__ __volatile__ ( \ + "1: ldrt %1, [%2] \n" \ + insn \ + "2: strt %0, [%2] \n" \ + " mov %0, #0 \n" \ + "3: \n" \ + " .section __ex_table, \"a\" \n" \ + " .align 3 \n" \ + " .long 1b, 4f, 2b, 4f \n" \ + " .previous \n" \ + " .section .fixup,\"ax\" \n" \ + "4: mov %0, %4 \n" \ + " b 3b \n" \ + " .previous" \ + : "=&r" (ret), "=&r" (oldval) \ + : "r" (uaddr), "r" (oparg), "Ir" (-EFAULT) \ + : "cc", "memory") + +static inline int +futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +{ + int op = (encoded_op >> 28) & 7; + int cmp = (encoded_op >> 24) & 15; + int oparg = (encoded_op << 8) >> 20; + int cmparg = (encoded_op << 20) >> 20; + int oldval = 0, ret; + if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) + oparg = 1 << oparg; + + if (!access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + return -EFAULT; + + pagefault_disable(); + + spin_lock(&futex_atomic_lock); + + switch (op) { + case FUTEX_OP_SET: + __futex_atomic_op(" mov %0, %3\n", + ret, oldval, uaddr, oparg); + break; + case FUTEX_OP_ADD: + __futex_atomic_op(" add %0, %1, %3\n", + ret, oldval, uaddr, oparg); + break; + case FUTEX_OP_OR: + __futex_atomic_op(" orr %0, %1, %3\n", + ret, oldval, uaddr, oparg); + break; + case FUTEX_OP_ANDN: + __futex_atomic_op(" and %0, %1, %3\n", + ret, oldval, uaddr, oparg); + break; + case FUTEX_OP_XOR: + __futex_atomic_op(" eor %0, %1, %3\n", + ret, oldval, uaddr, oparg); + break; + default: + ret = -ENOSYS; + } + + spin_unlock(&futex_atomic_lock); + + pagefault_enable(); + + if (!ret) { + switch (cmp) { + case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; + case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; + case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; + case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; + case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; + case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; + default: ret = -ENOSYS; + } + } + return ret; +} + +static inline int +futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +{ + int val; + + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + return -EFAULT; + + spin_lock(&futex_atomic_lock); + + __asm__ __volatile__( "@futex_atomic_cmpxchg_inatomic \n" + "1: ldrt %0, [%3] \n" + " teq %0, %1 \n" + "2: streqt %2, [%3] \n" + "3: \n" + " .section __ex_table, \"a\" \n" + " .align 3 \n" + " .long 1b, 4f, 2b, 4f \n" + " .previous \n" + " .section .fixup,\"ax\" \n" + "4: mov %0, %4 \n" + " b 3b \n" + " .previous" + : "=&r" (val) + : "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT) + : "cc"); + + spin_unlock(&futex_atomic_lock); + + return val; +} + +#endif #endif Index: linux-2.6.24-rc8-rt1/arch/arm/kernel/process.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/kernel/process.c 2008-01-16 22:27:41.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/kernel/process.c 2008-01-16 22:28:40.000000000 -0500 @@ -37,6 +37,8 @@ #include #include +DEFINE_RAW_SPINLOCK(futex_atomic_lock); + static const char *processor_modes[] = { "USER_26", "FIQ_26" , "IRQ_26" , "SVC_26" , "UK4_26" , "UK5_26" , "UK6_26" , "UK7_26" , "UK8_26" , "UK9_26" , "UK10_26", "UK11_26", "UK12_26", "UK13_26", "UK14_26", "UK15_26", @@ -134,7 +136,7 @@ static void default_idle(void) cpu_relax(); else { local_irq_disable(); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { timer_dyn_reprogram(); arch_idle(); } @@ -166,13 +168,17 @@ void cpu_idle(void) idle = default_idle; leds_event(led_idle_start); tick_nohz_stop_sched_tick(); - while (!need_resched()) + while (!need_resched() && !need_resched_delayed()) idle(); leds_event(led_idle_end); + local_irq_disable(); + trace_preempt_exit_idle(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + trace_preempt_enter_idle(); + local_irq_enable(); } } Index: linux-2.6.24-rc8-rt1/include/linux/bottom_half.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/bottom_half.h 2008-01-16 22:27:41.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/bottom_half.h 2008-01-16 22:29:08.000000000 -0500 @@ -1,10 +1,17 @@ #ifndef _LINUX_BH_H #define _LINUX_BH_H +#ifdef CONFIG_PREEMPT_HARDIRQS +# define local_bh_disable() do { } while (0) +# define __local_bh_disable(ip) do { } while (0) +# define _local_bh_enable() do { } while (0) +# define local_bh_enable() do { } while (0) +# define local_bh_enable_ip(ip) do { } while (0) +#else extern void local_bh_disable(void); -extern void __local_bh_enable(void); extern void _local_bh_enable(void); extern void local_bh_enable(void); extern void local_bh_enable_ip(unsigned long ip); +#endif #endif /* _LINUX_BH_H */ Index: linux-2.6.24-rc8-rt1/include/linux/irq.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/irq.h 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/irq.h 2008-01-16 22:28:52.000000000 -0500 @@ -19,10 +19,12 @@ #include #include #include +#include #include #include #include +#include struct irq_desc; typedef void fastcall (*irq_flow_handler_t)(unsigned int irq, @@ -61,6 +63,7 @@ typedef void fastcall (*irq_flow_handler #define IRQ_WAKEUP 0x00100000 /* IRQ triggers system wakeup */ #define IRQ_MOVE_PENDING 0x00200000 /* need to re-target IRQ destination */ #define IRQ_NO_BALANCING 0x00400000 /* IRQ is excluded from balancing */ +#define IRQ_NODELAY 0x40000000 /* IRQ must run immediately */ #ifdef CONFIG_IRQ_PER_CPU # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) @@ -141,6 +144,8 @@ struct irq_chip { * @irq_count: stats field to detect stalled irqs * @irqs_unhandled: stats field for spurious unhandled interrupts * @last_unhandled: aging timer for unhandled count + * @thread: Thread pointer for threaded preemptible irq handling + * @wait_for_handler: Waitqueue to wait for a running preemptible handler * @lock: locking for SMP * @affinity: IRQ affinity on SMP * @cpu: cpu index useful for balancing @@ -163,7 +168,10 @@ struct irq_desc { unsigned int irq_count; /* For detecting broken IRQs */ unsigned int irqs_unhandled; unsigned long last_unhandled; /* Aging timer for unhandled count */ - spinlock_t lock; + struct task_struct *thread; + wait_queue_head_t wait_for_handler; + cycles_t timestamp; + raw_spinlock_t lock; #ifdef CONFIG_SMP cpumask_t affinity; unsigned int cpu; @@ -394,7 +402,21 @@ extern int set_irq_msi(unsigned int irq, #define get_irq_data(irq) (irq_desc[irq].handler_data) #define get_irq_msi(irq) (irq_desc[irq].msi_desc) -#endif /* CONFIG_GENERIC_HARDIRQS */ +/* Early initialization of irqs */ +extern void early_init_hardirqs(void); + +#if defined(CONFIG_PREEMPT_HARDIRQS) +extern void init_hardirqs(void); +#else +static inline void init_hardirqs(void) { } +#endif + +#else /* end GENERIC HARDIRQS */ + +static inline void early_init_hardirqs(void) { } +static inline void init_hardirqs(void) { } + +#endif /* !CONFIG_GENERIC_HARDIRQS */ #endif /* !CONFIG_S390 */ Index: linux-2.6.24-rc8-rt1/kernel/irq/autoprobe.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/irq/autoprobe.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/irq/autoprobe.c 2008-01-16 22:28:24.000000000 -0500 @@ -7,6 +7,7 @@ */ #include +#include #include #include #include Index: linux-2.6.24-rc8-rt1/kernel/irq/chip.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/irq/chip.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/irq/chip.c 2008-01-16 22:29:14.000000000 -0500 @@ -269,8 +269,10 @@ static inline void mask_ack_irq(struct i if (desc->chip->mask_ack) desc->chip->mask_ack(irq); else { - desc->chip->mask(irq); - desc->chip->ack(irq); + if (desc->chip->mask) + desc->chip->mask(irq); + if (desc->chip->ack) + desc->chip->ack(irq); } } @@ -295,8 +297,10 @@ handle_simple_irq(unsigned int irq, stru spin_lock(&desc->lock); - if (unlikely(desc->status & IRQ_INPROGRESS)) + if (unlikely(desc->status & IRQ_INPROGRESS)) { + desc->status |= IRQ_PENDING; goto out_unlock; + } desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_cpu(cpu).irqs[irq]++; @@ -305,6 +309,11 @@ handle_simple_irq(unsigned int irq, stru goto out_unlock; desc->status |= IRQ_INPROGRESS; + /* + * hardirq redirection to the irqd process context: + */ + if (redirect_hardirq(desc)) + goto out_unlock; spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); @@ -313,6 +322,8 @@ handle_simple_irq(unsigned int irq, stru spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; + if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + desc->chip->unmask(irq); out_unlock: spin_unlock(&desc->lock); } @@ -351,6 +362,13 @@ handle_level_irq(unsigned int irq, struc goto out_unlock; desc->status |= IRQ_INPROGRESS; + + /* + * hardirq redirection to the irqd process context: + */ + if (redirect_hardirq(desc)) + goto out_unlock; + spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); @@ -384,18 +402,16 @@ handle_fasteoi_irq(unsigned int irq, str spin_lock(&desc->lock); - if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_cpu(cpu).irqs[irq]++; /* - * If its disabled or no action available + * If it's running, disabled or no action available * then mask it and get out of here: */ action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) { + if (unlikely(!action || (desc->status & (IRQ_INPROGRESS | + IRQ_DISABLED)))) { desc->status |= IRQ_PENDING; if (desc->chip->mask) desc->chip->mask(irq); @@ -403,6 +419,15 @@ handle_fasteoi_irq(unsigned int irq, str } desc->status |= IRQ_INPROGRESS; + /* + * In the threaded case we fall back to a mask+eoi sequence: + */ + if (redirect_hardirq(desc)) { + if (desc->chip->mask) + desc->chip->mask(irq); + goto out; + } + desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); @@ -412,9 +437,10 @@ handle_fasteoi_irq(unsigned int irq, str spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; + if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + desc->chip->unmask(irq); out: desc->chip->eoi(irq); - spin_unlock(&desc->lock); } @@ -463,6 +489,12 @@ handle_edge_irq(unsigned int irq, struct /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; + /* + * hardirq redirection to the irqd process context: + */ + if (redirect_hardirq(desc)) + goto out_unlock; + do { struct irqaction *action = desc->action; irqreturn_t action_ret; Index: linux-2.6.24-rc8-rt1/kernel/irq/internals.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/irq/internals.h 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/irq/internals.h 2008-01-16 22:28:24.000000000 -0500 @@ -10,6 +10,10 @@ extern void irq_chip_set_defaults(struct /* Set default handler: */ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); +extern int redirect_hardirq(struct irq_desc *desc); + +void recalculate_desc_flags(struct irq_desc *desc); + #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq); extern void register_handler_proc(unsigned int irq, struct irqaction *action); Index: linux-2.6.24-rc8-rt1/kernel/irq/manage.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/irq/manage.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/irq/manage.c 2008-01-16 22:29:30.000000000 -0500 @@ -8,8 +8,10 @@ */ #include -#include #include +#include +#include +#include #include #include "internals.h" @@ -41,8 +43,12 @@ void synchronize_irq(unsigned int irq) * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ - while (desc->status & IRQ_INPROGRESS) - cpu_relax(); + if (hardirq_preemption && !(desc->status & IRQ_NODELAY)) + wait_event(desc->wait_for_handler, + !(desc->status & IRQ_INPROGRESS)); + else + while (desc->status & IRQ_INPROGRESS) + cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ spin_lock_irqsave(&desc->lock, flags); @@ -185,6 +191,14 @@ void enable_irq(unsigned int irq) desc->depth--; } spin_unlock_irqrestore(&desc->lock, flags); +#ifdef CONFIG_HARDIRQS_SW_RESEND + /* + * Do a bh disable/enable pair to trigger any pending + * irq resend logic: + */ + local_bh_disable(); + local_bh_enable(); +#endif } EXPORT_SYMBOL(enable_irq); @@ -234,6 +248,21 @@ int set_irq_wake(unsigned int irq, unsig EXPORT_SYMBOL(set_irq_wake); /* + * If any action has IRQF_NODELAY then turn IRQ_NODELAY on: + */ +void recalculate_desc_flags(struct irq_desc *desc) +{ + struct irqaction *action; + + desc->status &= ~IRQ_NODELAY; + for (action = desc->action ; action; action = action->next) + if (action->flags & IRQF_NODELAY) + desc->status |= IRQ_NODELAY; +} + +static int start_irq_thread(int irq, struct irq_desc *desc); + +/* * Internal function that tells the architecture code whether a * particular irq has been exclusively allocated or is available * for driver use. @@ -298,6 +327,9 @@ int setup_irq(unsigned int irq, struct i rand_initialize_irq(irq); } + if (!(new->flags & IRQF_NODELAY)) + if (start_irq_thread(irq, desc)) + return -ENOMEM; /* * The following block of code has to be executed atomically */ @@ -338,6 +370,11 @@ int setup_irq(unsigned int irq, struct i if (new->flags & IRQF_NOBALANCING) desc->status |= IRQ_NO_BALANCING; + /* + * Propagate any possible IRQF_NODELAY flag into IRQ_NODELAY: + */ + recalculate_desc_flags(desc); + if (!shared) { irq_chip_set_defaults(desc->chip); @@ -384,7 +421,7 @@ int setup_irq(unsigned int irq, struct i new->irq = irq; register_irq_proc(irq); - new->dir = NULL; + new->dir = new->threaded = NULL; register_handler_proc(irq, new); return 0; @@ -455,6 +492,7 @@ void free_irq(unsigned int irq, void *de else desc->chip->disable(irq); } + recalculate_desc_flags(desc); spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -470,9 +508,9 @@ void free_irq(unsigned int irq, void *de * parallel with our fake */ if (action->flags & IRQF_SHARED) { - local_irq_save(flags); + local_irq_save_nort(flags); action->handler(irq, dev_id); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #endif kfree(action); @@ -564,9 +602,9 @@ int request_irq(unsigned int irq, irq_ha */ unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); handler(irq, dev_id); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #endif @@ -577,3 +615,264 @@ int request_irq(unsigned int irq, irq_ha return retval; } EXPORT_SYMBOL(request_irq); + +#ifdef CONFIG_PREEMPT_HARDIRQS + +int hardirq_preemption = 1; + +EXPORT_SYMBOL(hardirq_preemption); + +/* + * Real-Time Preemption depends on hardirq threading: + */ +#ifndef CONFIG_PREEMPT_RT + +static int __init hardirq_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + hardirq_preemption = 0; + else + get_option(&str, &hardirq_preemption); + if (!hardirq_preemption) + printk("turning off hardirq preemption!\n"); + + return 1; +} + +__setup("hardirq-preempt=", hardirq_preempt_setup); + +#endif + +/* + * threaded simple handler + */ +static void thread_simple_irq(irq_desc_t *desc) +{ + struct irqaction *action = desc->action; + unsigned int irq = desc - irq_desc; + irqreturn_t action_ret; + + do { + if (!action || desc->depth) + break; + desc->status &= ~IRQ_PENDING; + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, action); + cond_resched_hardirq_context(); + spin_lock_irq(&desc->lock); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + } while (desc->status & IRQ_PENDING); + desc->status &= ~IRQ_INPROGRESS; +} + +/* + * threaded level type irq handler + */ +static void thread_level_irq(irq_desc_t *desc) +{ + unsigned int irq = desc - irq_desc; + + thread_simple_irq(desc); + if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + desc->chip->unmask(irq); +} + +/* + * threaded fasteoi type irq handler + */ +static void thread_fasteoi_irq(irq_desc_t *desc) +{ + unsigned int irq = desc - irq_desc; + + thread_simple_irq(desc); + if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + desc->chip->unmask(irq); +} + +/* + * threaded edge type IRQ handler + */ +static void thread_edge_irq(irq_desc_t *desc) +{ + unsigned int irq = desc - irq_desc; + + do { + struct irqaction *action = desc->action; + irqreturn_t action_ret; + + if (unlikely(!action)) { + desc->status &= ~IRQ_INPROGRESS; + desc->chip->mask(irq); + return; + } + + /* + * When another irq arrived while we were handling + * one, we could have masked the irq. + * Renable it, if it was not disabled in meantime. + */ + if (unlikely(((desc->status & (IRQ_PENDING | IRQ_MASKED)) == + (IRQ_PENDING | IRQ_MASKED)) && !desc->depth)) + desc->chip->unmask(irq); + + desc->status &= ~IRQ_PENDING; + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, action); + spin_lock_irq(&desc->lock); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + } while ((desc->status & IRQ_PENDING) && !desc->depth); + + desc->status &= ~IRQ_INPROGRESS; +} + +/* + * threaded edge type IRQ handler + */ +static void thread_do_irq(irq_desc_t *desc) +{ + unsigned int irq = desc - irq_desc; + + do { + struct irqaction *action = desc->action; + irqreturn_t action_ret; + + if (unlikely(!action)) { + desc->status &= ~IRQ_INPROGRESS; + desc->chip->disable(irq); + return; + } + + desc->status &= ~IRQ_PENDING; + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, action); + spin_lock_irq(&desc->lock); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + } while ((desc->status & IRQ_PENDING) && !desc->depth); + + desc->status &= ~IRQ_INPROGRESS; + desc->chip->end(irq); +} + +static void do_hardirq(struct irq_desc *desc) +{ + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + + if (!(desc->status & IRQ_INPROGRESS)) + goto out; + + if (desc->handle_irq == handle_simple_irq) + thread_simple_irq(desc); + else if (desc->handle_irq == handle_level_irq) + thread_level_irq(desc); + else if (desc->handle_irq == handle_fasteoi_irq) + thread_fasteoi_irq(desc); + else if (desc->handle_irq == handle_edge_irq) + thread_edge_irq(desc); + else + thread_do_irq(desc); + out: + spin_unlock_irqrestore(&desc->lock, flags); + + if (waitqueue_active(&desc->wait_for_handler)) + wake_up(&desc->wait_for_handler); +} + +static int do_irqd(void * __desc) +{ + struct sched_param param = { 0, }; + struct irq_desc *desc = __desc; + +#ifdef CONFIG_SMP + cpumask_t cpus_allowed; + + cpus_allowed = desc->affinity; +#endif + current->flags |= PF_NOFREEZE | PF_HARDIRQ; + + /* + * Set irq thread priority to SCHED_FIFO/50: + */ + param.sched_priority = MAX_USER_RT_PRIO/2; + + sys_sched_setscheduler(current->pid, SCHED_FIFO, ¶m); + + while (!kthread_should_stop()) { + local_irq_disable_nort(); + do { + set_current_state(TASK_INTERRUPTIBLE); + do_hardirq(desc); + } while (current->state == TASK_RUNNING); + + local_irq_enable_nort(); +#ifdef CONFIG_SMP + /* + * Did IRQ affinities change? + */ + if (!cpus_equal(cpus_allowed, desc->affinity)) + cpus_allowed = desc->affinity; +#endif + schedule(); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +static int ok_to_create_irq_threads; + +static int start_irq_thread(int irq, struct irq_desc *desc) +{ + if (desc->thread || !ok_to_create_irq_threads) + return 0; + + desc->thread = kthread_create(do_irqd, desc, "IRQ-%d", irq); + if (!desc->thread) { + printk(KERN_ERR "irqd: could not create IRQ thread %d!\n", irq); + return -ENOMEM; + } + + /* + * An interrupt may have come in before the thread pointer was + * stored in desc->thread; make sure the thread gets woken up in + * such a case: + */ + smp_mb(); + wake_up_process(desc->thread); + + return 0; +} + +void __init init_hardirqs(void) +{ + int i; + ok_to_create_irq_threads = 1; + + for (i = 0; i < NR_IRQS; i++) { + irq_desc_t *desc = irq_desc + i; + + if (desc->action && !(desc->status & IRQ_NODELAY)) + start_irq_thread(i, desc); + } +} + +#else + +static int start_irq_thread(int irq, struct irq_desc *desc) +{ + return 0; +} + +#endif + +void __init early_init_hardirqs(void) +{ + int i; + + for (i = 0; i < NR_IRQS; i++) + init_waitqueue_head(&irq_desc[i].wait_for_handler); +} Index: linux-2.6.24-rc8-rt1/kernel/irq/proc.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/irq/proc.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/irq/proc.c 2008-01-16 22:28:24.000000000 -0500 @@ -7,6 +7,8 @@ */ #include +#include +#include #include #include @@ -75,44 +77,6 @@ static int irq_affinity_write_proc(struc #endif -#define MAX_NAMELEN 128 - -static int name_unique(unsigned int irq, struct irqaction *new_action) -{ - struct irq_desc *desc = irq_desc + irq; - struct irqaction *action; - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action ; action; action = action->next) { - if ((action != new_action) && action->name && - !strcmp(new_action->name, action->name)) { - ret = 0; - break; - } - } - spin_unlock_irqrestore(&desc->lock, flags); - return ret; -} - -void register_handler_proc(unsigned int irq, struct irqaction *action) -{ - char name [MAX_NAMELEN]; - - if (!irq_desc[irq].dir || action->dir || !action->name || - !name_unique(irq, action)) - return; - - memset(name, 0, MAX_NAMELEN); - snprintf(name, MAX_NAMELEN, "%s", action->name); - - /* create /proc/irq/1234/handler/ */ - action->dir = proc_mkdir(name, irq_desc[irq].dir); -} - -#undef MAX_NAMELEN - #define MAX_NAMELEN 10 void register_irq_proc(unsigned int irq) @@ -150,10 +114,96 @@ void register_irq_proc(unsigned int irq) void unregister_handler_proc(unsigned int irq, struct irqaction *action) { + if (action->threaded) + remove_proc_entry(action->threaded->name, action->dir); if (action->dir) remove_proc_entry(action->dir->name, irq_desc[irq].dir); } +#ifndef CONFIG_PREEMPT_RT + +static int threaded_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + return sprintf(page, "%c\n", + ((struct irqaction *)data)->flags & IRQF_NODELAY ? '0' : '1'); +} + +static int threaded_write_proc(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + int c; + struct irqaction *action = data; + irq_desc_t *desc = irq_desc + action->irq; + + if (get_user(c, buffer)) + return -EFAULT; + if (c != '0' && c != '1') + return -EINVAL; + + spin_lock_irq(&desc->lock); + + if (c == '0') + action->flags |= IRQF_NODELAY; + if (c == '1') + action->flags &= ~IRQF_NODELAY; + recalculate_desc_flags(desc); + + spin_unlock_irq(&desc->lock); + + return 1; +} + +#endif + +#define MAX_NAMELEN 128 + +static int name_unique(unsigned int irq, struct irqaction *new_action) +{ + struct irq_desc *desc = irq_desc + irq; + struct irqaction *action; + + for (action = desc->action ; action; action = action->next) + if ((action != new_action) && action->name && + !strcmp(new_action->name, action->name)) + return 0; + return 1; +} + +void register_handler_proc(unsigned int irq, struct irqaction *action) +{ + char name [MAX_NAMELEN]; + + if (!irq_desc[irq].dir || action->dir || !action->name || + !name_unique(irq, action)) + return; + + memset(name, 0, MAX_NAMELEN); + snprintf(name, MAX_NAMELEN, "%s", action->name); + + /* create /proc/irq/1234/handler/ */ + action->dir = proc_mkdir(name, irq_desc[irq].dir); + + if (!action->dir) + return; +#ifndef CONFIG_PREEMPT_RT + { + struct proc_dir_entry *entry; + /* create /proc/irq/1234/handler/threaded */ + entry = create_proc_entry("threaded", 0600, action->dir); + if (!entry) + return; + entry->nlink = 1; + entry->data = (void *)action; + entry->read_proc = threaded_read_proc; + entry->write_proc = threaded_write_proc; + action->threaded = entry; + } +#endif +} + +#undef MAX_NAMELEN + void init_irq_proc(void) { int i; @@ -163,6 +213,9 @@ void init_irq_proc(void) if (!root_irq_dir) return; + /* create /proc/irq/prof_cpu_mask */ + create_prof_cpu_mask(root_irq_dir); + /* * Create entries for all existing IRQs. */ Index: linux-2.6.24-rc8-rt1/kernel/irq/spurious.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/irq/spurious.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/irq/spurious.c 2008-01-16 22:28:52.000000000 -0500 @@ -10,6 +10,10 @@ #include #include #include +#ifdef CONFIG_X86_IO_APIC +# include +# include +#endif static int irqfixup __read_mostly; @@ -55,9 +59,8 @@ static int misrouted_irq(int irq) } action = action->next; } - local_irq_disable(); /* Now clean up the flags */ - spin_lock(&desc->lock); + spin_lock_irq(&desc->lock); action = desc->action; /* @@ -203,6 +206,12 @@ void note_interrupt(unsigned int irq, st * The interrupt is stuck */ __report_bad_irq(irq, desc, action_ret); +#ifdef CONFIG_X86_IO_APIC + if (!sis_apic_bug) { + sis_apic_bug = 1; + printk(KERN_ERR "turning off IO-APIC fast mode.\n"); + } +#else /* * Now kill the IRQ */ @@ -210,6 +219,7 @@ void note_interrupt(unsigned int irq, st desc->status |= IRQ_DISABLED; desc->depth = 1; desc->chip->disable(irq); +#endif } desc->irqs_unhandled = 0; } @@ -228,6 +238,11 @@ __setup("noirqdebug", noirqdebug_setup); static int __init irqfixup_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT + printk(KERN_WARNING "irqfixup boot option not supported " + "w/ CONFIG_PREEMPT_RT\n"); + return 1; +#endif irqfixup = 1; printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); printk(KERN_WARNING "This may impact system performance.\n"); @@ -239,6 +254,11 @@ __setup("irqfixup", irqfixup_setup); static int __init irqpoll_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT + printk(KERN_WARNING "irqpoll boot option not supported " + "w/ CONFIG_PREEMPT_RT\n"); + return 1; +#endif irqfixup = 2; printk(KERN_WARNING "Misrouted IRQ fixup and polling support " "enabled\n"); Index: linux-2.6.24-rc8-rt1/include/linux/timer.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/timer.h 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/timer.h 2008-01-16 22:28:25.000000000 -0500 @@ -146,10 +146,12 @@ static inline void add_timer(struct time __mod_timer(timer, timer->expires); } -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) + extern int timer_pending_sync(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); extern int del_timer_sync(struct timer_list *timer); #else +# define timer_pending_sync(t) timer_pending(t) # define try_to_del_timer_sync(t) del_timer(t) # define del_timer_sync(t) del_timer(t) #endif Index: linux-2.6.24-rc8-rt1/kernel/timer.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/timer.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/timer.c 2008-01-16 22:29:16.000000000 -0500 @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -69,6 +70,7 @@ typedef struct tvec_root_s { struct tvec_t_base_s { spinlock_t lock; struct timer_list *running_timer; + wait_queue_head_t wait_for_running_timer; unsigned long timer_jiffies; tvec_root_t tv1; tvec_t tv2; @@ -249,9 +251,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_relative static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) { -#ifdef CONFIG_SMP base->running_timer = timer; -#endif } static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) @@ -395,7 +395,7 @@ int __mod_timer(struct timer_list *timer { tvec_base_t *base, *new_base; unsigned long flags; - int ret = 0; + int ret = 0, cpu; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -407,7 +407,8 @@ int __mod_timer(struct timer_list *timer ret = 1; } - new_base = __get_cpu_var(tvec_bases); + cpu = raw_smp_processor_id(); + new_base = per_cpu(tvec_bases, cpu); if (base != new_base) { /* @@ -456,6 +457,17 @@ void add_timer_on(struct timer_list *tim spin_unlock_irqrestore(&base->lock, flags); } +/* + * Wait for a running timer + */ +void wait_for_running_timer(struct timer_list *timer) +{ + tvec_base_t *base = timer->base; + + if (base->running_timer == timer) + wait_event(base->wait_for_running_timer, + base->running_timer != timer); +} /** * mod_timer - modify a timer's timeout @@ -527,7 +539,35 @@ int del_timer(struct timer_list *timer) EXPORT_SYMBOL(del_timer); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) +/* + * This function checks whether a timer is active and not running on any + * CPU. Upon successful (ret >= 0) exit the timer is not queued and the + * handler is not running on any CPU. + * + * It must not be called from interrupt contexts. + */ +int timer_pending_sync(struct timer_list *timer) +{ + tvec_base_t *base; + unsigned long flags; + int ret = -1; + + base = lock_timer_base(timer, &flags); + + if (base->running_timer == timer) + goto out; + + ret = 0; + if (timer_pending(timer)) + ret = 1; +out: + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + + /** * try_to_del_timer_sync - Try to deactivate a timer * @timer: timer do del @@ -584,7 +624,7 @@ int del_timer_sync(struct timer_list *ti int ret = try_to_del_timer_sync(timer); if (ret >= 0) return ret; - cpu_relax(); + wait_for_running_timer(timer); } } @@ -630,6 +670,20 @@ static inline void __run_timers(tvec_bas struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; + if (softirq_need_resched()) { + spin_unlock_irq(&base->lock); + wake_up(&base->wait_for_running_timer); + cond_resched_softirq_context(); + cpu_relax(); + spin_lock_irq(&base->lock); + /* + * We can simply continue after preemption, nobody + * else can touch timer_jiffies so 'index' is still + * valid. Any new jiffy will be taken care of in + * subsequent loops: + */ + } + /* * Cascade timers: */ @@ -657,18 +711,17 @@ static inline void __run_timers(tvec_bas int preempt_count = preempt_count(); fn(data); if (preempt_count != preempt_count()) { - printk(KERN_WARNING "huh, entered %p " - "with preempt_count %08x, exited" - " with %08x?\n", - fn, preempt_count, - preempt_count()); - BUG(); + print_symbol("BUG: unbalanced timer-handler preempt count in %s!\n", (unsigned long) fn); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + preempt_count() = preempt_count; } } + set_running_timer(base, NULL); + cond_resched_softirq_context(); spin_lock_irq(&base->lock); } } - set_running_timer(base, NULL); + wake_up(&base->wait_for_running_timer); spin_unlock_irq(&base->lock); } @@ -798,9 +851,22 @@ unsigned long get_next_timer_interrupt(u tvec_base_t *base = __get_cpu_var(tvec_bases); unsigned long expires; +#ifdef CONFIG_PREEMPT_RT + /* + * On PREEMPT_RT we cannot sleep here. If the trylock does not + * succeed then we return the worst-case 'expires in 1 tick' + * value: + */ + if (spin_trylock(&base->lock)) { + expires = __next_timer_interrupt(base); + spin_unlock(&base->lock); + } else + expires = now + 1; +#else spin_lock(&base->lock); expires = __next_timer_interrupt(base); spin_unlock(&base->lock); +#endif if (time_before_eq(expires, now)) return now; @@ -841,10 +907,10 @@ void update_process_times(int user_tick) /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); + scheduler_tick(); run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); - scheduler_tick(); run_posix_cpu_timers(p); } @@ -853,7 +919,30 @@ void update_process_times(int user_tick) */ static unsigned long count_active_tasks(void) { + /* + * On PREEMPT_RT, we are running in the timer softirq thread, + * so consider 1 less running tasks: + */ +#ifdef CONFIG_PREEMPT_RT + return (nr_active() - 1) * FIXED_1; +#else return nr_active() * FIXED_1; +#endif +} + +/* + * Nr of active tasks - counted in fixed-point numbers + */ +static unsigned long count_active_rt_tasks(void) +{ +#ifdef CONFIG_PREEMPT_RT + extern unsigned long rt_nr_running(void); + extern unsigned long rt_nr_uninterruptible(void); + + return (rt_nr_running() + rt_nr_uninterruptible()) * FIXED_1; +#else + return 0; +#endif } /* @@ -868,6 +957,8 @@ unsigned long avenrun[3]; EXPORT_SYMBOL(avenrun); +unsigned long avenrun_rt[3]; + /* * calc_load - given tick count, update the avenrun load estimates. * This is called while holding a write_lock on xtime_lock. @@ -876,49 +967,78 @@ static inline void calc_load(unsigned lo { unsigned long active_tasks; /* fixed-point */ static int count = LOAD_FREQ; +#ifdef CONFIG_PREEMPT_RT + unsigned long active_rt_tasks; /* fixed-point */ +#endif count -= ticks; if (unlikely(count < 0)) { active_tasks = count_active_tasks(); +#ifdef CONFIG_PREEMPT_RT + active_rt_tasks = count_active_rt_tasks(); +#endif do { CALC_LOAD(avenrun[0], EXP_1, active_tasks); CALC_LOAD(avenrun[1], EXP_5, active_tasks); CALC_LOAD(avenrun[2], EXP_15, active_tasks); +#ifdef CONFIG_PREEMPT_RT + CALC_LOAD(avenrun_rt[0], EXP_1, active_rt_tasks); + CALC_LOAD(avenrun_rt[1], EXP_5, active_rt_tasks); + CALC_LOAD(avenrun_rt[2], EXP_15, active_rt_tasks); +#endif count += LOAD_FREQ; + } while (count < 0); } } /* - * This function runs timers and the timer-tq in bottom half context. + * Called by the local, per-CPU timer interrupt on SMP. */ -static void run_timer_softirq(struct softirq_action *h) +void run_local_timers(void) { - tvec_base_t *base = __get_cpu_var(tvec_bases); - - hrtimer_run_queues(); - - if (time_after_eq(jiffies, base->timer_jiffies)) - __run_timers(base); + raise_softirq(TIMER_SOFTIRQ); + softlockup_tick(); } /* - * Called by the local, per-CPU timer interrupt on SMP. + * Time of day handling: */ -void run_local_timers(void) +static inline void update_times(void) { - raise_softirq(TIMER_SOFTIRQ); - softlockup_tick(); + static unsigned long last_tick = INITIAL_JIFFIES; + unsigned long ticks, flags; + + /* + * Dont take the xtime_lock from every CPU in + * every tick - only when needed: + */ + if (jiffies == last_tick) + return; + + write_seqlock_irqsave(&xtime_lock, flags); + ticks = jiffies - last_tick; + if (ticks) { + last_tick += ticks; + update_wall_time(); + calc_load(ticks); + } + write_sequnlock_irqrestore(&xtime_lock, flags); } + /* - * Called by the timer interrupt. xtime_lock must already be taken - * by the timer IRQ! + * This function runs timers and the timer-tq in bottom half context. */ -static inline void update_times(unsigned long ticks) +static void run_timer_softirq(struct softirq_action *h) { - update_wall_time(); - calc_load(ticks); + tvec_base_t *base = per_cpu(tvec_bases, raw_smp_processor_id()); + + update_times(); + hrtimer_run_queues(); + + if (time_after_eq(jiffies, base->timer_jiffies)) + __run_timers(base); } /* @@ -930,7 +1050,7 @@ static inline void update_times(unsigned void do_timer(unsigned long ticks) { jiffies_64 += ticks; - update_times(ticks); + timekeeping_accumulate(); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1262,6 +1382,7 @@ static int __cpuinit init_timers_cpu(int spin_lock_init(&base->lock); lockdep_set_class(&base->lock, base_lock_keys + cpu); + init_waitqueue_head(&base->wait_for_running_timer); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); @@ -1299,7 +1420,7 @@ static void __devinit migrate_timers(int old_base = per_cpu(tvec_bases, cpu); new_base = get_cpu_var(tvec_bases); - local_irq_disable(); + local_irq_disable_nort(); double_spin_lock(&new_base->lock, &old_base->lock, smp_processor_id() < cpu); @@ -1316,7 +1437,7 @@ static void __devinit migrate_timers(int double_spin_unlock(&new_base->lock, &old_base->lock, smp_processor_id() < cpu); - local_irq_enable(); + local_irq_enable_nort(); put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU */ Index: linux-2.6.24-rc8-rt1/kernel/itimer.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/itimer.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/itimer.c 2008-01-16 22:28:25.000000000 -0500 @@ -170,6 +170,7 @@ again: /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { spin_unlock_irq(&tsk->sighand->siglock); + hrtimer_wait_for_timer(&tsk->signal->real_timer); goto again; } expires = timeval_to_ktime(value->it_value); Index: linux-2.6.24-rc8-rt1/kernel/posix-timers.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/posix-timers.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/posix-timers.c 2008-01-16 22:28:25.000000000 -0500 @@ -807,6 +807,7 @@ retry: unlock_timer(timr, flag); if (error == TIMER_RETRY) { + hrtimer_wait_for_timer(&timr->it.real.timer); rtn = NULL; // We already got the old time... goto retry; } @@ -846,6 +847,7 @@ retry_delete: if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + hrtimer_wait_for_timer(&timer->it.real.timer); goto retry_delete; } @@ -878,6 +880,7 @@ retry_delete: if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + hrtimer_wait_for_timer(&timer->it.real.timer); goto retry_delete; } list_del(&timer->list); Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/i8259_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/i8259_32.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/i8259_32.c 2008-01-16 22:28:47.000000000 -0500 @@ -33,7 +33,7 @@ */ static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); static struct irq_chip i8259A_chip = { @@ -169,6 +169,8 @@ static void mask_and_ack_8259A(unsigned */ if (cached_irq_mask & irqmask) goto spurious_8259A_irq; + if (irq & 8) + outb(0x60+(irq&7),PIC_SLAVE_CMD); /* 'Specific EOI' to slave */ cached_irq_mask |= irqmask; handle_real_irq: @@ -296,10 +298,10 @@ void init_8259A(int auto_eoi) outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ - if (auto_eoi) /* master does Auto EOI */ - outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ + if (!auto_eoi) /* master expects normal EOI */ outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + else /* master does Auto EOI */ + outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ @@ -351,6 +353,7 @@ static irqreturn_t math_error_irq(int cp */ static struct irqaction fpu_irq = { .handler = math_error_irq, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "fpu", }; Index: linux-2.6.24-rc8-rt1/arch/x86/mach-default/setup.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/mach-default/setup.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/mach-default/setup.c 2008-01-16 22:28:25.000000000 -0500 @@ -37,6 +37,7 @@ void __init pre_intr_init_hook(void) */ static struct irqaction irq2 = { .handler = no_action, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "cascade", }; @@ -85,7 +86,7 @@ void __init trap_init_hook(void) static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "timer" }; Index: linux-2.6.24-rc8-rt1/arch/x86/mach-visws/visws_apic.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/mach-visws/visws_apic.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/mach-visws/visws_apic.c 2008-01-16 22:28:25.000000000 -0500 @@ -257,11 +257,13 @@ out_unlock: static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = IRQF_NODELAY, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = IRQF_NODELAY, }; Index: linux-2.6.24-rc8-rt1/arch/x86/mach-voyager/setup.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/mach-voyager/setup.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/mach-voyager/setup.c 2008-01-16 22:28:25.000000000 -0500 @@ -20,6 +20,7 @@ void __init pre_intr_init_hook(void) */ static struct irqaction irq2 = { .handler = no_action, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "cascade", }; @@ -46,7 +47,7 @@ void __init trap_init_hook(void) static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "timer" }; Index: linux-2.6.24-rc8-rt1/arch/mips/kernel/i8253.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/kernel/i8253.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/kernel/i8253.c 2008-01-16 22:28:26.000000000 -0500 @@ -100,7 +100,7 @@ static irqreturn_t timer_interrupt(int i static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "timer" }; Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/i8259_64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/i8259_64.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/i8259_64.c 2008-01-16 22:28:42.000000000 -0500 @@ -96,8 +96,8 @@ static void (*interrupt[NR_VECTORS - FIR */ static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); +DEFINE_RAW_SPINLOCK(i8259A_lock); static struct irq_chip i8259A_chip = { .name = "XT-PIC", @@ -397,6 +397,7 @@ device_initcall(i8259A_init_sysfs); static struct irqaction irq2 = { .handler = no_action, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "cascade", }; Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/time_64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/time_64.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/time_64.c 2008-01-16 22:28:26.000000000 -0500 @@ -259,7 +259,8 @@ static unsigned int __init tsc_calibrate static struct irqaction irq0 = { .handler = timer_event_interrupt, - .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, + .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | + IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "timer" }; Index: linux-2.6.24-rc8-rt1/arch/arm/common/time-acorn.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/common/time-acorn.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/common/time-acorn.c 2008-01-16 22:28:27.000000000 -0500 @@ -77,7 +77,7 @@ ioc_timer_interrupt(int irq, void *dev_i static struct irqaction ioc_timer_irq = { .name = "timer", - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NODELAY, .handler = ioc_timer_interrupt }; Index: linux-2.6.24-rc8-rt1/arch/arm/oprofile/op_model_xscale.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/oprofile/op_model_xscale.c 2008-01-16 22:27:40.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/oprofile/op_model_xscale.c 2008-01-16 22:28:27.000000000 -0500 @@ -381,8 +381,9 @@ static int xscale_pmu_start(void) { int ret; u32 pmnc = read_pmnc(); + int irq_flags = IRQF_DISABLED | IRQF_NODELAY; - ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, IRQF_DISABLED, + ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, irq_flags, "XScale PMU", (void *)results); if (ret < 0) { Index: linux-2.6.24-rc8-rt1/arch/powerpc/kernel/ppc_ksyms.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/kernel/ppc_ksyms.c 2008-01-16 22:27:39.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/kernel/ppc_ksyms.c 2008-01-16 22:28:44.000000000 -0500 @@ -15,7 +15,6 @@ #include #include -#include #include #include #include @@ -46,7 +45,7 @@ #include #ifdef CONFIG_PPC64 -EXPORT_SYMBOL(local_irq_restore); +EXPORT_SYMBOL(raw_local_irq_restore); #endif #ifdef CONFIG_PPC32 @@ -162,7 +161,6 @@ EXPORT_SYMBOL(screen_info); #ifdef CONFIG_PPC32 EXPORT_SYMBOL(timer_interrupt); -EXPORT_SYMBOL(irq_desc); EXPORT_SYMBOL(tb_ticks_per_jiffy); EXPORT_SYMBOL(console_drivers); EXPORT_SYMBOL(cacheable_memcpy); Index: linux-2.6.24-rc8-rt1/arch/powerpc/platforms/iseries/setup.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/platforms/iseries/setup.c 2008-01-16 22:27:39.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/platforms/iseries/setup.c 2008-01-16 22:28:27.000000000 -0500 @@ -564,12 +564,14 @@ static void iseries_shared_idle(void) { while (1) { tick_nohz_stop_sched_tick(); - while (!need_resched() && !hvlpevent_is_pending()) { + while (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) { local_irq_disable(); ppc64_runlatch_off(); /* Recheck with irqs off */ - if (!need_resched() && !hvlpevent_is_pending()) + if (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) yield_shared_processor(); HMT_medium(); Index: linux-2.6.24-rc8-rt1/arch/powerpc/platforms/pseries/setup.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/platforms/pseries/setup.c 2008-01-16 22:27:39.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/platforms/pseries/setup.c 2008-01-16 22:28:27.000000000 -0500 @@ -413,7 +413,8 @@ static void pseries_dedicated_idle_sleep set_thread_flag(TIF_POLLING_NRFLAG); while (get_tb() < start_snooze) { - if (need_resched() || cpu_is_offline(cpu)) + if (need_resched() || need_resched_delayed() || + cpu_is_offline(cpu)) goto out; ppc64_runlatch_off(); HMT_low(); @@ -424,7 +425,8 @@ static void pseries_dedicated_idle_sleep clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb(); local_irq_disable(); - if (need_resched() || cpu_is_offline(cpu)) + if (need_resched() || need_resched_delayed() || + cpu_is_offline(cpu)) goto out; } Index: linux-2.6.24-rc8-rt1/include/asm-powerpc/thread_info.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-powerpc/thread_info.h 2008-01-16 22:27:39.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-powerpc/thread_info.h 2008-01-16 22:29:15.000000000 -0500 @@ -121,9 +121,12 @@ static inline struct thread_info *curren #define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */ #define TIF_NOERROR 12 /* Force successful syscall return */ #define TIF_RESTORE_SIGMASK 13 /* Restore signal mask in do_signal */ -#define TIF_FREEZE 14 /* Freezing for suspend */ -#define TIF_RUNLATCH 15 /* Is the runlatch enabled? */ -#define TIF_ABI_PENDING 16 /* 32/64 bit switch needed */ +#define TIF_NEED_RESCHED_DELAYED \ + 14 /* reschedule on return to userspace */ +#define TIF_FREEZE 15 /* Freezing for suspend */ +#define TIF_RUNLATCH 16 /* Is the runlatch enabled? */ +#define TIF_ABI_PENDING 17 /* 32/64 bit switch needed */ + /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1< /* + * On !PREEMPT_RT, raw irq flags are used as a per-CPU locking + * mechanism. + * + * On PREEMPT_RT, we use per-CPU locks for this. That's why the + * calling convention is changed slightly: a new 'flags' argument + * is passed to 'irq disable/enable' - the PREEMPT_RT code stores + * the CPU number of the lock there. + */ +#ifndef CONFIG_PREEMPT_RT +# define slab_irq_disable(cpu) \ + do { local_irq_disable(); (cpu) = smp_processor_id(); } while (0) +# define slab_irq_enable(cpu) local_irq_enable() +# define slab_irq_save(flags, cpu) \ + do { local_irq_save(flags); (cpu) = smp_processor_id(); } while (0) +# define slab_irq_restore(flags, cpu) local_irq_restore(flags) +/* + * In the __GFP_WAIT case we enable/disable interrupts on !PREEMPT_RT, + * which has no per-CPU locking effect since we are holding the cache + * lock in that case already. + * + * (On PREEMPT_RT, these are NOPs, but we have to drop/get the irq locks.) + */ +# define slab_irq_disable_nort() local_irq_disable() +# define slab_irq_enable_nort() local_irq_enable() +# define slab_irq_disable_rt(flags) do { (void)(flags); } while (0) +# define slab_irq_enable_rt(flags) do { (void)(flags); } while (0) +# define slab_spin_lock_irq(lock, cpu) \ + do { spin_lock_irq(lock); (cpu) = smp_processor_id(); } while (0) +# define slab_spin_unlock_irq(lock, cpu) \ + spin_unlock_irq(lock) +# define slab_spin_lock_irqsave(lock, flags, cpu) \ + do { spin_lock_irqsave(lock, flags); (cpu) = smp_processor_id(); } while (0) +# define slab_spin_unlock_irqrestore(lock, flags, cpu) \ + do { spin_unlock_irqrestore(lock, flags); } while (0) +#else +DEFINE_PER_CPU_LOCKED(int, slab_irq_locks) = { 0, }; +# define slab_irq_disable(cpu) (void)get_cpu_var_locked(slab_irq_locks, &(cpu)) +# define slab_irq_enable(cpu) put_cpu_var_locked(slab_irq_locks, cpu) +# define slab_irq_save(flags, cpu) \ + do { slab_irq_disable(cpu); (void) (flags); } while (0) +# define slab_irq_restore(flags, cpu) \ + do { slab_irq_enable(cpu); (void) (flags); } while (0) +# define slab_irq_disable_rt(cpu) slab_irq_disable(cpu) +# define slab_irq_enable_rt(cpu) slab_irq_enable(cpu) +# define slab_irq_disable_nort() do { } while (0) +# define slab_irq_enable_nort() do { } while (0) +# define slab_spin_lock_irq(lock, cpu) \ + do { slab_irq_disable(cpu); spin_lock(lock); } while (0) +# define slab_spin_unlock_irq(lock, cpu) \ + do { spin_unlock(lock); slab_irq_enable(cpu); } while (0) +# define slab_spin_lock_irqsave(lock, flags, cpu) \ + do { slab_irq_disable(cpu); spin_lock_irqsave(lock, flags); } while (0) +# define slab_spin_unlock_irqrestore(lock, flags, cpu) \ + do { spin_unlock_irqrestore(lock, flags); slab_irq_enable(cpu); } while (0) +#endif + +/* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). * @@ -313,7 +370,7 @@ struct kmem_list3 __initdata initkmem_li static int drain_freelist(struct kmem_cache *cache, struct kmem_list3 *l3, int tofree); static void free_block(struct kmem_cache *cachep, void **objpp, int len, - int node); + int node, int *this_cpu); static int enable_cpucache(struct kmem_cache *cachep); static void cache_reap(struct work_struct *unused); @@ -757,9 +814,10 @@ int slab_is_available(void) static DEFINE_PER_CPU(struct delayed_work, reap_work); -static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) +static inline struct array_cache * +cpu_cache_get(struct kmem_cache *cachep, int this_cpu) { - return cachep->array[smp_processor_id()]; + return cachep->array[this_cpu]; } static inline struct kmem_cache *__find_general_cachep(size_t size, @@ -993,7 +1051,7 @@ static int transfer_objects(struct array #ifndef CONFIG_NUMA #define drain_alien_cache(cachep, alien) do { } while (0) -#define reap_alien(cachep, l3) do { } while (0) +#define reap_alien(cachep, l3, this_cpu) 0 static inline struct array_cache **alloc_alien_cache(int node, int limit) { @@ -1004,27 +1062,29 @@ static inline void free_alien_cache(stru { } -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +static inline int +cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu) { return 0; } static inline void *alternate_node_alloc(struct kmem_cache *cachep, - gfp_t flags) + gfp_t flags, int *this_cpu) { return NULL; } static inline void *____cache_alloc_node(struct kmem_cache *cachep, - gfp_t flags, int nodeid) + gfp_t flags, int nodeid, int *this_cpu) { return NULL; } #else /* CONFIG_NUMA */ -static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); -static void *alternate_node_alloc(struct kmem_cache *, gfp_t); +static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid, int *this_cpu); +static void *alternate_node_alloc(struct kmem_cache *, gfp_t, int *); static struct array_cache **alloc_alien_cache(int node, int limit) { @@ -1065,7 +1125,8 @@ static void free_alien_cache(struct arra } static void __drain_alien_cache(struct kmem_cache *cachep, - struct array_cache *ac, int node) + struct array_cache *ac, int node, + int *this_cpu) { struct kmem_list3 *rl3 = cachep->nodelists[node]; @@ -1079,7 +1140,7 @@ static void __drain_alien_cache(struct k if (rl3->shared) transfer_objects(rl3->shared, ac, ac->limit); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, this_cpu); ac->avail = 0; spin_unlock(&rl3->list_lock); } @@ -1088,38 +1149,42 @@ static void __drain_alien_cache(struct k /* * Called from cache_reap() to regularly drain alien caches round robin. */ -static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) +static int +reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3, int *this_cpu) { - int node = __get_cpu_var(reap_node); + int node = per_cpu(reap_node, *this_cpu); if (l3->alien) { struct array_cache *ac = l3->alien[node]; if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { - __drain_alien_cache(cachep, ac, node); + __drain_alien_cache(cachep, ac, node, this_cpu); spin_unlock_irq(&ac->lock); + return 1; } } + return 0; } static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) { - int i = 0; + int i = 0, this_cpu; struct array_cache *ac; unsigned long flags; for_each_online_node(i) { ac = alien[i]; if (ac) { - spin_lock_irqsave(&ac->lock, flags); - __drain_alien_cache(cachep, ac, i); - spin_unlock_irqrestore(&ac->lock, flags); + slab_spin_lock_irqsave(&ac->lock, flags, this_cpu); + __drain_alien_cache(cachep, ac, i, &this_cpu); + slab_spin_unlock_irqrestore(&ac->lock, flags, this_cpu); } } } -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +static inline int +cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu) { struct slab *slabp = virt_to_slab(objp); int nodeid = slabp->nodeid; @@ -1127,7 +1192,7 @@ static inline int cache_free_alien(struc struct array_cache *alien = NULL; int node; - node = numa_node_id(); + node = cpu_to_node(*this_cpu); /* * Make sure we are not freeing a object from another node to the array @@ -1143,13 +1208,13 @@ static inline int cache_free_alien(struc spin_lock(&alien->lock); if (unlikely(alien->avail == alien->limit)) { STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, alien, nodeid); + __drain_alien_cache(cachep, alien, nodeid, this_cpu); } alien->entry[alien->avail++] = objp; spin_unlock(&alien->lock); } else { spin_lock(&(cachep->nodelists[nodeid])->list_lock); - free_block(cachep, &objp, 1, nodeid); + free_block(cachep, &objp, 1, nodeid, this_cpu); spin_unlock(&(cachep->nodelists[nodeid])->list_lock); } return 1; @@ -1166,6 +1231,7 @@ static void __cpuinit cpuup_canceled(lon struct array_cache *nc; struct array_cache *shared; struct array_cache **alien; + int this_cpu; cpumask_t mask; mask = node_to_cpumask(node); @@ -1177,29 +1243,31 @@ static void __cpuinit cpuup_canceled(lon if (!l3) goto free_array_cache; - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); /* Free limit for this kmem_list3 */ l3->free_limit -= cachep->batchcount; if (nc) - free_block(cachep, nc->entry, nc->avail, node); + free_block(cachep, nc->entry, nc->avail, node, + &this_cpu); if (!cpus_empty(mask)) { - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, + this_cpu); goto free_array_cache; } shared = l3->shared; if (shared) { free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &this_cpu); l3->shared = NULL; } alien = l3->alien; l3->alien = NULL; - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); kfree(shared); if (alien) { @@ -1228,6 +1296,7 @@ static int __cpuinit cpuup_prepare(long struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); const int memsize = sizeof(struct kmem_list3); + int this_cpu; /* * We need to do this right in the beginning since @@ -1258,11 +1327,11 @@ static int __cpuinit cpuup_prepare(long cachep->nodelists[node] = l3; } - spin_lock_irq(&cachep->nodelists[node]->list_lock); + slab_spin_lock_irq(&cachep->nodelists[node]->list_lock, this_cpu); cachep->nodelists[node]->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->nodelists[node]->list_lock); + slab_spin_unlock_irq(&cachep->nodelists[node]->list_lock, this_cpu); } /* @@ -1299,7 +1368,7 @@ static int __cpuinit cpuup_prepare(long l3 = cachep->nodelists[node]; BUG_ON(!l3); - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); if (!l3->shared) { /* * We are serialised from CPU_DEAD or @@ -1314,7 +1383,7 @@ static int __cpuinit cpuup_prepare(long alien = NULL; } #endif - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); kfree(shared); free_alien_cache(alien); } @@ -1393,11 +1462,13 @@ static void init_list(struct kmem_cache int nodeid) { struct kmem_list3 *ptr; + int this_cpu; ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); BUG_ON(!ptr); - local_irq_disable(); + WARN_ON(spin_is_locked(&list->list_lock)); + slab_irq_disable(this_cpu); memcpy(ptr, list, sizeof(struct kmem_list3)); /* * Do not assume that spinlocks can be initialized via memcpy: @@ -1406,7 +1477,7 @@ static void init_list(struct kmem_cache MAKE_ALL_LISTS(cachep, ptr, nodeid); cachep->nodelists[nodeid] = ptr; - local_irq_enable(); + slab_irq_enable(this_cpu); } /* @@ -1552,36 +1623,34 @@ void __init kmem_cache_init(void) /* 4) Replace the bootstrap head arrays */ { struct array_cache *ptr; + int this_cpu; ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); - local_irq_disable(); - BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); - memcpy(ptr, cpu_cache_get(&cache_cache), - sizeof(struct arraycache_init)); + slab_irq_disable(this_cpu); + BUG_ON(cpu_cache_get(&cache_cache, this_cpu) != &initarray_cache.cache); + memcpy(ptr, cpu_cache_get(&cache_cache, this_cpu), + sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - - cache_cache.array[smp_processor_id()] = ptr; - local_irq_enable(); + cache_cache.array[this_cpu] = ptr; + slab_irq_enable(this_cpu); ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); - local_irq_disable(); - BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) - != &initarray_generic.cache); - memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), - sizeof(struct arraycache_init)); + slab_irq_disable(this_cpu); + BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, this_cpu) + != &initarray_generic.cache); + memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, this_cpu), + sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - - malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = - ptr; - local_irq_enable(); + malloc_sizes[INDEX_AC].cs_cachep->array[this_cpu] = ptr; + slab_irq_enable(this_cpu); } /* 5) Replace the bootstrap kmem_list3's */ { @@ -1734,7 +1803,7 @@ static void store_stackinfo(struct kmem_ *addr++ = 0x12345678; *addr++ = caller; - *addr++ = smp_processor_id(); + *addr++ = raw_smp_processor_id(); size -= 3 * sizeof(unsigned long); { unsigned long *sptr = &caller; @@ -1889,7 +1958,11 @@ static void check_poison_obj(struct kmem } #endif +static void +__cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu); + #if DEBUG + /** * slab_destroy_objs - destroy a slab and its objects * @cachep: cache pointer being destroyed @@ -1898,7 +1971,8 @@ static void check_poison_obj(struct kmem * Call the registered destructor for each object in a slab that is being * destroyed. */ -static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) +static void +slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) { int i; for (i = 0; i < cachep->num; i++) { @@ -1941,7 +2015,8 @@ static void slab_destroy_objs(struct kme * Before calling the slab must have been unlinked from the cache. The * cache-lock is not held/needed. */ -static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) +static void +slab_destroy(struct kmem_cache *cachep, struct slab *slabp, int *this_cpu) { void *addr = slabp->s_mem - slabp->colouroff; @@ -1955,8 +2030,12 @@ static void slab_destroy(struct kmem_cac call_rcu(&slab_rcu->head, kmem_rcu_free); } else { kmem_freepages(cachep, addr); - if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->slabp_cache, slabp); + if (OFF_SLAB(cachep)) { + if (this_cpu) + __cache_free(cachep->slabp_cache, slabp, this_cpu); + else + kmem_cache_free(cachep->slabp_cache, slabp); + } } } @@ -2069,6 +2148,8 @@ static size_t calculate_slab_order(struc static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) { + int this_cpu; + if (g_cpucache_up == FULL) return enable_cpucache(cachep); @@ -2112,10 +2193,12 @@ static int __init_refok setup_cpu_cache( jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - cpu_cache_get(cachep)->avail = 0; - cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - cpu_cache_get(cachep)->batchcount = 1; - cpu_cache_get(cachep)->touched = 0; + this_cpu = raw_smp_processor_id(); + + cpu_cache_get(cachep, this_cpu)->avail = 0; + cpu_cache_get(cachep, this_cpu)->limit = BOOT_CPUCACHE_ENTRIES; + cpu_cache_get(cachep, this_cpu)->batchcount = 1; + cpu_cache_get(cachep, this_cpu)->touched = 0; cachep->batchcount = 1; cachep->limit = BOOT_CPUCACHE_ENTRIES; return 0; @@ -2403,19 +2486,19 @@ EXPORT_SYMBOL(kmem_cache_create); #if DEBUG static void check_irq_off(void) { +/* + * On PREEMPT_RT we use locks to protect the per-CPU lists, + * and keep interrupts enabled. + */ +#ifndef CONFIG_PREEMPT_RT BUG_ON(!irqs_disabled()); +#endif } static void check_irq_on(void) { +#ifndef CONFIG_PREEMPT_RT BUG_ON(irqs_disabled()); -} - -static void check_spinlock_acquired(struct kmem_cache *cachep) -{ -#ifdef CONFIG_SMP - check_irq_off(); - assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); #endif } @@ -2430,34 +2513,67 @@ static void check_spinlock_acquired_node #else #define check_irq_off() do { } while(0) #define check_irq_on() do { } while(0) -#define check_spinlock_acquired(x) do { } while(0) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +static int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, struct array_cache *ac, int force, int node); -static void do_drain(void *arg) +static void __do_drain(void *arg, int this_cpu) { struct kmem_cache *cachep = arg; + int node = cpu_to_node(this_cpu); struct array_cache *ac; - int node = numa_node_id(); check_irq_off(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, this_cpu); spin_lock(&cachep->nodelists[node]->list_lock); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, &this_cpu); spin_unlock(&cachep->nodelists[node]->list_lock); ac->avail = 0; } +#ifdef CONFIG_PREEMPT_RT +static void do_drain(void *arg, int this_cpu) +{ + __do_drain(arg, this_cpu); +} +#else +static void do_drain(void *arg) +{ + __do_drain(arg, smp_processor_id()); +} +#endif + +#ifdef CONFIG_PREEMPT_RT +/* + * execute func() for all CPUs. On PREEMPT_RT we dont actually have + * to run on the remote CPUs - we only have to take their CPU-locks. + * (This is a rare operation, so cacheline bouncing is not an issue.) + */ +static void +slab_on_each_cpu(void (*func)(void *arg, int this_cpu), void *arg) +{ + unsigned int i; + + check_irq_on(); + for_each_online_cpu(i) { + spin_lock(&__get_cpu_lock(slab_irq_locks, i)); + func(arg, i); + spin_unlock(&__get_cpu_lock(slab_irq_locks, i)); + } +} +#else +# define slab_on_each_cpu(func, cachep) on_each_cpu(func, cachep, 1, 1) +#endif + static void drain_cpu_caches(struct kmem_cache *cachep) { struct kmem_list3 *l3; int node; - on_each_cpu(do_drain, cachep, 1, 1); + slab_on_each_cpu(do_drain, cachep); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; @@ -2482,16 +2598,16 @@ static int drain_freelist(struct kmem_ca struct kmem_list3 *l3, int tofree) { struct list_head *p; - int nr_freed; + int nr_freed, this_cpu; struct slab *slabp; nr_freed = 0; while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); p = l3->slabs_free.prev; if (p == &l3->slabs_free) { - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); goto out; } @@ -2500,13 +2616,9 @@ static int drain_freelist(struct kmem_ca BUG_ON(slabp->inuse); #endif list_del(&slabp->list); - /* - * Safe to drop the lock. The slab is no longer linked - * to the cache. - */ l3->free_objects -= cache->num; - spin_unlock_irq(&l3->list_lock); - slab_destroy(cache, slabp); + slab_destroy(cache, slabp, &this_cpu); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); nr_freed++; } out: @@ -2757,8 +2869,8 @@ static void slab_map_pages(struct kmem_c * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. */ -static int cache_grow(struct kmem_cache *cachep, - gfp_t flags, int nodeid, void *objp) +static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid, + void *objp, int *this_cpu) { struct slab *slabp; size_t offset; @@ -2787,7 +2899,8 @@ static int cache_grow(struct kmem_cache offset *= cachep->colour_off; if (local_flags & __GFP_WAIT) - local_irq_enable(); + slab_irq_enable_nort(); + slab_irq_enable_rt(*this_cpu); /* * The test for missing atomic flag is performed here, rather than @@ -2817,8 +2930,10 @@ static int cache_grow(struct kmem_cache cache_init_objs(cachep, slabp); + slab_irq_disable_rt(*this_cpu); if (local_flags & __GFP_WAIT) - local_irq_disable(); + slab_irq_disable_nort(); + check_irq_off(); spin_lock(&l3->list_lock); @@ -2831,8 +2946,9 @@ static int cache_grow(struct kmem_cache opps1: kmem_freepages(cachep, objp); failed: + slab_irq_disable_rt(*this_cpu); if (local_flags & __GFP_WAIT) - local_irq_disable(); + slab_irq_disable_nort(); return 0; } @@ -2954,7 +3070,8 @@ bad: #define check_slabp(x,y) do { } while(0) #endif -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) +static void * +cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) { int batchcount; struct kmem_list3 *l3; @@ -2964,7 +3081,7 @@ static void *cache_alloc_refill(struct k node = numa_node_id(); check_irq_off(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, *this_cpu); retry: batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -2975,7 +3092,7 @@ retry: */ batchcount = BATCHREFILL_LIMIT; } - l3 = cachep->nodelists[node]; + l3 = cachep->nodelists[cpu_to_node(*this_cpu)]; BUG_ON(ac->avail > 0 || !l3); spin_lock(&l3->list_lock); @@ -2998,7 +3115,7 @@ retry: slabp = list_entry(entry, struct slab, list); check_slabp(cachep, slabp); - check_spinlock_acquired(cachep); + check_spinlock_acquired_node(cachep, cpu_to_node(*this_cpu)); /* * The slab was either on partial or free list so @@ -3012,8 +3129,9 @@ retry: STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, - node); + ac->entry[ac->avail++] = + slab_get_obj(cachep, slabp, + cpu_to_node(*this_cpu)); } check_slabp(cachep, slabp); @@ -3032,10 +3150,10 @@ alloc_done: if (unlikely(!ac->avail)) { int x; - x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); + x = cache_grow(cachep, flags | GFP_THISNODE, cpu_to_node(*this_cpu), NULL, this_cpu); /* cache_grow can reenable interrupts, then ac could change. */ - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, *this_cpu); if (!x && ac->avail == 0) /* no objects in sight? abort */ return NULL; @@ -3187,21 +3305,22 @@ static inline int should_failslab(struct #endif /* CONFIG_FAILSLAB */ -static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) +static inline void * +____cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) { void *objp; struct array_cache *ac; check_irq_off(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, *this_cpu); if (likely(ac->avail)) { STATS_INC_ALLOCHIT(cachep); ac->touched = 1; objp = ac->entry[--ac->avail]; } else { STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags); + objp = cache_alloc_refill(cachep, flags, this_cpu); } return objp; } @@ -3213,7 +3332,8 @@ static inline void *____cache_alloc(stru * If we are in_interrupt, then process context, including cpusets and * mempolicy, may not apply and should not be used for allocation policy. */ -static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) +static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags, + int *this_cpu) { int nid_alloc, nid_here; @@ -3225,7 +3345,7 @@ static void *alternate_node_alloc(struct else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); if (nid_alloc != nid_here) - return ____cache_alloc_node(cachep, flags, nid_alloc); + return ____cache_alloc_node(cachep, flags, nid_alloc, this_cpu); return NULL; } @@ -3237,7 +3357,7 @@ static void *alternate_node_alloc(struct * allocator to do its reclaim / fallback magic. We then insert the * slab into the proper nodelist and then allocate from it. */ -static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) +static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu) { struct zonelist *zonelist; gfp_t local_flags; @@ -3263,8 +3383,10 @@ retry: if (cpuset_zone_allowed_hardwall(*z, flags) && cache->nodelists[nid] && cache->nodelists[nid]->free_objects) - obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + + obj = ____cache_alloc_node(cache, + flags | GFP_THISNODE, nid, + this_cpu); } if (!obj) { @@ -3275,19 +3397,24 @@ retry: * set and go into memory reserves if necessary. */ if (local_flags & __GFP_WAIT) - local_irq_enable(); + slab_irq_enable_nort(); + slab_irq_enable_rt(*this_cpu); + kmem_flagcheck(cache, flags); obj = kmem_getpages(cache, flags, -1); + + slab_irq_disable_rt(*this_cpu); if (local_flags & __GFP_WAIT) - local_irq_disable(); + slab_irq_disable_nort(); + if (obj) { /* * Insert into the appropriate per node queues */ nid = page_to_nid(virt_to_page(obj)); - if (cache_grow(cache, flags, nid, obj)) { + if (cache_grow(cache, flags, nid, obj, this_cpu)) { obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + flags | GFP_THISNODE, nid, this_cpu); if (!obj) /* * Another processor may allocate the @@ -3308,7 +3435,7 @@ retry: * A interface to enable slab creation on nodeid */ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, - int nodeid) + int nodeid, int *this_cpu) { struct list_head *entry; struct slab *slabp; @@ -3356,11 +3483,11 @@ retry: must_grow: spin_unlock(&l3->list_lock); - x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); + x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL, this_cpu); if (x) goto retry; - return fallback_alloc(cachep, flags); + return fallback_alloc(cachep, flags, this_cpu); done: return obj; @@ -3382,39 +3509,41 @@ static __always_inline void * __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, void *caller) { - unsigned long save_flags; + unsigned long irqflags; + int this_cpu; void *ptr; if (should_failslab(cachep, flags)) return NULL; cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); + + slab_irq_save(irqflags, this_cpu); if (unlikely(nodeid == -1)) - nodeid = numa_node_id(); + nodeid = cpu_to_node(this_cpu); if (unlikely(!cachep->nodelists[nodeid])) { /* Node not bootstrapped yet */ - ptr = fallback_alloc(cachep, flags); + ptr = fallback_alloc(cachep, flags, &this_cpu); goto out; } - if (nodeid == numa_node_id()) { + if (nodeid == cpu_to_node(this_cpu)) { /* * Use the locally cached objects if possible. * However ____cache_alloc does not allow fallback * to other nodes. It may fail while we still have * objects on other nodes available. */ - ptr = ____cache_alloc(cachep, flags); + ptr = ____cache_alloc(cachep, flags, &this_cpu); if (ptr) goto out; } /* ___cache_alloc_node can fall back to other nodes */ - ptr = ____cache_alloc_node(cachep, flags, nodeid); + ptr = ____cache_alloc_node(cachep, flags, nodeid, &this_cpu); out: - local_irq_restore(save_flags); + slab_irq_restore(irqflags, this_cpu); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); if (unlikely((flags & __GFP_ZERO) && ptr)) @@ -3424,33 +3553,33 @@ __cache_alloc_node(struct kmem_cache *ca } static __always_inline void * -__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) +__do_cache_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu) { void *objp; if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { - objp = alternate_node_alloc(cache, flags); + objp = alternate_node_alloc(cache, flags, this_cpu); if (objp) goto out; } - objp = ____cache_alloc(cache, flags); + objp = ____cache_alloc(cache, flags, this_cpu); /* * We may just have run out of memory on the local node. * ____cache_alloc_node() knows how to locate memory on other nodes */ - if (!objp) - objp = ____cache_alloc_node(cache, flags, numa_node_id()); - + if (!objp) + objp = ____cache_alloc_node(cache, flags, + cpu_to_node(*this_cpu), this_cpu); out: return objp; } #else static __always_inline void * -__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) { - return ____cache_alloc(cachep, flags); + return ____cache_alloc(cachep, flags, this_cpu); } #endif /* CONFIG_NUMA */ @@ -3459,15 +3588,16 @@ static __always_inline void * __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) { unsigned long save_flags; + int this_cpu; void *objp; if (should_failslab(cachep, flags)) return NULL; cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); - objp = __do_cache_alloc(cachep, flags); - local_irq_restore(save_flags); + slab_irq_save(save_flags, this_cpu); + objp = __do_cache_alloc(cachep, flags, &this_cpu); + slab_irq_restore(save_flags, this_cpu); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); @@ -3481,7 +3611,7 @@ __cache_alloc(struct kmem_cache *cachep, * Caller needs to acquire correct kmem_list's list_lock */ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, - int node) + int node, int *this_cpu) { int i; struct kmem_list3 *l3; @@ -3510,7 +3640,7 @@ static void free_block(struct kmem_cache * a different cache, refer to comments before * alloc_slabmgmt. */ - slab_destroy(cachep, slabp); + slab_destroy(cachep, slabp, this_cpu); } else { list_add(&slabp->list, &l3->slabs_free); } @@ -3524,11 +3654,12 @@ static void free_block(struct kmem_cache } } -static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) +static void +cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac, int *this_cpu) { int batchcount; struct kmem_list3 *l3; - int node = numa_node_id(); + int node = cpu_to_node(*this_cpu); batchcount = ac->batchcount; #if DEBUG @@ -3550,7 +3681,7 @@ static void cache_flusharray(struct kmem } } - free_block(cachep, ac->entry, batchcount, node); + free_block(cachep, ac->entry, batchcount, node, this_cpu); free_done: #if STATS { @@ -3579,9 +3710,9 @@ free_done: * Release an obj back to its cache. If the obj has a constructed state, it must * be in this state _before_ it is released. Called with disabled ints. */ -static inline void __cache_free(struct kmem_cache *cachep, void *objp) +static void __cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu) { - struct array_cache *ac = cpu_cache_get(cachep); + struct array_cache *ac = cpu_cache_get(cachep, *this_cpu); check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); @@ -3593,7 +3724,7 @@ static inline void __cache_free(struct k * variable to skip the call, which is mostly likely to be present in * the cache. */ - if (numa_platform && cache_free_alien(cachep, objp)) + if (numa_platform && cache_free_alien(cachep, objp, this_cpu)) return; if (likely(ac->avail < ac->limit)) { @@ -3602,7 +3733,7 @@ static inline void __cache_free(struct k return; } else { STATS_INC_FREEMISS(cachep); - cache_flusharray(cachep, ac); + cache_flusharray(cachep, ac, this_cpu); ac->entry[ac->avail++] = objp; } } @@ -3760,11 +3891,12 @@ EXPORT_SYMBOL(__kmalloc); void kmem_cache_free(struct kmem_cache *cachep, void *objp) { unsigned long flags; + int this_cpu; - local_irq_save(flags); + slab_irq_save(flags, this_cpu); debug_check_no_locks_freed(objp, obj_size(cachep)); - __cache_free(cachep, objp); - local_irq_restore(flags); + __cache_free(cachep, objp, &this_cpu); + slab_irq_restore(flags, this_cpu); } EXPORT_SYMBOL(kmem_cache_free); @@ -3781,15 +3913,16 @@ void kfree(const void *objp) { struct kmem_cache *c; unsigned long flags; + int this_cpu; if (unlikely(ZERO_OR_NULL_PTR(objp))) return; - local_irq_save(flags); + slab_irq_save(flags, this_cpu); kfree_debugcheck(objp); c = virt_to_cache(objp); debug_check_no_locks_freed(objp, obj_size(c)); - __cache_free(c, (void *)objp); - local_irq_restore(flags); + __cache_free(c, (void *)objp, &this_cpu); + slab_irq_restore(flags, this_cpu); } EXPORT_SYMBOL(kfree); @@ -3810,7 +3943,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name); */ static int alloc_kmemlist(struct kmem_cache *cachep) { - int node; + int node, this_cpu; struct kmem_list3 *l3; struct array_cache *new_shared; struct array_cache **new_alien = NULL; @@ -3838,11 +3971,11 @@ static int alloc_kmemlist(struct kmem_ca if (l3) { struct array_cache *shared = l3->shared; - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); if (shared) free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &this_cpu); l3->shared = new_shared; if (!l3->alien) { @@ -3851,7 +3984,7 @@ static int alloc_kmemlist(struct kmem_ca } l3->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); kfree(shared); free_alien_cache(new_alien); continue; @@ -3898,42 +4031,50 @@ struct ccupdate_struct { struct array_cache *new[NR_CPUS]; }; -static void do_ccupdate_local(void *info) +static void __do_ccupdate_local(void *info, int this_cpu) { struct ccupdate_struct *new = info; struct array_cache *old; check_irq_off(); - old = cpu_cache_get(new->cachep); + old = cpu_cache_get(new->cachep, this_cpu); + + new->cachep->array[this_cpu] = new->new[this_cpu]; + new->new[this_cpu] = old; +} - new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; - new->new[smp_processor_id()] = old; +#ifdef CONFIG_PREEMPT_RT +static void do_ccupdate_local(void *arg, int this_cpu) +{ + __do_ccupdate_local(arg, this_cpu); } +#else +static void do_ccupdate_local(void *arg) +{ + __do_ccupdate_local(arg, smp_processor_id()); +} +#endif /* Always called with the cache_chain_mutex held */ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared) { - struct ccupdate_struct *new; - int i; - - new = kzalloc(sizeof(*new), GFP_KERNEL); - if (!new) - return -ENOMEM; + struct ccupdate_struct new; + int i, this_cpu; + memset(&new.new, 0, sizeof(new.new)); for_each_online_cpu(i) { - new->new[i] = alloc_arraycache(cpu_to_node(i), limit, + new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount); - if (!new->new[i]) { + if (!new.new[i]) { for (i--; i >= 0; i--) - kfree(new->new[i]); - kfree(new); + kfree(new.new[i]); return -ENOMEM; } } - new->cachep = cachep; + new.cachep = cachep; - on_each_cpu(do_ccupdate_local, (void *)new, 1, 1); + slab_on_each_cpu(do_ccupdate_local, (void *)&new); check_irq_on(); cachep->batchcount = batchcount; @@ -3941,15 +4082,15 @@ static int do_tune_cpucache(struct kmem_ cachep->shared = shared; for_each_online_cpu(i) { - struct array_cache *ccold = new->new[i]; + struct array_cache *ccold = new.new[i]; if (!ccold) continue; - spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); - free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); - spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); + slab_spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock, this_cpu); + free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i), &this_cpu); + slab_spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock, this_cpu); kfree(ccold); } - kfree(new); + return alloc_kmemlist(cachep); } @@ -4011,29 +4152,31 @@ static int enable_cpucache(struct kmem_c * Drain an array if it contains any elements taking the l3 lock only if * necessary. Note that the l3 listlock also protects the array_cache * if drain_array() is used on the shared array. + * returns non-zero if some work is done */ -void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, - struct array_cache *ac, int force, int node) +int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, + struct array_cache *ac, int force, int node) { - int tofree; + int tofree, this_cpu; if (!ac || !ac->avail) - return; + return 0; if (ac->touched && !force) { ac->touched = 0; } else { - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); if (ac->avail) { tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) tofree = (ac->avail + 1) / 2; - free_block(cachep, ac->entry, tofree, node); + free_block(cachep, ac->entry, tofree, node, &this_cpu); ac->avail -= tofree; memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); } + return 1; } /** @@ -4050,11 +4193,12 @@ void drain_array(struct kmem_cache *cach */ static void cache_reap(struct work_struct *w) { + int this_cpu = raw_smp_processor_id(), node = cpu_to_node(this_cpu); struct kmem_cache *searchp; struct kmem_list3 *l3; - int node = numa_node_id(); struct delayed_work *work = container_of(w, struct delayed_work, work); + int work_done = 0; if (!mutex_trylock(&cache_chain_mutex)) /* Give up. Setup the next iteration. */ @@ -4070,9 +4214,12 @@ static void cache_reap(struct work_struc */ l3 = searchp->nodelists[node]; - reap_alien(searchp, l3); + work_done += reap_alien(searchp, l3, &this_cpu); + + node = cpu_to_node(this_cpu); - drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); + work_done += drain_array(searchp, l3, + cpu_cache_get(searchp, this_cpu), 0, node); /* * These are racy checks but it does not matter @@ -4083,7 +4230,7 @@ static void cache_reap(struct work_struc l3->next_reap = jiffies + REAPTIMEOUT_LIST3; - drain_array(searchp, l3, l3->shared, 0, node); + work_done += drain_array(searchp, l3, l3->shared, 0, node); if (l3->free_touched) l3->free_touched = 0; @@ -4102,7 +4249,8 @@ next: next_reap_node(); out: /* Set up the next iteration */ - schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); + schedule_delayed_work(work, + round_jiffies_relative((1+!work_done) * REAPTIMEOUT_CPUC)); } #ifdef CONFIG_SLABINFO @@ -4161,7 +4309,7 @@ static int s_show(struct seq_file *m, vo unsigned long num_slabs, free_objects = 0, shared_avail = 0; const char *name; char *error = NULL; - int node; + int this_cpu, node; struct kmem_list3 *l3; active_objs = 0; @@ -4172,7 +4320,7 @@ static int s_show(struct seq_file *m, vo continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); list_for_each_entry(slabp, &l3->slabs_full, list) { if (slabp->inuse != cachep->num && !error) @@ -4197,7 +4345,7 @@ static int s_show(struct seq_file *m, vo if (l3->shared) shared_avail += l3->shared->avail; - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); } num_slabs += active_slabs; num_objs = num_slabs * cachep->num; @@ -4393,7 +4541,7 @@ static int leaks_show(struct seq_file *m struct kmem_list3 *l3; const char *name; unsigned long *n = m->private; - int node; + int node, this_cpu; int i; if (!(cachep->flags & SLAB_STORE_USER)) @@ -4411,13 +4559,13 @@ static int leaks_show(struct seq_file *m continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); list_for_each_entry(slabp, &l3->slabs_full, list) handle_slab(n, cachep, slabp); list_for_each_entry(slabp, &l3->slabs_partial, list) handle_slab(n, cachep, slabp); - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); } name = cachep->name; if (n[0] == n[1]) { Index: linux-2.6.24-rc8-rt1/mm/page_alloc.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/mm/page_alloc.c 2008-01-16 22:27:39.000000000 -0500 +++ linux-2.6.24-rc8-rt1/mm/page_alloc.c 2008-01-16 22:29:27.000000000 -0500 @@ -159,6 +159,53 @@ static unsigned long __meminitdata dma_r EXPORT_SYMBOL(movable_zone); #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#ifdef CONFIG_PREEMPT_RT +static DEFINE_PER_CPU_LOCKED(int, pcp_locks); +#endif + +static inline void __lock_cpu_pcp(unsigned long *flags, int cpu) +{ +#ifdef CONFIG_PREEMPT_RT + spin_lock(&__get_cpu_lock(pcp_locks, cpu)); + flags = 0; +#else + local_irq_save(*flags); +#endif +} + +static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu) +{ +#ifdef CONFIG_PREEMPT_RT + (void)get_cpu_var_locked(pcp_locks, this_cpu); + flags = 0; +#else + local_irq_save(*flags); + *this_cpu = smp_processor_id(); +#endif +} + +static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu) +{ +#ifdef CONFIG_PREEMPT_RT + put_cpu_var_locked(pcp_locks, this_cpu); +#else + local_irq_restore(flags); +#endif +} + +static struct per_cpu_pageset * +get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu) +{ + lock_cpu_pcp(flags, this_cpu); + return zone_pcp(zone, *this_cpu); +} + +static void +put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu) +{ + unlock_cpu_pcp(flags, this_cpu); +} + #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; EXPORT_SYMBOL(nr_node_ids); @@ -410,8 +457,8 @@ static inline int page_is_buddy(struct p * -- wli */ -static inline void __free_one_page(struct page *page, - struct zone *zone, unsigned int order) +static inline void +__free_one_page(struct page *page, struct zone *zone, unsigned int order) { unsigned long page_idx; int order_size = 1 << order; @@ -515,8 +562,9 @@ static void free_one_page(struct zone *z static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; - int i; int reserved = 0; + int this_cpu; + int i; for (i = 0 ; i < (1 << order) ; ++i) reserved += free_pages_check(page + i); @@ -528,10 +576,10 @@ static void __free_pages_ok(struct page arch_free_page(page, order); kernel_map_pages(page, 1 << order, 0); - local_irq_save(flags); - __count_vm_events(PGFREE, 1 << order); + lock_cpu_pcp(&flags, &this_cpu); + count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, order); - local_irq_restore(flags); + unlock_cpu_pcp(flags, this_cpu); } /* @@ -876,23 +924,19 @@ static int rmqueue_bulk(struct zone *zon */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { - unsigned long flags; int to_drain; - local_irq_save(flags); if (pcp->count >= pcp->batch) to_drain = pcp->batch; else to_drain = pcp->count; free_pages_bulk(zone, to_drain, &pcp->list, 0); pcp->count -= to_drain; - local_irq_restore(flags); } #endif static void __drain_pages(unsigned int cpu) { - unsigned long flags; struct zone *zone; int i; @@ -903,14 +947,16 @@ static void __drain_pages(unsigned int c continue; pset = zone_pcp(zone, cpu); + if (!pset) { + WARN_ON(1); + continue; + } for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; pcp = &pset->pcp[i]; - local_irq_save(flags); free_pages_bulk(zone, pcp->count, &pcp->list, 0); pcp->count = 0; - local_irq_restore(flags); } } } @@ -957,10 +1003,11 @@ void mark_free_pages(struct zone *zone) void drain_local_pages(void) { unsigned long flags; + int this_cpu; - local_irq_save(flags); - __drain_pages(smp_processor_id()); - local_irq_restore(flags); + lock_cpu_pcp(&flags, &this_cpu); + __drain_pages(this_cpu); + unlock_cpu_pcp(flags, this_cpu); } void smp_drain_local_pages(void *arg) @@ -973,6 +1020,38 @@ void smp_drain_local_pages(void *arg) */ void drain_all_local_pages(void) { +#ifdef CONFIG_PREEMPT_RT + /* + * HACK!!!!! + * For RT we can't use IPIs to run drain_local_pages, since + * that code will call spin_locks that will now sleep. + * But, schedule_on_each_cpu will call kzalloc, which will + * call page_alloc which was what calls this. + * + * Luckily, there's a condition to get here, and that is if + * the order passed in to alloc_pages is greater than 0 + * (alloced more than a page size). The slabs only allocate + * what is needed, and the allocation made by schedule_on_each_cpu + * does an alloc of "sizeof(void *)*nr_cpu_ids". + * + * So we can safely call schedule_on_each_cpu if that number + * is less than a page. Otherwise don't bother. At least warn of + * this issue. + * + * And yes, this is one big hack. Please fix ;-) + */ + if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE) + schedule_on_each_cpu(smp_drain_local_pages, NULL, 0, 1); + else { + static int once; + if (!once) { + printk(KERN_ERR "Can't drain all CPUS due to possible recursion\n"); + once = 1; + } + drain_local_pages(); + } + +#else unsigned long flags; local_irq_save(flags); @@ -980,6 +1059,7 @@ void drain_all_local_pages(void) local_irq_restore(flags); smp_call_function(smp_drain_local_pages, NULL, 0, 1); +#endif } /* @@ -988,8 +1068,10 @@ void drain_all_local_pages(void) static void fastcall free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); + struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; unsigned long flags; + int this_cpu; if (PageAnon(page)) page->mapping = NULL; @@ -1001,9 +1083,11 @@ static void fastcall free_hot_cold_page( arch_free_page(page, 0); kernel_map_pages(page, 1, 0); - pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; - local_irq_save(flags); - __count_vm_event(PGFREE); + pset = get_zone_pcp(zone, &flags, &this_cpu); + pcp = &pset->pcp[cold]; + + count_vm_event(PGFREE); + list_add(&page->lru, &pcp->list); set_page_private(page, get_pageblock_migratetype(page)); pcp->count++; @@ -1011,8 +1095,7 @@ static void fastcall free_hot_cold_page( free_pages_bulk(zone, pcp->batch, &pcp->list, 0); pcp->count -= pcp->batch; } - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); } void fastcall free_hot_page(struct page *page) @@ -1054,16 +1137,15 @@ static struct page *buffered_rmqueue(str unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); - int cpu; + struct per_cpu_pageset *pset; int migratetype = allocflags_to_migratetype(gfp_flags); + int this_cpu; again: - cpu = get_cpu(); + pset = get_zone_pcp(zone, &flags, &this_cpu); if (likely(order == 0)) { - struct per_cpu_pages *pcp; + struct per_cpu_pages *pcp = &pset->pcp[cold]; - pcp = &zone_pcp(zone, cpu)->pcp[cold]; - local_irq_save(flags); if (!pcp->count) { pcp->count = rmqueue_bulk(zone, 0, pcp->batch, &pcp->list, migratetype); @@ -1086,7 +1168,7 @@ again: list_del(&page->lru); pcp->count--; } else { - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); page = __rmqueue(zone, order, migratetype); spin_unlock(&zone->lock); if (!page) @@ -1095,8 +1177,7 @@ again: __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(zonelist, zone); - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); VM_BUG_ON(bad_range(zone, page)); if (prep_new_page(page, order, gfp_flags)) @@ -1104,8 +1185,7 @@ again: return page; failed: - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); return NULL; } @@ -3978,10 +4058,11 @@ static int page_alloc_cpu_notify(struct int cpu = (unsigned long)hcpu; if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - local_irq_disable(); + unsigned long flags; + __lock_cpu_pcp(&flags, cpu); __drain_pages(cpu); vm_events_fold_cpu(cpu); - local_irq_enable(); + unlock_cpu_pcp(flags, cpu); refresh_cpu_vm_stats(cpu); } return NOTIFY_OK; Index: linux-2.6.24-rc8-rt1/include/linux/smp.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/smp.h 2008-01-16 22:27:39.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/smp.h 2008-01-16 22:29:12.000000000 -0500 @@ -7,6 +7,7 @@ */ #include +#include extern void cpu_idle(void); @@ -33,6 +34,24 @@ extern void smp_send_stop(void); */ extern void smp_send_reschedule(int cpu); +/* + * trigger a reschedule on all other CPUs: + */ +extern void smp_send_reschedule_allbutself(void); + +/* + * trigger a reschedule on all other CPUs: + */ +extern void smp_send_reschedule_allbutself(void); + +#ifdef HAVE_RESCHEDULE_ALLBUTSELF_CPUMASK +extern void smp_send_reschedule_allbutself_cpumask(cpumask_t); +#else +static inline void smp_send_reschedule_allbutself_cpumask(cpumask_t mask) { + smp_send_reschedule_allbutself(); +} +#endif + /* * Prepare machine for booting other CPUs. @@ -98,6 +117,8 @@ static inline int up_smp_call_function(v 0; \ }) static inline void smp_send_reschedule(int cpu) { } +static inline void smp_send_reschedule_allbutself(void) { } +static inline void smp_send_reschedule_allbutself_cpumask(cpumask_t mask) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) #define smp_call_function_single(cpuid, func, info, retry, wait) \ @@ -137,7 +158,7 @@ static inline void smp_send_reschedule(i #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() -#define put_cpu_no_resched() preempt_enable_no_resched() +#define put_cpu_no_resched() __preempt_enable_no_resched() void smp_setup_processor_id(void); Index: linux-2.6.24-rc8-rt1/kernel/stop_machine.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/stop_machine.c 2008-01-16 22:27:39.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/stop_machine.c 2008-01-16 22:28:49.000000000 -0500 @@ -63,7 +63,7 @@ static int stopmachine(void *cpu) /* Yield in first stage: migration threads need to * help our sisters onto their CPUs. */ if (!prepared && !irqs_disabled) - yield(); + __yield(); else cpu_relax(); } @@ -109,7 +109,7 @@ static int stop_machine(void) /* Wait for them all to come to life. */ while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) - yield(); + __yield(); /* If some failed, kill them all. */ if (ret < 0) { @@ -133,7 +133,7 @@ static void restart_machine(void) { stopmachine_set_state(STOPMACHINE_EXIT); local_irq_enable(); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } struct stop_machine_data Index: linux-2.6.24-rc8-rt1/net/ipv4/tcp.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/net/ipv4/tcp.c 2008-01-16 22:27:39.000000000 -0500 +++ linux-2.6.24-rc8-rt1/net/ipv4/tcp.c 2008-01-16 22:28:30.000000000 -0500 @@ -1155,11 +1155,11 @@ int tcp_recvmsg(struct kiocb *iocb, stru (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && __get_cpu_var(softnet_data).net_dma) { - preempt_enable_no_resched(); + preempt_enable(); tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); } else { - preempt_enable_no_resched(); + preempt_enable(); } } #endif Index: linux-2.6.24-rc8-rt1/net/ipv4/route.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/net/ipv4/route.c 2008-01-16 22:27:39.000000000 -0500 +++ linux-2.6.24-rc8-rt1/net/ipv4/route.c 2008-01-16 22:28:55.000000000 -0500 @@ -208,13 +208,13 @@ struct rt_hash_bucket { struct rtable *chain; }; #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ - defined(CONFIG_PROVE_LOCKING) + defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_PREEMPT_RT) /* * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks * The size of this table is a power of two and depends on the number of CPUS. * (on lockdep we have a quite big spinlock_t, so keep the size down there) */ -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT) # define RT_HASH_LOCK_SZ 256 #else # if NR_CPUS >= 32 @@ -240,7 +240,7 @@ static spinlock_t *rt_hash_locks; spin_lock_init(&rt_hash_locks[i]); \ } #else -# define rt_hash_lock_addr(slot) NULL +# define rt_hash_lock_addr(slot) ((spinlock_t *)NULL) # define rt_hash_lock_init() #endif Index: linux-2.6.24-rc8-rt1/drivers/input/ff-memless.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/input/ff-memless.c 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/input/ff-memless.c 2008-01-16 22:28:31.000000000 -0500 @@ -28,6 +28,7 @@ #include #include #include +#include #include #include Index: linux-2.6.24-rc8-rt1/fs/proc/array.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/fs/proc/array.c 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/fs/proc/array.c 2008-01-16 22:28:31.000000000 -0500 @@ -131,17 +131,19 @@ static inline char *task_name(struct tas */ static const char *task_state_array[] = { "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "T (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)" /* 32 */ + "M (running-mutex)", /* 1 */ + "S (sleeping)", /* 2 */ + "D (disk sleep)", /* 4 */ + "T (stopped)", /* 8 */ + "T (tracing stop)", /* 16 */ + "Z (zombie)", /* 32 */ + "X (dead)" /* 64 */ }; static inline const char *get_task_state(struct task_struct *tsk) { unsigned int state = (tsk->state & (TASK_RUNNING | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE | TASK_STOPPED | @@ -305,6 +307,19 @@ static inline char *task_context_switch_ p->nivcsw); } +#define get_blocked_on(t) (-1) + +static char *show_blocked_on(struct task_struct *task, char *buffer) +{ + pid_t pid = get_blocked_on(task); + + if (pid < 0) + return buffer; + + return buffer + sprintf(buffer,"BlckOn: %d\n",pid); +} + + int proc_pid_status(struct task_struct *task, char *buffer) { char *orig = buffer; @@ -324,6 +339,7 @@ int proc_pid_status(struct task_struct * buffer = task_show_regs(task, buffer); #endif buffer = task_context_switch_counts(task, buffer); + buffer = show_blocked_on(task,buffer); return buffer - orig; } Index: linux-2.6.24-rc8-rt1/include/linux/bit_spinlock.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/bit_spinlock.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/bit_spinlock.h 2008-01-16 22:28:31.000000000 -0500 @@ -1,6 +1,8 @@ #ifndef __LINUX_BIT_SPINLOCK_H #define __LINUX_BIT_SPINLOCK_H +#if 0 + /* * bit-based spin_lock() * @@ -91,5 +93,7 @@ static inline int bit_spin_is_locked(int #endif } +#endif + #endif /* __LINUX_BIT_SPINLOCK_H */ Index: linux-2.6.24-rc8-rt1/include/linux/mutex.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/mutex.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/mutex.h 2008-01-16 22:29:10.000000000 -0500 @@ -12,11 +12,75 @@ #include #include +#include #include #include #include +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ + , .dep_map = { .name = #lockname } +#else +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) +#endif + +#ifdef CONFIG_PREEMPT_RT + +#include + +struct mutex { + struct rt_mutex lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + + +#define __MUTEX_INITIALIZER(mutexname) \ + { \ + .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \ + __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ + } + +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) + +extern void +_mutex_init(struct mutex *lock, char *name, struct lock_class_key *key); + +extern void __lockfunc _mutex_lock(struct mutex *lock); +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock); +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_trylock(struct mutex *lock); +extern void __lockfunc _mutex_unlock(struct mutex *lock); + +#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock) +#define mutex_lock(l) _mutex_lock(l) +#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l) +#define mutex_trylock(l) _mutex_trylock(l) +#define mutex_unlock(l) _mutex_unlock(l) +#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible_nested(l, s) +#else +# define mutex_lock_nested(l, s) _mutex_lock(l) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible(l) +#endif + +# define mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + \ + _mutex_init((mutex), #mutex, &__key); \ +} while (0) + +#else /* * Simple, straightforward mutexes with strict semantics: * @@ -86,13 +150,6 @@ do { \ # define mutex_destroy(mutex) do { } while (0) #endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ - , .dep_map = { .name = #lockname } -#else -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) -#endif - #define __MUTEX_INITIALIZER(lockname) \ { .count = ATOMIC_INIT(1) \ , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ @@ -144,3 +201,5 @@ extern int fastcall mutex_trylock(struct extern void fastcall mutex_unlock(struct mutex *lock); #endif + +#endif Index: linux-2.6.24-rc8-rt1/include/linux/plist.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/plist.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/plist.h 2008-01-16 22:29:24.000000000 -0500 @@ -81,7 +81,7 @@ struct plist_head { struct list_head prio_list; struct list_head node_list; #ifdef CONFIG_DEBUG_PI_LIST - spinlock_t *lock; + raw_spinlock_t *lock; #endif }; @@ -99,13 +99,13 @@ struct plist_node { /** * PLIST_HEAD_INIT - static struct plist_head initializer * @head: struct plist_head variable name - * @_lock: lock to initialize for this list + * @_lock: lock * to initialize for this list */ #define PLIST_HEAD_INIT(head, _lock) \ { \ .prio_list = LIST_HEAD_INIT((head).prio_list), \ .node_list = LIST_HEAD_INIT((head).node_list), \ - PLIST_HEAD_LOCK_INIT(&(_lock)) \ + PLIST_HEAD_LOCK_INIT(_lock) \ } /** @@ -125,7 +125,7 @@ struct plist_node { * @lock: list spinlock, remembered for debugging */ static inline void -plist_head_init(struct plist_head *head, spinlock_t *lock) +plist_head_init(struct plist_head *head, raw_spinlock_t *lock) { INIT_LIST_HEAD(&head->prio_list); INIT_LIST_HEAD(&head->node_list); @@ -148,6 +148,8 @@ static inline void plist_node_init(struc extern void plist_add(struct plist_node *node, struct plist_head *head); extern void plist_del(struct plist_node *node, struct plist_head *head); +extern void plist_head_splice(struct plist_head *src, struct plist_head *dst); + /** * plist_for_each - iterate over the plist * @pos: the type * to use as a loop counter Index: linux-2.6.24-rc8-rt1/include/linux/rt_lock.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/include/linux/rt_lock.h 2008-01-16 22:29:17.000000000 -0500 @@ -0,0 +1,272 @@ +#ifndef __LINUX_RT_LOCK_H +#define __LINUX_RT_LOCK_H + +/* + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004, 2005 Red Hat, Inc., Ingo Molnar + * + * This file contains the main data structure definitions. + */ +#include +#include +#include + +#ifdef CONFIG_PREEMPT_RT +/* + * spinlocks - an RT mutex plus lock-break field: + */ +typedef struct { + struct rt_mutex lock; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} spinlock_t; + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# define __RT_SPIN_INITIALIZER(name) \ + { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + .save_state = 1, \ + .file = __FILE__, \ + .line = __LINE__, } +#else +# define __RT_SPIN_INITIALIZER(name) \ + { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) } +#endif + +#define __SPIN_LOCK_UNLOCKED(name) (spinlock_t) \ + { .lock = __RT_SPIN_INITIALIZER(name), \ + SPIN_DEP_MAP_INIT(name) } + +#else /* !PREEMPT_RT */ + +typedef raw_spinlock_t spinlock_t; + +#define __SPIN_LOCK_UNLOCKED _RAW_SPIN_LOCK_UNLOCKED + +#endif + +#define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(spin_old_style) + + +#define __DEFINE_SPINLOCK(name) \ + spinlock_t name = __SPIN_LOCK_UNLOCKED(name) + +#define DEFINE_SPINLOCK(name) \ + spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name) + +#ifdef CONFIG_PREEMPT_RT + +/* + * RW-semaphores are a spinlock plus a reader-depth count. + * + * Note that the semantics are different from the usual + * Linux rw-sems, in PREEMPT_RT mode we do not allow + * multiple readers to hold the lock at once, we only allow + * a read-lock owner to read-lock recursively. This is + * better for latency, makes the implementation inherently + * fair and makes it simpler as well: + */ +struct rw_semaphore { + struct rt_mutex lock; + int read_depth; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +/* + * rwlocks - an RW semaphore plus lock-break field: + */ +typedef struct { + struct rt_mutex lock; + int read_depth; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} rwlock_t; + +#define __RW_LOCK_UNLOCKED(name) (rwlock_t) \ + { .lock = __RT_SPIN_INITIALIZER(name), \ + RW_DEP_MAP_INIT(name) } +#else /* !PREEMPT_RT */ + +typedef raw_rwlock_t rwlock_t; + +#define __RW_LOCK_UNLOCKED _RAW_RW_LOCK_UNLOCKED + +#endif + +#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(rw_old_style) + + +#define DEFINE_RWLOCK(name) \ + rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name) + +#ifdef CONFIG_PREEMPT_RT + +/* + * Semaphores - a spinlock plus the semaphore count: + */ +struct semaphore { + atomic_t count; + struct rt_mutex lock; +}; + +#define DECLARE_MUTEX(name) \ +struct semaphore name = \ + { .count = { 1 }, .lock = __RT_MUTEX_INITIALIZER(name.lock) } + +extern void fastcall +__sema_init(struct semaphore *sem, int val, char *name, char *file, int line); + +#define rt_sema_init(sem, val) \ + __sema_init(sem, val, #sem, __FILE__, __LINE__) + +extern void fastcall +__init_MUTEX(struct semaphore *sem, char *name, char *file, int line); +#define rt_init_MUTEX(sem) \ + __init_MUTEX(sem, #sem, __FILE__, __LINE__) + +extern void there_is_no_init_MUTEX_LOCKED_for_RT_semaphores(void); + +/* + * No locked initialization for RT semaphores + */ +#define rt_init_MUTEX_LOCKED(sem) \ + there_is_no_init_MUTEX_LOCKED_for_RT_semaphores() +extern void fastcall rt_down(struct semaphore *sem); +extern int fastcall rt_down_interruptible(struct semaphore *sem); +extern int fastcall rt_down_trylock(struct semaphore *sem); +extern void fastcall rt_up(struct semaphore *sem); + +#define rt_sem_is_locked(s) rt_mutex_is_locked(&(s)->lock) +#define rt_sema_count(s) atomic_read(&(s)->count) + +extern int __bad_func_type(void); + +#include + +/* + * PICK_SEM_OP() is a small redirector to allow less typing of the lock + * types struct compat_semaphore, struct semaphore, at the front of the + * PICK_FUNCTION macro. + */ +#define PICK_SEM_OP(...) PICK_FUNCTION(struct compat_semaphore *, \ + struct semaphore *, ##__VA_ARGS__) +#define PICK_SEM_OP_RET(...) PICK_FUNCTION_RET(struct compat_semaphore *,\ + struct semaphore *, ##__VA_ARGS__) + +#define sema_init(sem, val) \ + PICK_SEM_OP(compat_sema_init, rt_sema_init, sem, val) + +#define init_MUTEX(sem) PICK_SEM_OP(compat_init_MUTEX, rt_init_MUTEX, sem) + +#define init_MUTEX_LOCKED(sem) \ + PICK_SEM_OP(compat_init_MUTEX_LOCKED, rt_init_MUTEX_LOCKED, sem) + +#define down(sem) PICK_SEM_OP(compat_down, rt_down, sem) + +#define down_interruptible(sem) \ + PICK_SEM_OP_RET(compat_down_interruptible, rt_down_interruptible, sem) + +#define down_trylock(sem) \ + PICK_SEM_OP_RET(compat_down_trylock, rt_down_trylock, sem) + +#define up(sem) PICK_SEM_OP(compat_up, rt_up, sem) + +#define sem_is_locked(sem) \ + PICK_SEM_OP_RET(compat_sem_is_locked, rt_sem_is_locked, sem) + +#define sema_count(sem) PICK_SEM_OP_RET(compat_sema_count, rt_sema_count, sem) + +/* + * rwsems: + */ + +#define __RWSEM_INITIALIZER(name) \ + { .lock = __RT_MUTEX_INITIALIZER(name.lock), \ + RW_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(lockname) \ + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) + +extern void fastcall __rt_rwsem_init(struct rw_semaphore *rwsem, char *name, + struct lock_class_key *key); + +# define rt_init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_rwsem_init((sem), #sem, &__key); \ +} while (0) + +extern void __dont_do_this_in_rt(struct rw_semaphore *rwsem); + +#define rt_down_read_non_owner(rwsem) __dont_do_this_in_rt(rwsem) +#define rt_up_read_non_owner(rwsem) __dont_do_this_in_rt(rwsem) + +extern void fastcall rt_down_write(struct rw_semaphore *rwsem); +extern void fastcall +rt_down_read_nested(struct rw_semaphore *rwsem, int subclass); +extern void fastcall +rt_down_write_nested(struct rw_semaphore *rwsem, int subclass); +extern void fastcall rt_down_read(struct rw_semaphore *rwsem); +extern int fastcall rt_down_write_trylock(struct rw_semaphore *rwsem); +extern int fastcall rt_down_read_trylock(struct rw_semaphore *rwsem); +extern void fastcall rt_up_read(struct rw_semaphore *rwsem); +extern void fastcall rt_up_write(struct rw_semaphore *rwsem); +extern void fastcall rt_downgrade_write(struct rw_semaphore *rwsem); + +# define rt_rwsem_is_locked(rws) (rt_mutex_is_locked(&(rws)->lock)) + +#define PICK_RWSEM_OP(...) PICK_FUNCTION(struct compat_rw_semaphore *, \ + struct rw_semaphore *, ##__VA_ARGS__) +#define PICK_RWSEM_OP_RET(...) PICK_FUNCTION_RET(struct compat_rw_semaphore *,\ + struct rw_semaphore *, ##__VA_ARGS__) + +#define init_rwsem(rwsem) PICK_RWSEM_OP(compat_init_rwsem, rt_init_rwsem, rwsem) + +#define down_read(rwsem) PICK_RWSEM_OP(compat_down_read, rt_down_read, rwsem) + +#define down_read_non_owner(rwsem) \ + PICK_RWSEM_OP(compat_down_read_non_owner, rt_down_read_non_owner, rwsem) + +#define down_read_trylock(rwsem) \ + PICK_RWSEM_OP_RET(compat_down_read_trylock, rt_down_read_trylock, rwsem) + +#define down_write(rwsem) PICK_RWSEM_OP(compat_down_write, rt_down_write, rwsem) + +#define down_read_nested(rwsem, subclass) \ + PICK_RWSEM_OP(compat_down_read_nested, rt_down_read_nested, \ + rwsem, subclass) + +#define down_write_nested(rwsem, subclass) \ + PICK_RWSEM_OP(compat_down_write_nested, rt_down_write_nested, \ + rwsem, subclass) + +#define down_write_trylock(rwsem) \ + PICK_RWSEM_OP_RET(compat_down_write_trylock, rt_down_write_trylock,\ + rwsem) + +#define up_read(rwsem) PICK_RWSEM_OP(compat_up_read, rt_up_read, rwsem) + +#define up_read_non_owner(rwsem) \ + PICK_RWSEM_OP(compat_up_read_non_owner, rt_up_read_non_owner, rwsem) + +#define up_write(rwsem) PICK_RWSEM_OP(compat_up_write, rt_up_write, rwsem) + +#define downgrade_write(rwsem) \ + PICK_RWSEM_OP(compat_downgrade_write, rt_downgrade_write, rwsem) + +#define rwsem_is_locked(rwsem) \ + PICK_RWSEM_OP_RET(compat_rwsem_is_locked, rt_rwsem_is_locked, rwsem) + +#endif /* CONFIG_PREEMPT_RT */ + +#endif + Index: linux-2.6.24-rc8-rt1/include/linux/rtmutex.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/rtmutex.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/rtmutex.h 2008-01-16 22:29:24.000000000 -0500 @@ -24,7 +24,7 @@ * @owner: the mutex owner */ struct rt_mutex { - spinlock_t wait_lock; + raw_spinlock_t wait_lock; struct plist_head wait_list; struct task_struct *owner; #ifdef CONFIG_DEBUG_RT_MUTEXES @@ -63,8 +63,8 @@ struct hrtimer_sleeper; #endif #define __RT_MUTEX_INITIALIZER(mutexname) \ - { .wait_lock = __SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ - , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, mutexname.wait_lock) \ + { .wait_lock = RAW_SPIN_LOCK_UNLOCKED(mutexname) \ + , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, &mutexname.wait_lock) \ , .owner = NULL \ __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} @@ -98,7 +98,7 @@ extern void rt_mutex_unlock(struct rt_mu #ifdef CONFIG_RT_MUTEXES # define INIT_RT_MUTEXES(tsk) \ - .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters, tsk.pi_lock), \ + .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters, &tsk.pi_lock), \ INIT_RT_MUTEX_DEBUG(tsk) #else # define INIT_RT_MUTEXES(tsk) Index: linux-2.6.24-rc8-rt1/include/linux/rwsem-spinlock.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/rwsem-spinlock.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/rwsem-spinlock.h 2008-01-16 22:28:31.000000000 -0500 @@ -28,7 +28,7 @@ struct rwsem_waiter; * - if activity is -1 then there is one active writer * - if wait_list is not empty, then there are processes waiting for the semaphore */ -struct rw_semaphore { +struct compat_rw_semaphore { __s32 activity; spinlock_t wait_lock; struct list_head wait_list; @@ -43,33 +43,32 @@ struct rw_semaphore { # define __RWSEM_DEP_MAP_INIT(lockname) #endif -#define __RWSEM_INITIALIZER(name) \ -{ 0, __SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEP_MAP_INIT(name) } +#define __COMPAT_RWSEM_INITIALIZER(name) \ +{ 0, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define COMPAT_DECLARE_RWSEM(name) \ + struct compat_rw_semaphore name = __COMPAT_RWSEM_INITIALIZER(name) -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, +extern void __compat_init_rwsem(struct compat_rw_semaphore *sem, const char *name, struct lock_class_key *key); -#define init_rwsem(sem) \ +#define compat_init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __compat_init_rwsem((sem), #sem, &__key); \ } while (0) -extern void FASTCALL(__down_read(struct rw_semaphore *sem)); -extern int FASTCALL(__down_read_trylock(struct rw_semaphore *sem)); -extern void FASTCALL(__down_write(struct rw_semaphore *sem)); -extern void FASTCALL(__down_write_nested(struct rw_semaphore *sem, int subclass)); -extern int FASTCALL(__down_write_trylock(struct rw_semaphore *sem)); -extern void FASTCALL(__up_read(struct rw_semaphore *sem)); -extern void FASTCALL(__up_write(struct rw_semaphore *sem)); -extern void FASTCALL(__downgrade_write(struct rw_semaphore *sem)); +extern void FASTCALL(__down_read(struct compat_rw_semaphore *sem)); +extern int FASTCALL(__down_read_trylock(struct compat_rw_semaphore *sem)); +extern void FASTCALL(__down_write(struct compat_rw_semaphore *sem)); +extern void FASTCALL(__down_write_nested(struct compat_rw_semaphore *sem, int subclass)); +extern int FASTCALL(__down_write_trylock(struct compat_rw_semaphore *sem)); +extern void FASTCALL(__up_read(struct compat_rw_semaphore *sem)); +extern void FASTCALL(__up_write(struct compat_rw_semaphore *sem)); +extern void FASTCALL(__downgrade_write(struct compat_rw_semaphore *sem)); -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int compat_rwsem_is_locked(struct compat_rw_semaphore *sem) { return (sem->activity != 0); } Index: linux-2.6.24-rc8-rt1/include/linux/rwsem.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/rwsem.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/rwsem.h 2008-01-16 22:28:31.000000000 -0500 @@ -9,6 +9,10 @@ #include +#ifdef CONFIG_PREEMPT_RT +# include +#endif + #ifdef __KERNEL__ #include @@ -16,48 +20,59 @@ #include #include -struct rw_semaphore; +#ifndef CONFIG_PREEMPT_RT +/* + * On !PREEMPT_RT all rw-semaphores are compat: + */ +#define compat_rw_semaphore rw_semaphore +#endif + +struct compat_rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK -#include /* use a generic implementation */ +# include /* use a generic implementation */ +# ifndef CONFIG_PREEMPT_RT +# define __RWSEM_INITIALIZER __COMPAT_RWSEM_INITIALIZER +# define DECLARE_RWSEM COMPAT_DECLARE_RWSEM +# endif #else -#include /* use an arch-specific implementation */ +# include /* use an arch-specific implementation */ #endif /* * lock for reading */ -extern void down_read(struct rw_semaphore *sem); +extern void compat_down_read(struct compat_rw_semaphore *sem); /* * trylock for reading -- returns 1 if successful, 0 if contention */ -extern int down_read_trylock(struct rw_semaphore *sem); +extern int compat_down_read_trylock(struct compat_rw_semaphore *sem); /* * lock for writing */ -extern void down_write(struct rw_semaphore *sem); +extern void compat_down_write(struct compat_rw_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention */ -extern int down_write_trylock(struct rw_semaphore *sem); +extern int compat_down_write_trylock(struct compat_rw_semaphore *sem); /* * release a read lock */ -extern void up_read(struct rw_semaphore *sem); +extern void compat_up_read(struct compat_rw_semaphore *sem); /* * release a write lock */ -extern void up_write(struct rw_semaphore *sem); +extern void compat_up_write(struct compat_rw_semaphore *sem); /* * downgrade write lock to read lock */ -extern void downgrade_write(struct rw_semaphore *sem); +extern void compat_downgrade_write(struct compat_rw_semaphore *sem); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -73,22 +88,79 @@ extern void downgrade_write(struct rw_se * lockdep_set_class() at lock initialization time. * See Documentation/lockdep-design.txt for more details.) */ -extern void down_read_nested(struct rw_semaphore *sem, int subclass); -extern void down_write_nested(struct rw_semaphore *sem, int subclass); +extern void +compat_down_read_nested(struct compat_rw_semaphore *sem, int subclass); +extern void +compat_down_write_nested(struct compat_rw_semaphore *sem, int subclass); /* * Take/release a lock when not the owner will release it. * * [ This API should be avoided as much as possible - the * proper abstraction for this case is completions. ] */ -extern void down_read_non_owner(struct rw_semaphore *sem); -extern void up_read_non_owner(struct rw_semaphore *sem); +extern void +compat_down_read_non_owner(struct compat_rw_semaphore *sem); +extern void +compat_up_read_non_owner(struct compat_rw_semaphore *sem); #else -# define down_read_nested(sem, subclass) down_read(sem) -# define down_write_nested(sem, subclass) down_write(sem) -# define down_read_non_owner(sem) down_read(sem) -# define up_read_non_owner(sem) up_read(sem) +# define compat_down_read_nested(sem, subclass) compat_down_read(sem) +# define compat_down_write_nested(sem, subclass) compat_down_write(sem) +# define compat_down_read_non_owner(sem) compat_down_read(sem) +# define compat_up_read_non_owner(sem) compat_up_read(sem) #endif +#ifndef CONFIG_PREEMPT_RT + +#define DECLARE_RWSEM COMPAT_DECLARE_RWSEM + +/* + * NOTE, lockdep: this has to be a macro, so that separate class-keys + * get generated by the compiler, if the same function does multiple + * init_rwsem() calls to different rwsems. + */ +#define init_rwsem(rwsem) compat_init_rwsem(rwsem) + +static inline void down_read(struct compat_rw_semaphore *rwsem) +{ + compat_down_read(rwsem); +} +static inline int down_read_trylock(struct compat_rw_semaphore *rwsem) +{ + return compat_down_read_trylock(rwsem); +} +static inline void down_write(struct compat_rw_semaphore *rwsem) +{ + compat_down_write(rwsem); +} +static inline int down_write_trylock(struct compat_rw_semaphore *rwsem) +{ + return compat_down_write_trylock(rwsem); +} +static inline void up_read(struct compat_rw_semaphore *rwsem) +{ + compat_up_read(rwsem); +} +static inline void up_write(struct compat_rw_semaphore *rwsem) +{ + compat_up_write(rwsem); +} +static inline void downgrade_write(struct compat_rw_semaphore *rwsem) +{ + compat_downgrade_write(rwsem); +} +static inline int rwsem_is_locked(struct compat_rw_semaphore *sem) +{ + return compat_rwsem_is_locked(sem); +} +# define down_read_nested(sem, subclass) \ + compat_down_read_nested(sem, subclass) +# define down_write_nested(sem, subclass) \ + compat_down_write_nested(sem, subclass) +# define down_read_non_owner(sem) \ + compat_down_read_non_owner(sem) +# define up_read_non_owner(sem) \ + compat_up_read_non_owner(sem) +#endif /* !CONFIG_PREEMPT_RT */ + #endif /* __KERNEL__ */ #endif /* _LINUX_RWSEM_H */ Index: linux-2.6.24-rc8-rt1/include/linux/semaphore.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/include/linux/semaphore.h 2008-01-16 22:28:31.000000000 -0500 @@ -0,0 +1,49 @@ +#ifndef _LINUX_SEMAPHORE_H +#define _LINUX_SEMAPHORE_H + +#ifdef CONFIG_PREEMPT_RT +# include +#else + +#define DECLARE_MUTEX COMPAT_DECLARE_MUTEX + +static inline void sema_init(struct compat_semaphore *sem, int val) +{ + compat_sema_init(sem, val); +} +static inline void init_MUTEX(struct compat_semaphore *sem) +{ + compat_init_MUTEX(sem); +} +static inline void init_MUTEX_LOCKED(struct compat_semaphore *sem) +{ + compat_init_MUTEX_LOCKED(sem); +} +static inline void down(struct compat_semaphore *sem) +{ + compat_down(sem); +} +static inline int down_interruptible(struct compat_semaphore *sem) +{ + return compat_down_interruptible(sem); +} +static inline int down_trylock(struct compat_semaphore *sem) +{ + return compat_down_trylock(sem); +} +static inline void up(struct compat_semaphore *sem) +{ + compat_up(sem); +} +static inline int sem_is_locked(struct compat_semaphore *sem) +{ + return compat_sem_is_locked(sem); +} +static inline int sema_count(struct compat_semaphore *sem) +{ + return compat_sema_count(sem); +} + +#endif /* CONFIG_PREEMPT_RT */ + +#endif /* _LINUX_SEMAPHORE_H */ Index: linux-2.6.24-rc8-rt1/include/linux/seqlock.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/seqlock.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/seqlock.h 2008-01-16 22:29:21.000000000 -0500 @@ -32,46 +32,83 @@ typedef struct { unsigned sequence; spinlock_t lock; -} seqlock_t; +} __seqlock_t; + +typedef struct { + unsigned sequence; + raw_spinlock_t lock; +} __raw_seqlock_t; + +#define seqlock_need_resched(seq) lock_need_resched(&(seq)->lock) + +#ifdef CONFIG_PREEMPT_RT +typedef __seqlock_t seqlock_t; +#else +typedef __raw_seqlock_t seqlock_t; +#endif + +typedef __raw_seqlock_t raw_seqlock_t; /* * These macros triggered gcc-3.x compile-time problems. We think these are * OK now. Be cautious. */ -#define __SEQLOCK_UNLOCKED(lockname) \ - { 0, __SPIN_LOCK_UNLOCKED(lockname) } +#define __RAW_SEQLOCK_UNLOCKED(lockname) \ + { 0, RAW_SPIN_LOCK_UNLOCKED(lockname) } + +#ifdef CONFIG_PREEMPT_RT +# define __SEQLOCK_UNLOCKED(lockname) { 0, __SPIN_LOCK_UNLOCKED(lockname) } +#else +# define __SEQLOCK_UNLOCKED(lockname) __RAW_SEQLOCK_UNLOCKED(lockname) +#endif #define SEQLOCK_UNLOCKED \ __SEQLOCK_UNLOCKED(old_style_seqlock_init) -#define seqlock_init(x) \ - do { \ - (x)->sequence = 0; \ - spin_lock_init(&(x)->lock); \ - } while (0) +#define raw_seqlock_init(x) \ + do { *(x) = (raw_seqlock_t) __RAW_SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0) + +#define seqlock_init(x) \ + do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0) #define DEFINE_SEQLOCK(x) \ seqlock_t x = __SEQLOCK_UNLOCKED(x) +#define DEFINE_RAW_SEQLOCK(name) \ + raw_seqlock_t name __cacheline_aligned_in_smp = \ + __RAW_SEQLOCK_UNLOCKED(name) + + /* Lock out other writers and update the count. * Acts like a normal spin_lock/unlock. * Don't need preempt_disable() because that is in the spin_lock already. */ -static inline void write_seqlock(seqlock_t *sl) +static inline void __write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); ++sl->sequence; smp_wmb(); } -static inline void write_sequnlock(seqlock_t *sl) +static __always_inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) +{ + unsigned long flags; + + local_save_flags(flags); + __write_seqlock(sl); + return flags; +} + +static inline void __write_sequnlock(seqlock_t *sl) { smp_wmb(); sl->sequence++; spin_unlock(&sl->lock); } -static inline int write_tryseqlock(seqlock_t *sl) +#define __write_sequnlock_irqrestore(sl, flags) __write_sequnlock(sl) + +static inline int __write_tryseqlock(seqlock_t *sl) { int ret = spin_trylock(&sl->lock); @@ -83,7 +120,7 @@ static inline int write_tryseqlock(seqlo } /* Start of read calculation -- fetch last complete writer token */ -static __always_inline unsigned read_seqbegin(const seqlock_t *sl) +static __always_inline unsigned __read_seqbegin(const seqlock_t *sl) { unsigned ret = sl->sequence; smp_rmb(); @@ -98,12 +135,195 @@ static __always_inline unsigned read_seq * * Using xor saves one conditional branch. */ -static __always_inline int read_seqretry(const seqlock_t *sl, unsigned iv) +static inline int __read_seqretry(seqlock_t *sl, unsigned iv) +{ + int ret; + + smp_rmb(); + ret = (iv & 1) | (sl->sequence ^ iv); + /* + * If invalid then serialize with the writer, to make sure we + * are not livelocking it: + */ + if (unlikely(ret)) { + unsigned long flags; + spin_lock_irqsave(&sl->lock, flags); + spin_unlock_irqrestore(&sl->lock, flags); + } + return ret; +} + +static __always_inline void __write_seqlock_raw(raw_seqlock_t *sl) +{ + spin_lock(&sl->lock); + ++sl->sequence; + smp_wmb(); +} + +static __always_inline unsigned long +__write_seqlock_irqsave_raw(raw_seqlock_t *sl) +{ + unsigned long flags; + + local_irq_save(flags); + __write_seqlock_raw(sl); + return flags; +} + +static __always_inline void __write_seqlock_irq_raw(raw_seqlock_t *sl) +{ + local_irq_disable(); + __write_seqlock_raw(sl); +} + +static __always_inline void __write_seqlock_bh_raw(raw_seqlock_t *sl) +{ + local_bh_disable(); + __write_seqlock_raw(sl); +} + +static __always_inline void __write_sequnlock_raw(raw_seqlock_t *sl) +{ + smp_wmb(); + sl->sequence++; + spin_unlock(&sl->lock); +} + +static __always_inline void +__write_sequnlock_irqrestore_raw(raw_seqlock_t *sl, unsigned long flags) +{ + __write_sequnlock_raw(sl); + local_irq_restore(flags); + preempt_check_resched(); +} + +static __always_inline void __write_sequnlock_irq_raw(raw_seqlock_t *sl) +{ + __write_sequnlock_raw(sl); + local_irq_enable(); + preempt_check_resched(); +} + +static __always_inline void __write_sequnlock_bh_raw(raw_seqlock_t *sl) +{ + __write_sequnlock_raw(sl); + local_bh_enable(); +} + +static __always_inline int __write_tryseqlock_raw(raw_seqlock_t *sl) +{ + int ret = spin_trylock(&sl->lock); + + if (ret) { + ++sl->sequence; + smp_wmb(); + } + return ret; +} + +static __always_inline unsigned __read_seqbegin_raw(const raw_seqlock_t *sl) +{ + unsigned ret = sl->sequence; + smp_rmb(); + return ret; +} + +static __always_inline int __read_seqretry_raw(const raw_seqlock_t *sl, unsigned iv) { smp_rmb(); return (iv & 1) | (sl->sequence ^ iv); } +extern int __bad_seqlock_type(void); + +/* + * PICK_SEQ_OP() is a small redirector to allow less typing of the lock + * types raw_seqlock_t, seqlock_t, at the front of the PICK_FUNCTION + * macro. + */ +#define PICK_SEQ_OP(...) \ + PICK_FUNCTION(raw_seqlock_t *, seqlock_t *, ##__VA_ARGS__) +#define PICK_SEQ_OP_RET(...) \ + PICK_FUNCTION_RET(raw_seqlock_t *, seqlock_t *, ##__VA_ARGS__) + +#define write_seqlock(sl) PICK_SEQ_OP(__write_seqlock_raw, __write_seqlock, sl) + +#define write_sequnlock(sl) \ + PICK_SEQ_OP(__write_sequnlock_raw, __write_sequnlock, sl) + +#define write_tryseqlock(sl) \ + PICK_SEQ_OP_RET(__write_tryseqlock_raw, __write_tryseqlock, sl) + +#define read_seqbegin(sl) \ + PICK_SEQ_OP_RET(__read_seqbegin_raw, __read_seqbegin, sl) + +#define read_seqretry(sl, iv) \ + PICK_SEQ_OP_RET(__read_seqretry_raw, __read_seqretry, sl, iv) + +#define write_seqlock_irqsave(lock, flags) \ +do { \ + flags = PICK_SEQ_OP_RET(__write_seqlock_irqsave_raw, \ + __write_seqlock_irqsave, lock); \ +} while (0) + +#define write_seqlock_irq(lock) \ + PICK_SEQ_OP(__write_seqlock_irq_raw, __write_seqlock, lock) + +#define write_seqlock_bh(lock) \ + PICK_SEQ_OP(__write_seqlock_bh_raw, __write_seqlock, lock) + +#define write_sequnlock_irqrestore(lock, flags) \ + PICK_SEQ_OP(__write_sequnlock_irqrestore_raw, \ + __write_sequnlock_irqrestore, lock, flags) + +#define write_sequnlock_bh(lock) \ + PICK_SEQ_OP(__write_sequnlock_bh_raw, __write_sequnlock, lock) + +#define write_sequnlock_irq(lock) \ + PICK_SEQ_OP(__write_sequnlock_irq_raw, __write_sequnlock, lock) + +static __always_inline +unsigned long __seq_irqsave_raw(raw_seqlock_t *sl) +{ + unsigned long flags; + + local_irq_save(flags); + return flags; +} + +static __always_inline unsigned long __seq_irqsave(seqlock_t *sl) +{ + unsigned long flags; + + local_save_flags(flags); + return flags; +} + +#define read_seqbegin_irqsave(lock, flags) \ +({ \ + flags = PICK_SEQ_OP_RET(__seq_irqsave_raw, __seq_irqsave, lock);\ + read_seqbegin(lock); \ +}) + +static __always_inline int +__read_seqretry_irqrestore(seqlock_t *sl, unsigned iv, unsigned long flags) +{ + return __read_seqretry(sl, iv); +} + +static __always_inline int +__read_seqretry_irqrestore_raw(raw_seqlock_t *sl, unsigned iv, + unsigned long flags) +{ + int ret = read_seqretry(sl, iv); + local_irq_restore(flags); + preempt_check_resched(); + return ret; +} + +#define read_seqretry_irqrestore(lock, iv, flags) \ + PICK_SEQ_OP_RET(__read_seqretry_irqrestore_raw, \ + __read_seqretry_irqrestore, lock, iv, flags) /* * Version using sequence counter only. @@ -154,32 +374,4 @@ static inline void write_seqcount_end(se smp_wmb(); s->sequence++; } - -/* - * Possible sw/hw IRQ protected versions of the interfaces. - */ -#define write_seqlock_irqsave(lock, flags) \ - do { local_irq_save(flags); write_seqlock(lock); } while (0) -#define write_seqlock_irq(lock) \ - do { local_irq_disable(); write_seqlock(lock); } while (0) -#define write_seqlock_bh(lock) \ - do { local_bh_disable(); write_seqlock(lock); } while (0) - -#define write_sequnlock_irqrestore(lock, flags) \ - do { write_sequnlock(lock); local_irq_restore(flags); } while(0) -#define write_sequnlock_irq(lock) \ - do { write_sequnlock(lock); local_irq_enable(); } while(0) -#define write_sequnlock_bh(lock) \ - do { write_sequnlock(lock); local_bh_enable(); } while(0) - -#define read_seqbegin_irqsave(lock, flags) \ - ({ local_irq_save(flags); read_seqbegin(lock); }) - -#define read_seqretry_irqrestore(lock, iv, flags) \ - ({ \ - int ret = read_seqretry(lock, iv); \ - local_irq_restore(flags); \ - ret; \ - }) - #endif /* __LINUX_SEQLOCK_H */ Index: linux-2.6.24-rc8-rt1/include/linux/spinlock_api_smp.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/spinlock_api_smp.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/spinlock_api_smp.h 2008-01-16 22:28:31.000000000 -0500 @@ -19,43 +19,58 @@ int in_lock_functions(unsigned long addr #define assert_spin_locked(x) BUG_ON(!spin_is_locked(x)) -void __lockfunc _spin_lock(spinlock_t *lock) __acquires(lock); -void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) - __acquires(lock); -void __lockfunc _read_lock(rwlock_t *lock) __acquires(lock); -void __lockfunc _write_lock(rwlock_t *lock) __acquires(lock); -void __lockfunc _spin_lock_bh(spinlock_t *lock) __acquires(lock); -void __lockfunc _read_lock_bh(rwlock_t *lock) __acquires(lock); -void __lockfunc _write_lock_bh(rwlock_t *lock) __acquires(lock); -void __lockfunc _spin_lock_irq(spinlock_t *lock) __acquires(lock); -void __lockfunc _read_lock_irq(rwlock_t *lock) __acquires(lock); -void __lockfunc _write_lock_irq(rwlock_t *lock) __acquires(lock); -unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) - __acquires(lock); -unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) - __acquires(lock); -unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) - __acquires(lock); -unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) - __acquires(lock); -int __lockfunc _spin_trylock(spinlock_t *lock); -int __lockfunc _read_trylock(rwlock_t *lock); -int __lockfunc _write_trylock(rwlock_t *lock); -int __lockfunc _spin_trylock_bh(spinlock_t *lock); -void __lockfunc _spin_unlock(spinlock_t *lock) __releases(lock); -void __lockfunc _read_unlock(rwlock_t *lock) __releases(lock); -void __lockfunc _write_unlock(rwlock_t *lock) __releases(lock); -void __lockfunc _spin_unlock_bh(spinlock_t *lock) __releases(lock); -void __lockfunc _read_unlock_bh(rwlock_t *lock) __releases(lock); -void __lockfunc _write_unlock_bh(rwlock_t *lock) __releases(lock); -void __lockfunc _spin_unlock_irq(spinlock_t *lock) __releases(lock); -void __lockfunc _read_unlock_irq(rwlock_t *lock) __releases(lock); -void __lockfunc _write_unlock_irq(rwlock_t *lock) __releases(lock); -void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) - __releases(lock); -void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) - __releases(lock); -void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) - __releases(lock); +#define ACQUIRE_SPIN __acquires(lock) +#define ACQUIRE_RW __acquires(lock) +#define RELEASE_SPIN __releases(lock) +#define RELEASE_RW __releases(lock) + +void __lockfunc __spin_lock(raw_spinlock_t *lock) ACQUIRE_SPIN; +void __lockfunc __spin_lock_nested(raw_spinlock_t *lock, int subclass) + ACQUIRE_SPIN; +void __lockfunc __read_lock(raw_rwlock_t *lock) ACQUIRE_RW; +void __lockfunc __write_lock(raw_rwlock_t *lock) ACQUIRE_RW; +void __lockfunc __spin_lock_bh(raw_spinlock_t *lock) ACQUIRE_SPIN; +void __lockfunc __read_lock_bh(raw_rwlock_t *lock) ACQUIRE_RW; +void __lockfunc __write_lock_bh(raw_rwlock_t *lock) ACQUIRE_RW; +void __lockfunc __spin_lock_irq(raw_spinlock_t *lock) ACQUIRE_SPIN; +void __lockfunc __read_lock_irq(raw_rwlock_t *lock) ACQUIRE_RW; +void __lockfunc __write_lock_irq(raw_rwlock_t *lock) ACQUIRE_RW; +unsigned long __lockfunc __spin_lock_irqsave(raw_spinlock_t *lock) + ACQUIRE_SPIN; +unsigned long __lockfunc +__spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) ACQUIRE_SPIN; +unsigned long __lockfunc __read_lock_irqsave(raw_rwlock_t *lock) + ACQUIRE_RW; +unsigned long __lockfunc __write_lock_irqsave(raw_rwlock_t *lock) + ACQUIRE_RW; +int __lockfunc __spin_trylock(raw_spinlock_t *lock); +int __lockfunc +__spin_trylock_irqsave(raw_spinlock_t *lock, unsigned long *flags); +int __lockfunc __read_trylock(raw_rwlock_t *lock); +int __lockfunc __write_trylock(raw_rwlock_t *lock); +int __lockfunc +__write_trylock_irqsave(raw_rwlock_t *lock, unsigned long *flags); +int __lockfunc __spin_trylock_bh(raw_spinlock_t *lock); +int __lockfunc __spin_trylock_irq(raw_spinlock_t *lock); +void __lockfunc __spin_unlock(raw_spinlock_t *lock) RELEASE_SPIN; +void __lockfunc __spin_unlock_no_resched(raw_spinlock_t *lock) + RELEASE_SPIN; +void __lockfunc __read_unlock(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc __write_unlock(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc __spin_unlock_bh(raw_spinlock_t *lock) RELEASE_SPIN; +void __lockfunc __read_unlock_bh(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc __write_unlock_bh(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc __spin_unlock_irq(raw_spinlock_t *lock) RELEASE_SPIN; +void __lockfunc __read_unlock_irq(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc __write_unlock_irq(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc +__spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) + RELEASE_SPIN; +void __lockfunc +__read_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) + RELEASE_RW; +void +__lockfunc __write_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) + RELEASE_RW; #endif /* __LINUX_SPINLOCK_API_SMP_H */ Index: linux-2.6.24-rc8-rt1/include/linux/spinlock_api_up.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/spinlock_api_up.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/spinlock_api_up.h 2008-01-16 22:28:31.000000000 -0500 @@ -33,12 +33,22 @@ #define __LOCK_IRQ(lock) \ do { local_irq_disable(); __LOCK(lock); } while (0) -#define __LOCK_IRQSAVE(lock, flags) \ - do { local_irq_save(flags); __LOCK(lock); } while (0) +#define __LOCK_IRQSAVE(lock) \ + ({ unsigned long __flags; local_irq_save(__flags); __LOCK(lock); __flags; }) + +#define __TRYLOCK_IRQSAVE(lock, flags) \ + ({ local_irq_save(*(flags)); __LOCK(lock); 1; }) + +#define __spin_trylock_irqsave(lock, flags) __TRYLOCK_IRQSAVE(lock, flags) + +#define __write_trylock_irqsave(lock, flags) __TRYLOCK_IRQSAVE(lock, flags) #define __UNLOCK(lock) \ do { preempt_enable(); __release(lock); (void)(lock); } while (0) +#define __UNLOCK_NO_RESCHED(lock) \ + do { __preempt_enable_no_resched(); __release(lock); (void)(lock); } while (0) + #define __UNLOCK_BH(lock) \ do { preempt_enable_no_resched(); local_bh_enable(); __release(lock); (void)(lock); } while (0) @@ -48,34 +58,36 @@ #define __UNLOCK_IRQRESTORE(lock, flags) \ do { local_irq_restore(flags); __UNLOCK(lock); } while (0) -#define _spin_lock(lock) __LOCK(lock) -#define _spin_lock_nested(lock, subclass) __LOCK(lock) -#define _read_lock(lock) __LOCK(lock) -#define _write_lock(lock) __LOCK(lock) -#define _spin_lock_bh(lock) __LOCK_BH(lock) -#define _read_lock_bh(lock) __LOCK_BH(lock) -#define _write_lock_bh(lock) __LOCK_BH(lock) -#define _spin_lock_irq(lock) __LOCK_IRQ(lock) -#define _read_lock_irq(lock) __LOCK_IRQ(lock) -#define _write_lock_irq(lock) __LOCK_IRQ(lock) -#define _spin_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) -#define _read_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) -#define _write_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) -#define _spin_trylock(lock) ({ __LOCK(lock); 1; }) -#define _read_trylock(lock) ({ __LOCK(lock); 1; }) -#define _write_trylock(lock) ({ __LOCK(lock); 1; }) -#define _spin_trylock_bh(lock) ({ __LOCK_BH(lock); 1; }) -#define _spin_unlock(lock) __UNLOCK(lock) -#define _read_unlock(lock) __UNLOCK(lock) -#define _write_unlock(lock) __UNLOCK(lock) -#define _spin_unlock_bh(lock) __UNLOCK_BH(lock) -#define _write_unlock_bh(lock) __UNLOCK_BH(lock) -#define _read_unlock_bh(lock) __UNLOCK_BH(lock) -#define _spin_unlock_irq(lock) __UNLOCK_IRQ(lock) -#define _read_unlock_irq(lock) __UNLOCK_IRQ(lock) -#define _write_unlock_irq(lock) __UNLOCK_IRQ(lock) -#define _spin_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) -#define _read_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) -#define _write_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) +#define __spin_lock(lock) __LOCK(lock) +#define __spin_lock_nested(lock, subclass) __LOCK(lock) +#define __read_lock(lock) __LOCK(lock) +#define __write_lock(lock) __LOCK(lock) +#define __spin_lock_bh(lock) __LOCK_BH(lock) +#define __read_lock_bh(lock) __LOCK_BH(lock) +#define __write_lock_bh(lock) __LOCK_BH(lock) +#define __spin_lock_irq(lock) __LOCK_IRQ(lock) +#define __read_lock_irq(lock) __LOCK_IRQ(lock) +#define __write_lock_irq(lock) __LOCK_IRQ(lock) +#define __spin_lock_irqsave(lock) __LOCK_IRQSAVE(lock) +#define __read_lock_irqsave(lock) __LOCK_IRQSAVE(lock) +#define __write_lock_irqsave(lock) __LOCK_IRQSAVE(lock) +#define __spin_trylock(lock) ({ __LOCK(lock); 1; }) +#define __read_trylock(lock) ({ __LOCK(lock); 1; }) +#define __write_trylock(lock) ({ __LOCK(lock); 1; }) +#define __spin_trylock_bh(lock) ({ __LOCK_BH(lock); 1; }) +#define __spin_trylock_irq(lock) ({ __LOCK_IRQ(lock); 1; }) +#define __spin_unlock(lock) __UNLOCK(lock) +#define __spin_unlock_no_resched(lock) __UNLOCK_NO_RESCHED(lock) +#define __read_unlock(lock) __UNLOCK(lock) +#define __write_unlock(lock) __UNLOCK(lock) +#define __spin_unlock_bh(lock) __UNLOCK_BH(lock) +#define __write_unlock_bh(lock) __UNLOCK_BH(lock) +#define __read_unlock_bh(lock) __UNLOCK_BH(lock) +#define __spin_unlock_irq(lock) __UNLOCK_IRQ(lock) +#define __read_unlock_irq(lock) __UNLOCK_IRQ(lock) +#define __write_unlock_irq(lock) __UNLOCK_IRQ(lock) +#define __spin_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) +#define __read_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) +#define __write_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) #endif /* __LINUX_SPINLOCK_API_UP_H */ Index: linux-2.6.24-rc8-rt1/include/linux/spinlock_types.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/spinlock_types.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/spinlock_types.h 2008-01-16 22:28:31.000000000 -0500 @@ -15,10 +15,27 @@ # include #endif +/* + * Must define these before including other files, inline functions need them + */ +#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME + +#define LOCK_SECTION_START(extra) \ + ".subsection 1\n\t" \ + extra \ + ".ifndef " LOCK_SECTION_NAME "\n\t" \ + LOCK_SECTION_NAME ":\n\t" \ + ".endif\n" + +#define LOCK_SECTION_END \ + ".previous\n\t" + +#define __lockfunc fastcall __attribute__((section(".spinlock.text"))) + #include typedef struct { - raw_spinlock_t raw_lock; + __raw_spinlock_t raw_lock; #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) unsigned int break_lock; #endif @@ -29,12 +46,12 @@ typedef struct { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif -} spinlock_t; +} raw_spinlock_t; #define SPINLOCK_MAGIC 0xdead4ead typedef struct { - raw_rwlock_t raw_lock; + __raw_rwlock_t raw_lock; #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) unsigned int break_lock; #endif @@ -45,7 +62,7 @@ typedef struct { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif -} rwlock_t; +} raw_rwlock_t; #define RWLOCK_MAGIC 0xdeaf1eed @@ -64,24 +81,24 @@ typedef struct { #endif #ifdef CONFIG_DEBUG_SPINLOCK -# define __SPIN_LOCK_UNLOCKED(lockname) \ - (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ +# define _RAW_SPIN_LOCK_UNLOCKED(lockname) \ + { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ .magic = SPINLOCK_MAGIC, \ .owner = SPINLOCK_OWNER_INIT, \ .owner_cpu = -1, \ SPIN_DEP_MAP_INIT(lockname) } -#define __RW_LOCK_UNLOCKED(lockname) \ - (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ +#define _RAW_RW_LOCK_UNLOCKED(lockname) \ + { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ .magic = RWLOCK_MAGIC, \ .owner = SPINLOCK_OWNER_INIT, \ .owner_cpu = -1, \ RW_DEP_MAP_INIT(lockname) } #else -# define __SPIN_LOCK_UNLOCKED(lockname) \ - (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ +# define _RAW_SPIN_LOCK_UNLOCKED(lockname) \ + { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ SPIN_DEP_MAP_INIT(lockname) } -#define __RW_LOCK_UNLOCKED(lockname) \ - (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ +# define _RAW_RW_LOCK_UNLOCKED(lockname) \ + { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ RW_DEP_MAP_INIT(lockname) } #endif @@ -91,10 +108,22 @@ typedef struct { * Please use DEFINE_SPINLOCK()/DEFINE_RWLOCK() or * __SPIN_LOCK_UNLOCKED()/__RW_LOCK_UNLOCKED() as appropriate. */ -#define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(old_style_spin_init) -#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(old_style_rw_init) -#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) -#define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) +# define RAW_SPIN_LOCK_UNLOCKED(lockname) \ + (raw_spinlock_t) _RAW_SPIN_LOCK_UNLOCKED(lockname) + +# define RAW_RW_LOCK_UNLOCKED(lockname) \ + (raw_rwlock_t) _RAW_RW_LOCK_UNLOCKED(lockname) + +#define DEFINE_RAW_SPINLOCK(name) \ + raw_spinlock_t name __cacheline_aligned_in_smp = \ + RAW_SPIN_LOCK_UNLOCKED(name) + +#define __DEFINE_RAW_SPINLOCK(name) \ + raw_spinlock_t name = RAW_SPIN_LOCK_UNLOCKED(name) + +#define DEFINE_RAW_RWLOCK(name) \ + raw_rwlock_t name __cacheline_aligned_in_smp = \ + RAW_RW_LOCK_UNLOCKED(name) #endif /* __LINUX_SPINLOCK_TYPES_H */ Index: linux-2.6.24-rc8-rt1/include/linux/spinlock_types_up.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/spinlock_types_up.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/spinlock_types_up.h 2008-01-16 22:28:31.000000000 -0500 @@ -16,13 +16,13 @@ typedef struct { volatile unsigned int slock; -} raw_spinlock_t; +} __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { 1 } #else -typedef struct { } raw_spinlock_t; +typedef struct { } __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { } @@ -30,7 +30,7 @@ typedef struct { } raw_spinlock_t; typedef struct { /* no debug version on UP */ -} raw_rwlock_t; +} __raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { } Index: linux-2.6.24-rc8-rt1/include/linux/spinlock_up.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/spinlock_up.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/spinlock_up.h 2008-01-16 22:28:31.000000000 -0500 @@ -20,19 +20,19 @@ #ifdef CONFIG_DEBUG_SPINLOCK #define __raw_spin_is_locked(x) ((x)->slock == 0) -static inline void __raw_spin_lock(raw_spinlock_t *lock) +static inline void __raw_spin_lock(__raw_spinlock_t *lock) { lock->slock = 0; } static inline void -__raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) +__raw_spin_lock_flags(__raw_spinlock_t *lock, unsigned long flags) { local_irq_save(flags); lock->slock = 0; } -static inline int __raw_spin_trylock(raw_spinlock_t *lock) +static inline int __raw_spin_trylock(__raw_spinlock_t *lock) { char oldval = lock->slock; @@ -41,7 +41,7 @@ static inline int __raw_spin_trylock(raw return oldval > 0; } -static inline void __raw_spin_unlock(raw_spinlock_t *lock) +static inline void __raw_spin_unlock(__raw_spinlock_t *lock) { lock->slock = 1; } Index: linux-2.6.24-rc8-rt1/kernel/futex.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/futex.c 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/futex.c 2008-01-16 22:29:21.000000000 -0500 @@ -124,12 +124,14 @@ static struct futex_hash_bucket futex_qu /* Futex-fs vfsmount entry: */ static struct vfsmount *futex_mnt; +int futex_performance_hack; + /* * Take mm->mmap_sem, when futex is shared */ static inline void futex_lock_mm(struct rw_semaphore *fshared) { - if (fshared) + if (fshared && !futex_performance_hack) down_read(fshared); } @@ -138,7 +140,7 @@ static inline void futex_lock_mm(struct */ static inline void futex_unlock_mm(struct rw_semaphore *fshared) { - if (fshared) + if (fshared && !futex_performance_hack) up_read(fshared); } @@ -946,8 +948,12 @@ static int futex_requeue(u32 __user *uad plist_add(&this->list, &hb2->chain); this->lock_ptr = &hb2->lock; #ifdef CONFIG_DEBUG_PI_LIST +#ifdef CONFIG_PREEMPT_RT + this->list.plist.lock = NULL; +#else this->list.plist.lock = &hb2->lock; #endif +#endif } this->key = key2; get_futex_key_refs(&key2); @@ -1007,8 +1013,12 @@ static inline void __queue_me(struct fut plist_node_init(&q->list, prio); #ifdef CONFIG_DEBUG_PI_LIST +#ifdef CONFIG_PREEMPT_RT + q->list.plist.lock = NULL; +#else q->list.plist.lock = &hb->lock; #endif +#endif plist_add(&q->list, &hb->chain); q->task = current; spin_unlock(&hb->lock); @@ -1244,6 +1254,10 @@ static int futex_wait(u32 __user *uaddr, * q.lock_ptr != 0 is not safe, because of ordering against wakeup. */ if (likely(!plist_node_empty(&q.list))) { + unsigned long nosched_flag = current->flags & PF_NOSCHED; + + current->flags &= ~PF_NOSCHED; + if (!abs_time) schedule(); else { @@ -1266,6 +1280,8 @@ static int futex_wait(u32 __user *uaddr, /* Flag if a timeout occured */ rem = (t.task == NULL); } + + current->flags |= nosched_flag; } __set_current_state(TASK_RUNNING); @@ -2135,7 +2151,11 @@ static int __init init(void) } for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { +#ifdef CONFIG_PREEMPT_RT + plist_head_init(&futex_queues[i].chain, NULL); +#else plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); +#endif spin_lock_init(&futex_queues[i].lock); } return 0; Index: linux-2.6.24-rc8-rt1/kernel/rt.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/kernel/rt.c 2008-01-16 22:29:16.000000000 -0500 @@ -0,0 +1,534 @@ +/* + * kernel/rt.c + * + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * historic credit for proving that Linux spinlocks can be implemented via + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow + * and others) who prototyped it on 2.4 and did lots of comparative + * research and analysis; TimeSys, for proving that you can implement a + * fully preemptible kernel via the use of IRQ threading and mutexes; + * Bill Huey for persuasively arguing on lkml that the mutex model is the + * right one; and to MontaVista, who ported pmutexes to 2.6. + * + * This code is a from-scratch implementation and is not based on pmutexes, + * but the idea of converting spinlocks to mutexes is used here too. + * + * lock debugging, locking tree, deadlock detection: + * + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Released under the General Public License (GPL). + * + * Includes portions of the generic R/W semaphore implementation from: + * + * Copyright (c) 2001 David Howells (dhowells@redhat.com). + * - Derived partially from idea by Andrea Arcangeli + * - Derived also from comments by Linus + * + * Pending ownership of locks and ownership stealing: + * + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt + * + * (also by Steven Rostedt) + * - Converted single pi_lock to individual task locks. + * + * By Esben Nielsen: + * Doing priority inheritance with help of the scheduler. + * + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * - major rework based on Esben Nielsens initial patch + * - replaced thread_info references by task_struct refs + * - removed task->pending_owner dependency + * - BKL drop/reacquire for semaphore style locks to avoid deadlocks + * in the scheduler return path as discussed with Steven Rostedt + * + * Copyright (C) 2006, Kihon Technologies Inc. + * Steven Rostedt + * - debugged and patched Thomas Gleixner's rework. + * - added back the cmpxchg to the rework. + * - turned atomic require back on for SMP. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rtmutex_common.h" + +#ifdef CONFIG_PREEMPT_RT +/* + * Unlock these on crash: + */ +void zap_rt_locks(void) +{ + //trace_lock_init(); +} +#endif + +/* + * struct mutex functions + */ +void _mutex_init(struct mutex *lock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + __rt_mutex_init(&lock->lock, name); +} +EXPORT_SYMBOL(_mutex_init); + +void __lockfunc _mutex_lock(struct mutex *lock) +{ + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED_RT(lock, rt_mutex_trylock, rt_mutex_lock); +} +EXPORT_SYMBOL(_mutex_lock); + +static int __lockfunc __rt_mutex_lock_interruptible(struct rt_mutex *lock) +{ + return rt_mutex_lock_interruptible(lock, 0); +} + +int __lockfunc _mutex_lock_interruptible(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = LOCK_CONTENDED_RT_RET(lock, rt_mutex_trylock, + __rt_mutex_lock_interruptible); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass) +{ + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED_RT(lock, rt_mutex_trylock, rt_mutex_lock); +} +EXPORT_SYMBOL(_mutex_lock_nested); + +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass) +{ + int ret; + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + ret = LOCK_CONTENDED_RT_RET(lock, rt_mutex_trylock, + __rt_mutex_lock_interruptible); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible_nested); +#endif + +int __lockfunc _mutex_trylock(struct mutex *lock) +{ + int ret = rt_mutex_trylock(&lock->lock); + + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(_mutex_trylock); + +void __lockfunc _mutex_unlock(struct mutex *lock) +{ + mutex_release(&lock->dep_map, 1, _RET_IP_); + rt_mutex_unlock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_unlock); + +/* + * rwlock_t functions + */ +int __lockfunc rt_write_trylock(rwlock_t *rwlock) +{ + int ret = rt_mutex_trylock(&rwlock->lock); + + if (ret) + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_write_trylock); + +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags) +{ + *flags = 0; + return rt_write_trylock(rwlock); +} +EXPORT_SYMBOL(rt_write_trylock_irqsave); + +int __lockfunc rt_read_trylock(rwlock_t *rwlock) +{ + struct rt_mutex *lock = &rwlock->lock; + unsigned long flags; + int ret; + + /* + * Read locks within the self-held write lock succeed. + */ + spin_lock_irqsave(&lock->wait_lock, flags); + if (rt_mutex_real_owner(lock) == current) { + spin_unlock_irqrestore(&lock->wait_lock, flags); + rwlock->read_depth++; + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); + return 1; + } + spin_unlock_irqrestore(&lock->wait_lock, flags); + + ret = rt_mutex_trylock(lock); + if (ret) + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_read_trylock); + +void __lockfunc rt_write_lock(rwlock_t *rwlock) +{ + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED_RT(rwlock, rt_mutex_trylock, __rt_spin_lock); +} +EXPORT_SYMBOL(rt_write_lock); + +void __lockfunc rt_read_lock(rwlock_t *rwlock) +{ + unsigned long flags; + struct rt_mutex *lock = &rwlock->lock; + + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); + /* + * Read locks within the write lock succeed. + */ + spin_lock_irqsave(&lock->wait_lock, flags); + if (rt_mutex_real_owner(lock) == current) { + spin_unlock_irqrestore(&lock->wait_lock, flags); + rwlock->read_depth++; + return; + } + spin_unlock_irqrestore(&lock->wait_lock, flags); + LOCK_CONTENDED_RT(rwlock, rt_mutex_trylock, __rt_spin_lock); +} + +EXPORT_SYMBOL(rt_read_lock); + +void __lockfunc rt_write_unlock(rwlock_t *rwlock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); + __rt_spin_unlock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_write_unlock); + +void __lockfunc rt_read_unlock(rwlock_t *rwlock) +{ + struct rt_mutex *lock = &rwlock->lock; + unsigned long flags; + + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); + // TRACE_WARN_ON(lock->save_state != 1); + /* + * Read locks within the self-held write lock succeed. + */ + spin_lock_irqsave(&lock->wait_lock, flags); + if (rt_mutex_real_owner(lock) == current && rwlock->read_depth) { + spin_unlock_irqrestore(&lock->wait_lock, flags); + rwlock->read_depth--; + return; + } + spin_unlock_irqrestore(&lock->wait_lock, flags); + __rt_spin_unlock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_read_unlock); + +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock) +{ + rt_write_lock(rwlock); + + return 0; +} +EXPORT_SYMBOL(rt_write_lock_irqsave); + +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock) +{ + rt_read_lock(rwlock); + + return 0; +} +EXPORT_SYMBOL(rt_read_lock_irqsave); + +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock)); + lockdep_init_map(&rwlock->dep_map, name, key, 0); +#endif + __rt_mutex_init(&rwlock->lock, name); + rwlock->read_depth = 0; +} +EXPORT_SYMBOL(__rt_rwlock_init); + +/* + * rw_semaphores + */ + +void fastcall rt_up_write(struct rw_semaphore *rwsem) +{ + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + rt_mutex_unlock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_up_write); + +void fastcall rt_up_read(struct rw_semaphore *rwsem) +{ + unsigned long flags; + + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + /* + * Read locks within the self-held write lock succeed. + */ + spin_lock_irqsave(&rwsem->lock.wait_lock, flags); + if (rt_mutex_real_owner(&rwsem->lock) == current && rwsem->read_depth) { + spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); + rwsem->read_depth--; + return; + } + spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); + rt_mutex_unlock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_up_read); + +/* + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +void fastcall rt_downgrade_write(struct rw_semaphore *rwsem) +{ + BUG(); +} +EXPORT_SYMBOL(rt_downgrade_write); + +int fastcall rt_down_write_trylock(struct rw_semaphore *rwsem) +{ + int ret = rt_mutex_trylock(&rwsem->lock); + + if (ret) + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(rt_down_write_trylock); + +void fastcall rt_down_write(struct rw_semaphore *rwsem) +{ + rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED_RT(rwsem, rt_mutex_trylock, rt_mutex_lock); +} +EXPORT_SYMBOL(rt_down_write); + +void fastcall rt_down_write_nested(struct rw_semaphore *rwsem, int subclass) +{ + rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED_RT(rwsem, rt_mutex_trylock, rt_mutex_lock); +} +EXPORT_SYMBOL(rt_down_write_nested); + +int fastcall rt_down_read_trylock(struct rw_semaphore *rwsem) +{ + unsigned long flags; + int ret; + + /* + * Read locks within the self-held write lock succeed. + */ + spin_lock_irqsave(&rwsem->lock.wait_lock, flags); + if (rt_mutex_real_owner(&rwsem->lock) == current) { + spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); + rwsem_acquire_read(&rwsem->dep_map, 0, 1, _RET_IP_); + rwsem->read_depth++; + return 1; + } + spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); + + ret = rt_mutex_trylock(&rwsem->lock); + if (ret) + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(rt_down_read_trylock); + +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass) +{ + unsigned long flags; + + rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_); + + /* + * Read locks within the write lock succeed. + */ + spin_lock_irqsave(&rwsem->lock.wait_lock, flags); + + if (rt_mutex_real_owner(&rwsem->lock) == current) { + spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); + rwsem->read_depth++; + return; + } + spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); + LOCK_CONTENDED_RT(rwsem, rt_mutex_trylock, rt_mutex_lock); +} + +void fastcall rt_down_read(struct rw_semaphore *rwsem) +{ + __rt_down_read(rwsem, 0); +} +EXPORT_SYMBOL(rt_down_read); + +void fastcall rt_down_read_nested(struct rw_semaphore *rwsem, int subclass) +{ + __rt_down_read(rwsem, subclass); +} +EXPORT_SYMBOL(rt_down_read_nested); + +void fastcall __rt_rwsem_init(struct rw_semaphore *rwsem, char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem)); + lockdep_init_map(&rwsem->dep_map, name, key, 0); +#endif + __rt_mutex_init(&rwsem->lock, name); + rwsem->read_depth = 0; +} +EXPORT_SYMBOL(__rt_rwsem_init); + +/* + * Semaphores + */ +/* + * Linux Semaphores implemented via RT-mutexes. + * + * In the down() variants we use the mutex as the semaphore blocking + * object: we always acquire it, decrease the counter and keep the lock + * locked if we did the 1->0 transition. The next down() will then block. + * + * In the up() path we atomically increase the counter and do the + * unlock if we were the one doing the 0->1 transition. + */ + +static inline void __down_complete(struct semaphore *sem) +{ + int count = atomic_dec_return(&sem->count); + + if (unlikely(count > 0)) + rt_mutex_unlock(&sem->lock); +} + +void fastcall rt_down(struct semaphore *sem) +{ + rt_mutex_lock(&sem->lock); + __down_complete(sem); +} +EXPORT_SYMBOL(rt_down); + +int fastcall rt_down_interruptible(struct semaphore *sem) +{ + int ret; + + ret = rt_mutex_lock_interruptible(&sem->lock, 0); + if (ret) + return ret; + __down_complete(sem); + return 0; +} +EXPORT_SYMBOL(rt_down_interruptible); + +/* + * try to down the semaphore, 0 on success and 1 on failure. (inverted) + */ +int fastcall rt_down_trylock(struct semaphore *sem) +{ + /* + * Here we are a tiny bit different from ordinary Linux semaphores, + * because we can get 'transient' locking-failures when say a + * process decreases the count from 9 to 8 and locks/releases the + * embedded mutex internally. It would be quite complex to remove + * these transient failures so lets try it the simple way first: + */ + if (rt_mutex_trylock(&sem->lock)) { + __down_complete(sem); + return 0; + } + return 1; +} +EXPORT_SYMBOL(rt_down_trylock); + +void fastcall rt_up(struct semaphore *sem) +{ + int count; + + /* + * Disable preemption to make sure a highprio trylock-er cannot + * preempt us here and get into an infinite loop: + */ + preempt_disable(); + count = atomic_inc_return(&sem->count); + /* + * If we did the 0 -> 1 transition then we are the ones to unlock it: + */ + if (likely(count == 1)) + rt_mutex_unlock(&sem->lock); + preempt_enable(); +} +EXPORT_SYMBOL(rt_up); + +void fastcall __sema_init(struct semaphore *sem, int val, + char *name, char *file, int line) +{ + atomic_set(&sem->count, val); + switch (val) { + case 0: + __rt_mutex_init(&sem->lock, name); + rt_mutex_lock(&sem->lock); + break; + default: + __rt_mutex_init(&sem->lock, name); + break; + } +} +EXPORT_SYMBOL(__sema_init); + +void fastcall __init_MUTEX(struct semaphore *sem, char *name, char *file, + int line) +{ + __sema_init(sem, 1, name, file, line); +} +EXPORT_SYMBOL(__init_MUTEX); + Index: linux-2.6.24-rc8-rt1/kernel/rtmutex-debug.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/rtmutex-debug.c 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/rtmutex-debug.c 2008-01-16 22:28:31.000000000 -0500 @@ -16,6 +16,7 @@ * * See rt.c in preempt-rt for proper credits and further information */ +#include #include #include #include @@ -29,61 +30,6 @@ #include "rtmutex_common.h" -# define TRACE_WARN_ON(x) WARN_ON(x) -# define TRACE_BUG_ON(x) BUG_ON(x) - -# define TRACE_OFF() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - if (spin_is_locked(¤t->pi_lock)) \ - spin_unlock(¤t->pi_lock); \ - } \ -} while (0) - -# define TRACE_OFF_NOLOCK() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - } \ -} while (0) - -# define TRACE_BUG_LOCKED() \ -do { \ - TRACE_OFF(); \ - BUG(); \ -} while (0) - -# define TRACE_WARN_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) { \ - TRACE_OFF(); \ - WARN_ON(1); \ - } \ -} while (0) - -# define TRACE_BUG_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) \ - TRACE_BUG_LOCKED(); \ -} while (0) - -#ifdef CONFIG_SMP -# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) -#else -# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) -#endif - -/* - * deadlock detection flag. We turn it off when we detect - * the first problem because we dont want to recurse back - * into the tracing code when doing error printk or - * executing a BUG(): - */ -static int rt_trace_on = 1; - static void printk_task(struct task_struct *p) { if (p) @@ -111,8 +57,8 @@ static void printk_lock(struct rt_mutex void rt_mutex_debug_task_free(struct task_struct *task) { - WARN_ON(!plist_head_empty(&task->pi_waiters)); - WARN_ON(task->pi_blocked_on); + DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } /* @@ -125,7 +71,7 @@ void debug_rt_mutex_deadlock(int detect, { struct task_struct *task; - if (!rt_trace_on || detect || !act_waiter) + if (!debug_locks || detect || !act_waiter) return; task = rt_mutex_owner(act_waiter->lock); @@ -139,14 +85,15 @@ void debug_rt_mutex_print_deadlock(struc { struct task_struct *task; - if (!waiter->deadlock_lock || !rt_trace_on) + if (!waiter->deadlock_lock || !debug_locks) return; task = find_task_by_pid(waiter->deadlock_task_pid); if (!task) return; - TRACE_OFF_NOLOCK(); + if (!debug_locks_off()) + return; printk("\n============================================\n"); printk( "[ BUG: circular locking deadlock detected! ]\n"); @@ -176,7 +123,6 @@ void debug_rt_mutex_print_deadlock(struc printk("[ turning off deadlock detection." "Please report this trace. ]\n\n"); - local_irq_disable(); } void debug_rt_mutex_lock(struct rt_mutex *lock) @@ -185,7 +131,8 @@ void debug_rt_mutex_lock(struct rt_mutex void debug_rt_mutex_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); + if (debug_locks) + DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); } void @@ -195,7 +142,7 @@ debug_rt_mutex_proxy_lock(struct rt_mute void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); + DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); } void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) @@ -207,9 +154,9 @@ void debug_rt_mutex_init_waiter(struct r void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { - TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); - TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - TRACE_WARN_ON(waiter->task); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + DEBUG_LOCKS_WARN_ON(waiter->task); memset(waiter, 0x22, sizeof(*waiter)); } @@ -225,9 +172,36 @@ void debug_rt_mutex_init(struct rt_mutex void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) { +#ifdef CONFIG_DEBUG_PREEMPT + if (task->lock_count >= MAX_LOCK_STACK) { + if (!debug_locks_off()) + return; + printk("BUG: %s/%d: lock count overflow!\n", + task->comm, task->pid); + dump_stack(); + return; + } +#ifdef CONFIG_PREEMPT_RT + task->owned_lock[task->lock_count] = lock; +#endif + task->lock_count++; +#endif } void rt_mutex_deadlock_account_unlock(struct task_struct *task) { +#ifdef CONFIG_DEBUG_PREEMPT + if (!task->lock_count) { + if (!debug_locks_off()) + return; + printk("BUG: %s/%d: lock count underflow!\n", + task->comm, task->pid); + dump_stack(); + return; + } + task->lock_count--; +#ifdef CONFIG_PREEMPT_RT + task->owned_lock[task->lock_count] = NULL; +#endif +#endif } - Index: linux-2.6.24-rc8-rt1/kernel/rtmutex.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/rtmutex.c 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/rtmutex.c 2008-01-16 22:29:25.000000000 -0500 @@ -14,6 +14,7 @@ #include #include #include +#include #include "rtmutex_common.h" @@ -97,6 +98,22 @@ static inline void mark_rt_mutex_waiters } #endif +int pi_initialized; + +/* + * we initialize the wait_list runtime. (Could be done build-time and/or + * boot-time.) + */ +static inline void init_lists(struct rt_mutex *lock) +{ + if (unlikely(!lock->wait_list.prio_list.prev)) { + plist_head_init(&lock->wait_list, &lock->wait_lock); +#ifdef CONFIG_DEBUG_RT_MUTEXES + pi_initialized++; +#endif + } +} + /* * Calculate task priority from the waiter list priority * @@ -105,11 +122,12 @@ static inline void mark_rt_mutex_waiters */ int rt_mutex_getprio(struct task_struct *task) { + int prio = min(task->normal_prio, get_rcu_prio(task)); + if (likely(!task_has_pi_waiters(task))) - return task->normal_prio; + return prio; - return min(task_top_pi_waiter(task)->pi_list_entry.prio, - task->normal_prio); + return min(task_top_pi_waiter(task)->pi_list_entry.prio, prio); } /* @@ -253,13 +271,13 @@ static int rt_mutex_adjust_prio_chain(st plist_add(&waiter->list_entry, &lock->wait_list); /* Release the task */ - spin_unlock_irqrestore(&task->pi_lock, flags); + spin_unlock(&task->pi_lock); put_task_struct(task); /* Grab the next task */ task = rt_mutex_owner(lock); get_task_struct(task); - spin_lock_irqsave(&task->pi_lock, flags); + spin_lock(&task->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { /* Boost the owner */ @@ -277,10 +295,10 @@ static int rt_mutex_adjust_prio_chain(st __rt_mutex_adjust_prio(task); } - spin_unlock_irqrestore(&task->pi_lock, flags); + spin_unlock(&task->pi_lock); top_waiter = rt_mutex_top_waiter(lock); - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); if (!detect_deadlock && waiter != top_waiter) goto out_put_task; @@ -304,7 +322,6 @@ static inline int try_to_steal_lock(stru { struct task_struct *pendowner = rt_mutex_owner(lock); struct rt_mutex_waiter *next; - unsigned long flags; if (!rt_mutex_owner_pending(lock)) return 0; @@ -312,9 +329,9 @@ static inline int try_to_steal_lock(stru if (pendowner == current) return 1; - spin_lock_irqsave(&pendowner->pi_lock, flags); + spin_lock(&pendowner->pi_lock); if (current->prio >= pendowner->prio) { - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + spin_unlock(&pendowner->pi_lock); return 0; } @@ -324,7 +341,7 @@ static inline int try_to_steal_lock(stru * priority. */ if (likely(!rt_mutex_has_waiters(lock))) { - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + spin_unlock(&pendowner->pi_lock); return 1; } @@ -332,7 +349,7 @@ static inline int try_to_steal_lock(stru next = rt_mutex_top_waiter(lock); plist_del(&next->pi_list_entry, &pendowner->pi_waiters); __rt_mutex_adjust_prio(pendowner); - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + spin_unlock(&pendowner->pi_lock); /* * We are going to steal the lock and a waiter was @@ -349,10 +366,10 @@ static inline int try_to_steal_lock(stru * might be current: */ if (likely(next->task != current)) { - spin_lock_irqsave(¤t->pi_lock, flags); + spin_lock(¤t->pi_lock); plist_add(&next->pi_list_entry, ¤t->pi_waiters); __rt_mutex_adjust_prio(current); - spin_unlock_irqrestore(¤t->pi_lock, flags); + spin_unlock(¤t->pi_lock); } return 1; } @@ -411,14 +428,13 @@ static int try_to_take_rt_mutex(struct r */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, - int detect_deadlock) + int detect_deadlock, unsigned long flags) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; - unsigned long flags; int chain_walk = 0, res; - spin_lock_irqsave(¤t->pi_lock, flags); + spin_lock(¤t->pi_lock); __rt_mutex_adjust_prio(current); waiter->task = current; waiter->lock = lock; @@ -432,17 +448,17 @@ static int task_blocks_on_rt_mutex(struc current->pi_blocked_on = waiter; - spin_unlock_irqrestore(¤t->pi_lock, flags); + spin_unlock(¤t->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { - spin_lock_irqsave(&owner->pi_lock, flags); + spin_lock(&owner->pi_lock); plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); plist_add(&waiter->pi_list_entry, &owner->pi_waiters); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; - spin_unlock_irqrestore(&owner->pi_lock, flags); + spin_unlock(&owner->pi_lock); } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) chain_walk = 1; @@ -457,12 +473,12 @@ static int task_blocks_on_rt_mutex(struc */ get_task_struct(owner); - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, current); - spin_lock(&lock->wait_lock); + spin_lock_irq(&lock->wait_lock); return res; } @@ -475,13 +491,12 @@ static int task_blocks_on_rt_mutex(struc * * Called with lock->wait_lock held. */ -static void wakeup_next_waiter(struct rt_mutex *lock) +static void wakeup_next_waiter(struct rt_mutex *lock, int savestate) { struct rt_mutex_waiter *waiter; struct task_struct *pendowner; - unsigned long flags; - spin_lock_irqsave(¤t->pi_lock, flags); + spin_lock(¤t->pi_lock); waiter = rt_mutex_top_waiter(lock); plist_del(&waiter->list_entry, &lock->wait_list); @@ -498,7 +513,7 @@ static void wakeup_next_waiter(struct rt rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); - spin_unlock_irqrestore(¤t->pi_lock, flags); + spin_unlock(¤t->pi_lock); /* * Clear the pi_blocked_on variable and enqueue a possible @@ -507,7 +522,7 @@ static void wakeup_next_waiter(struct rt * waiter with higher priority than pending-owner->normal_prio * is blocked on the unboosted (pending) owner. */ - spin_lock_irqsave(&pendowner->pi_lock, flags); + spin_lock(&pendowner->pi_lock); WARN_ON(!pendowner->pi_blocked_on); WARN_ON(pendowner->pi_blocked_on != waiter); @@ -521,9 +536,12 @@ static void wakeup_next_waiter(struct rt next = rt_mutex_top_waiter(lock); plist_add(&next->pi_list_entry, &pendowner->pi_waiters); } - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + spin_unlock(&pendowner->pi_lock); - wake_up_process(pendowner); + if (savestate) + wake_up_process_mutex(pendowner); + else + wake_up_process(pendowner); } /* @@ -532,22 +550,22 @@ static void wakeup_next_waiter(struct rt * Must be called with lock->wait_lock held */ static void remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter, + unsigned long flags) { int first = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); - unsigned long flags; int chain_walk = 0; - spin_lock_irqsave(¤t->pi_lock, flags); + spin_lock(¤t->pi_lock); plist_del(&waiter->list_entry, &lock->wait_list); waiter->task = NULL; current->pi_blocked_on = NULL; - spin_unlock_irqrestore(¤t->pi_lock, flags); + spin_unlock(¤t->pi_lock); if (first && owner != current) { - spin_lock_irqsave(&owner->pi_lock, flags); + spin_lock(&owner->pi_lock); plist_del(&waiter->pi_list_entry, &owner->pi_waiters); @@ -562,7 +580,7 @@ static void remove_waiter(struct rt_mute if (owner->pi_blocked_on) chain_walk = 1; - spin_unlock_irqrestore(&owner->pi_lock, flags); + spin_unlock(&owner->pi_lock); } WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); @@ -573,11 +591,11 @@ static void remove_waiter(struct rt_mute /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(owner); - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); - spin_lock(&lock->wait_lock); + spin_lock_irq(&lock->wait_lock); } /* @@ -598,14 +616,319 @@ void rt_mutex_adjust_pi(struct task_stru return; } - spin_unlock_irqrestore(&task->pi_lock, flags); - /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); + spin_unlock_irqrestore(&task->pi_lock, flags); + rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } /* + * preemptible spin_lock functions: + */ + +#ifdef CONFIG_PREEMPT_RT + +static inline void +rt_spin_lock_fastlock(struct rt_mutex *lock, + void fastcall (*slowfn)(struct rt_mutex *lock)) +{ + /* Temporary HACK! */ + if (!current->in_printk) + might_sleep(); + else if (in_atomic() || irqs_disabled()) + /* don't grab locks for printk in atomic */ + return; + + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) + rt_mutex_deadlock_account_lock(lock, current); + else + slowfn(lock); +} + +static inline void +rt_spin_lock_fastunlock(struct rt_mutex *lock, + void fastcall (*slowfn)(struct rt_mutex *lock)) +{ + /* Temporary HACK! */ + if (current->in_printk && (in_atomic() || irqs_disabled())) + /* don't grab locks for printk in atomic */ + return; + + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + rt_mutex_deadlock_account_unlock(current); + else + slowfn(lock); +} + +/* + * Slow path lock function spin_lock style: this variant is very + * careful not to miss any non-lock wakeups. + * + * The wakeup side uses wake_up_process_mutex, which, combined with + * the xchg code of this function is a transparent sleep/wakeup + * mechanism nested within any existing sleep/wakeup mechanism. This + * enables the seemless use of arbitrary (blocking) spinlocks within + * sleep/wakeup event loops. + */ +static void fastcall noinline __sched +rt_spin_lock_slowlock(struct rt_mutex *lock) +{ + struct rt_mutex_waiter waiter; + unsigned long saved_state, state, flags; + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + + spin_lock_irqsave(&lock->wait_lock, flags); + init_lists(lock); + + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock)) { + spin_unlock_irqrestore(&lock->wait_lock, flags); + return; + } + + BUG_ON(rt_mutex_owner(lock) == current); + + /* + * Here we save whatever state the task was in originally, + * we'll restore it at the end of the function and we'll take + * any intermediate wakeup into account as well, independently + * of the lock sleep/wakeup mechanism. When we get a real + * wakeup the task->state is TASK_RUNNING and we change + * saved_state accordingly. If we did not get a real wakeup + * then we return with the saved state. + */ + saved_state = xchg(¤t->state, TASK_UNINTERRUPTIBLE); + + for (;;) { + unsigned long saved_flags; + int saved_lock_depth = current->lock_depth; + + /* Try to acquire the lock */ + if (try_to_take_rt_mutex(lock)) + break; + /* + * waiter.task is NULL the first time we come here and + * when we have been woken up by the previous owner + * but the lock got stolen by an higher prio task. + */ + if (!waiter.task) { + task_blocks_on_rt_mutex(lock, &waiter, 0, flags); + /* Wakeup during boost ? */ + if (unlikely(!waiter.task)) + continue; + } + + /* + * Prevent schedule() to drop BKL, while waiting for + * the lock ! We restore lock_depth when we come back. + */ + saved_flags = current->flags & PF_NOSCHED; + current->lock_depth = -1; + current->flags &= ~PF_NOSCHED; + spin_unlock_irqrestore(&lock->wait_lock, flags); + + debug_rt_mutex_print_deadlock(&waiter); + + schedule_rt_mutex(lock); + + spin_lock_irqsave(&lock->wait_lock, flags); + current->flags |= saved_flags; + current->lock_depth = saved_lock_depth; + state = xchg(¤t->state, TASK_UNINTERRUPTIBLE); + if (unlikely(state == TASK_RUNNING)) + saved_state = TASK_RUNNING; + } + + state = xchg(¤t->state, saved_state); + if (unlikely(state == TASK_RUNNING)) + current->state = TASK_RUNNING; + + /* + * Extremely rare case, if we got woken up by a non-mutex wakeup, + * and we managed to steal the lock despite us not being the + * highest-prio waiter (due to SCHED_OTHER changing prio), then we + * can end up with a non-NULL waiter.task: + */ + if (unlikely(waiter.task)) + remove_waiter(lock, &waiter, flags); + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. We might have to fix that up: + */ + fixup_rt_mutex_waiters(lock); + + spin_unlock_irqrestore(&lock->wait_lock, flags); + + debug_rt_mutex_free_waiter(&waiter); +} + +/* + * Slow path to release a rt_mutex spin_lock style + */ +static void fastcall noinline __sched +rt_spin_lock_slowunlock(struct rt_mutex *lock) +{ + unsigned long flags; + + spin_lock_irqsave(&lock->wait_lock, flags); + + debug_rt_mutex_unlock(lock); + + rt_mutex_deadlock_account_unlock(current); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + spin_unlock_irqrestore(&lock->wait_lock, flags); + return; + } + + wakeup_next_waiter(lock, 1); + + spin_unlock_irqrestore(&lock->wait_lock, flags); + + /* Undo pi boosting.when necessary */ + rt_mutex_adjust_prio(current); +} + +void __lockfunc rt_spin_lock(spinlock_t *lock) +{ + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED_RT(lock, rt_mutex_trylock, __rt_spin_lock); +} +EXPORT_SYMBOL(rt_spin_lock); + +void __lockfunc __rt_spin_lock(struct rt_mutex *lock) +{ + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock); +} +EXPORT_SYMBOL(__rt_spin_lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) +{ + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED_RT(lock, rt_mutex_trylock, __rt_spin_lock); +} +EXPORT_SYMBOL(rt_spin_lock_nested); + +#endif + +void __lockfunc rt_spin_unlock(spinlock_t *lock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + spin_release(&lock->dep_map, 1, _RET_IP_); + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(rt_spin_unlock); + +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock) +{ + rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(__rt_spin_unlock); + +/* + * Wait for the lock to get unlocked: instead of polling for an unlock + * (like raw spinlocks do), we lock and unlock, to force the kernel to + * schedule if there's contention: + */ +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock) +{ + spin_lock(lock); + spin_unlock(lock); +} +EXPORT_SYMBOL(rt_spin_unlock_wait); + +int __lockfunc rt_spin_trylock(spinlock_t *lock) +{ + int ret = rt_mutex_trylock(&lock->lock); + + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock); + +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) +{ + int ret; + + *flags = 0; + ret = rt_mutex_trylock(&lock->lock); + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock_irqsave); + +int _atomic_dec_and_spin_lock(spinlock_t *lock, atomic_t *atomic) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + rt_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + rt_spin_unlock(lock); + return 0; +} +EXPORT_SYMBOL(_atomic_dec_and_spin_lock); + +void +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + __rt_mutex_init(&lock->lock, name); +} +EXPORT_SYMBOL(__rt_spin_lock_init); + +#endif + +#ifdef CONFIG_PREEMPT_BKL + +static inline int rt_release_bkl(struct rt_mutex *lock, unsigned long flags) +{ + int saved_lock_depth = current->lock_depth; + + current->lock_depth = -1; + /* + * try_to_take_lock set the waiters, make sure it's + * still correct. + */ + fixup_rt_mutex_waiters(lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); + + up(&kernel_sem); + + spin_lock_irq(&lock->wait_lock); + + return saved_lock_depth; +} + +static inline void rt_reacquire_bkl(int saved_lock_depth) +{ + down(&kernel_sem); + current->lock_depth = saved_lock_depth; +} + +#else +# define rt_release_bkl(lock, flags) (-1) +# define rt_reacquire_bkl(depth) do { } while (0) +#endif + +/* * Slow path lock function: */ static int __sched @@ -613,20 +936,29 @@ rt_mutex_slowlock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, int detect_deadlock) { + int ret = 0, saved_lock_depth = -1; struct rt_mutex_waiter waiter; - int ret = 0; + unsigned long flags; debug_rt_mutex_init_waiter(&waiter); waiter.task = NULL; - spin_lock(&lock->wait_lock); + spin_lock_irqsave(&lock->wait_lock, flags); + init_lists(lock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock)) { - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); return 0; } + /* + * We drop the BKL here before we go into the wait loop to avoid a + * possible deadlock in the scheduler. + */ + if (unlikely(current->lock_depth >= 0)) + saved_lock_depth = rt_release_bkl(lock, flags); + set_current_state(state); /* Setup the timer, when timeout != NULL */ @@ -635,6 +967,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, HRTIMER_MODE_ABS); for (;;) { + unsigned long saved_flags; + /* Try to acquire the lock: */ if (try_to_take_rt_mutex(lock)) break; @@ -660,7 +994,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, */ if (!waiter.task) { ret = task_blocks_on_rt_mutex(lock, &waiter, - detect_deadlock); + detect_deadlock, flags); /* * If we got woken up by the owner then start loop * all over without going into schedule to try @@ -679,22 +1013,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, if (unlikely(ret)) break; } + saved_flags = current->flags & PF_NOSCHED; + current->flags &= ~PF_NOSCHED; - spin_unlock(&lock->wait_lock); + spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(&waiter); if (waiter.task) schedule_rt_mutex(lock); - spin_lock(&lock->wait_lock); + spin_lock_irq(&lock->wait_lock); + + current->flags |= saved_flags; set_current_state(state); } set_current_state(TASK_RUNNING); if (unlikely(waiter.task)) - remove_waiter(lock, &waiter); + remove_waiter(lock, &waiter, flags); /* * try_to_take_rt_mutex() sets the waiter bit @@ -702,7 +1040,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, */ fixup_rt_mutex_waiters(lock); - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); /* Remove pending timer: */ if (unlikely(timeout)) @@ -716,6 +1054,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, if (unlikely(ret)) rt_mutex_adjust_prio(current); + /* Must we reaquire the BKL? */ + if (unlikely(saved_lock_depth >= 0)) + rt_reacquire_bkl(saved_lock_depth); + debug_rt_mutex_free_waiter(&waiter); return ret; @@ -727,12 +1069,15 @@ rt_mutex_slowlock(struct rt_mutex *lock, static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) { + unsigned long flags; int ret = 0; - spin_lock(&lock->wait_lock); + spin_lock_irqsave(&lock->wait_lock, flags); if (likely(rt_mutex_owner(lock) != current)) { + init_lists(lock); + ret = try_to_take_rt_mutex(lock); /* * try_to_take_rt_mutex() sets the lock waiters @@ -741,7 +1086,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lo fixup_rt_mutex_waiters(lock); } - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); return ret; } @@ -752,7 +1097,9 @@ rt_mutex_slowtrylock(struct rt_mutex *lo static void __sched rt_mutex_slowunlock(struct rt_mutex *lock) { - spin_lock(&lock->wait_lock); + unsigned long flags; + + spin_lock_irqsave(&lock->wait_lock, flags); debug_rt_mutex_unlock(lock); @@ -760,13 +1107,13 @@ rt_mutex_slowunlock(struct rt_mutex *loc if (!rt_mutex_has_waiters(lock)) { lock->owner = NULL; - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); return; } - wakeup_next_waiter(lock); + wakeup_next_waiter(lock, 0); - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); /* Undo pi boosting if necessary: */ rt_mutex_adjust_prio(current); Index: linux-2.6.24-rc8-rt1/kernel/rwsem.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/rwsem.c 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/rwsem.c 2008-01-16 22:28:31.000000000 -0500 @@ -16,7 +16,7 @@ /* * lock for reading */ -void __sched down_read(struct rw_semaphore *sem) +void __sched compat_down_read(struct compat_rw_semaphore *sem) { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); @@ -24,12 +24,12 @@ void __sched down_read(struct rw_semapho LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } -EXPORT_SYMBOL(down_read); +EXPORT_SYMBOL(compat_down_read); /* * trylock for reading -- returns 1 if successful, 0 if contention */ -int down_read_trylock(struct rw_semaphore *sem) +int compat_down_read_trylock(struct compat_rw_semaphore *sem) { int ret = __down_read_trylock(sem); @@ -38,12 +38,12 @@ int down_read_trylock(struct rw_semaphor return ret; } -EXPORT_SYMBOL(down_read_trylock); +EXPORT_SYMBOL(compat_down_read_trylock); /* * lock for writing */ -void __sched down_write(struct rw_semaphore *sem) +void __sched compat_down_write(struct compat_rw_semaphore *sem) { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); @@ -51,12 +51,12 @@ void __sched down_write(struct rw_semaph LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } -EXPORT_SYMBOL(down_write); +EXPORT_SYMBOL(compat_down_write); /* * trylock for writing -- returns 1 if successful, 0 if contention */ -int down_write_trylock(struct rw_semaphore *sem) +int compat_down_write_trylock(struct compat_rw_semaphore *sem) { int ret = __down_write_trylock(sem); @@ -65,36 +65,36 @@ int down_write_trylock(struct rw_semapho return ret; } -EXPORT_SYMBOL(down_write_trylock); +EXPORT_SYMBOL(compat_down_write_trylock); /* * release a read lock */ -void up_read(struct rw_semaphore *sem) +void compat_up_read(struct compat_rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); __up_read(sem); } -EXPORT_SYMBOL(up_read); +EXPORT_SYMBOL(compat_up_read); /* * release a write lock */ -void up_write(struct rw_semaphore *sem) +void compat_up_write(struct compat_rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); __up_write(sem); } -EXPORT_SYMBOL(up_write); +EXPORT_SYMBOL(compat_up_write); /* * downgrade write lock to read lock */ -void downgrade_write(struct rw_semaphore *sem) +void compat_downgrade_write(struct compat_rw_semaphore *sem) { /* * lockdep: a downgraded write will live on as a write @@ -103,11 +103,11 @@ void downgrade_write(struct rw_semaphore __downgrade_write(sem); } -EXPORT_SYMBOL(downgrade_write); +EXPORT_SYMBOL(compat_downgrade_write); #ifdef CONFIG_DEBUG_LOCK_ALLOC -void down_read_nested(struct rw_semaphore *sem, int subclass) +void compat_down_read_nested(struct compat_rw_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); @@ -115,18 +115,18 @@ void down_read_nested(struct rw_semaphor LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } -EXPORT_SYMBOL(down_read_nested); +EXPORT_SYMBOL(compat_down_read_nested); -void down_read_non_owner(struct rw_semaphore *sem) +void compat_down_read_non_owner(struct compat_rw_semaphore *sem) { might_sleep(); __down_read(sem); } -EXPORT_SYMBOL(down_read_non_owner); +EXPORT_SYMBOL(compat_down_read_non_owner); -void down_write_nested(struct rw_semaphore *sem, int subclass) +void compat_down_write_nested(struct compat_rw_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); @@ -134,14 +134,14 @@ void down_write_nested(struct rw_semapho LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } -EXPORT_SYMBOL(down_write_nested); +EXPORT_SYMBOL(compat_down_write_nested); -void up_read_non_owner(struct rw_semaphore *sem) +void compat_up_read_non_owner(struct compat_rw_semaphore *sem) { __up_read(sem); } -EXPORT_SYMBOL(up_read_non_owner); +EXPORT_SYMBOL(compat_up_read_non_owner); #endif Index: linux-2.6.24-rc8-rt1/lib/dec_and_lock.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/lib/dec_and_lock.c 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/lib/dec_and_lock.c 2008-01-16 22:29:18.000000000 -0500 @@ -17,7 +17,7 @@ * because the spin-lock and the decrement must be * "atomic". */ -int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +int __atomic_dec_and_spin_lock(raw_spinlock_t *lock, atomic_t *atomic) { #ifdef CONFIG_SMP /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ @@ -32,4 +32,4 @@ int _atomic_dec_and_lock(atomic_t *atomi return 0; } -EXPORT_SYMBOL(_atomic_dec_and_lock); +EXPORT_SYMBOL(__atomic_dec_and_spin_lock); Index: linux-2.6.24-rc8-rt1/lib/kernel_lock.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/lib/kernel_lock.c 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/lib/kernel_lock.c 2008-01-16 22:28:49.000000000 -0500 @@ -24,7 +24,7 @@ * * Don't use in new code. */ -static DECLARE_MUTEX(kernel_sem); +DECLARE_MUTEX(kernel_sem); /* * Re-acquire the kernel semaphore. @@ -35,22 +35,25 @@ static DECLARE_MUTEX(kernel_sem); * about recursion, both due to the down() and due to the enabling of * preemption. schedule() will re-check the preemption flag after * reacquiring the semaphore. + * + * Called with interrupts disabled. */ int __lockfunc __reacquire_kernel_lock(void) { struct task_struct *task = current; int saved_lock_depth = task->lock_depth; + local_irq_enable(); BUG_ON(saved_lock_depth < 0); task->lock_depth = -1; - preempt_enable_no_resched(); down(&kernel_sem); - preempt_disable(); task->lock_depth = saved_lock_depth; + local_irq_disable(); + return 0; } @@ -67,11 +70,15 @@ void __lockfunc lock_kernel(void) struct task_struct *task = current; int depth = task->lock_depth + 1; - if (likely(!depth)) + if (likely(!depth)) { /* * No recursion worries - we set up lock_depth _after_ */ down(&kernel_sem); +#ifdef CONFIG_DEBUG_RT_MUTEXES + current->last_kernel_lock = __builtin_return_address(0); +#endif + } task->lock_depth = depth; } @@ -82,8 +89,12 @@ void __lockfunc unlock_kernel(void) BUG_ON(task->lock_depth < 0); - if (likely(--task->lock_depth < 0)) + if (likely(--task->lock_depth == -1)) { +#ifdef CONFIG_DEBUG_RT_MUTEXES + current->last_kernel_lock = NULL; +#endif up(&kernel_sem); + } } #else @@ -116,11 +127,9 @@ static __cacheline_aligned_in_smp DEFIN */ int __lockfunc __reacquire_kernel_lock(void) { - while (!_raw_spin_trylock(&kernel_flag)) { - if (test_thread_flag(TIF_NEED_RESCHED)) - return -EAGAIN; - cpu_relax(); - } + local_irq_enable(); + _raw_spin_lock(&kernel_flag); + local_irq_disable(); preempt_disable(); return 0; } Index: linux-2.6.24-rc8-rt1/lib/locking-selftest.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/lib/locking-selftest.c 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/lib/locking-selftest.c 2008-01-16 22:28:49.000000000 -0500 @@ -158,7 +158,7 @@ static void init_shared_classes(void) local_bh_disable(); \ local_irq_disable(); \ trace_softirq_enter(); \ - WARN_ON(!in_softirq()); + /* FIXME: preemptible softirqs. WARN_ON(!in_softirq()); */ #define SOFTIRQ_EXIT() \ trace_softirq_exit(); \ @@ -550,6 +550,11 @@ GENERATE_TESTCASE(init_held_rsem) #undef E /* + * FIXME: turns these into raw-spinlock tests on -rt + */ +#ifndef CONFIG_PREEMPT_RT + +/* * locking an irq-safe lock with irqs enabled: */ #define E1() \ @@ -890,6 +895,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_ #include "locking-selftest-softirq.h" // GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft) +#endif /* !CONFIG_PREEMPT_RT */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC # define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map) # define I_RWLOCK(x) lockdep_reset_lock(&rwlock_##x.dep_map) @@ -940,6 +947,9 @@ static void dotest(void (*testcase_fn)(v { unsigned long saved_preempt_count = preempt_count(); int expected_failure = 0; +#if defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_DEBUG_RT_MUTEXES) + int saved_lock_count = current->lock_count; +#endif WARN_ON(irqs_disabled()); @@ -989,6 +999,9 @@ static void dotest(void (*testcase_fn)(v #endif reset_locks(); +#if defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_DEBUG_RT_MUTEXES) + current->lock_count = saved_lock_count; +#endif } static inline void print_testname(const char *testname) @@ -998,7 +1011,7 @@ static inline void print_testname(const #define DO_TESTCASE_1(desc, name, nr) \ print_testname(desc"/"#nr); \ - dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ + dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ printk("\n"); #define DO_TESTCASE_1B(desc, name, nr) \ @@ -1006,17 +1019,17 @@ static inline void print_testname(const dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK); \ printk("\n"); -#define DO_TESTCASE_3(desc, name, nr) \ - print_testname(desc"/"#nr); \ - dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \ - dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ +#define DO_TESTCASE_3(desc, name, nr) \ + print_testname(desc"/"#nr); \ + dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \ + dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ printk("\n"); -#define DO_TESTCASE_3RW(desc, name, nr) \ - print_testname(desc"/"#nr); \ +#define DO_TESTCASE_3RW(desc, name, nr) \ + print_testname(desc"/"#nr); \ dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN|LOCKTYPE_RWLOCK);\ - dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ + dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ printk("\n"); @@ -1047,7 +1060,7 @@ static inline void print_testname(const print_testname(desc); \ dotest(name##_spin, FAILURE, LOCKTYPE_SPIN); \ dotest(name##_wlock, FAILURE, LOCKTYPE_RWLOCK); \ - dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ + dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \ dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ @@ -1179,6 +1192,7 @@ void locking_selftest(void) /* * irq-context testcases: */ +#ifndef CONFIG_PREEMPT_RT DO_TESTCASE_2x6("irqs-on + irq-safe-A", irqsafe1); DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A); DO_TESTCASE_2x6("safe-A + irqs-on", irqsafe2B); @@ -1188,6 +1202,7 @@ void locking_selftest(void) DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion); // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2); +#endif if (unexpected_testcase_failures) { printk("-----------------------------------------------------------------\n"); Index: linux-2.6.24-rc8-rt1/lib/plist.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/lib/plist.c 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/lib/plist.c 2008-01-16 22:29:22.000000000 -0500 @@ -53,7 +53,9 @@ static void plist_check_list(struct list static void plist_check_head(struct plist_head *head) { +#ifndef CONFIG_PREEMPT_RT WARN_ON(!head->lock); +#endif if (head->lock) WARN_ON_SMP(!spin_is_locked(head->lock)); plist_check_list(&head->prio_list); @@ -64,6 +66,30 @@ static void plist_check_head(struct plis # define plist_check_head(h) do { } while (0) #endif +static inline struct plist_node *prev_node(struct plist_node *iter) +{ + return list_entry(iter->plist.node_list.prev, struct plist_node, + plist.node_list); +} + +static inline struct plist_node *next_node(struct plist_node *iter) +{ + return list_entry(iter->plist.node_list.next, struct plist_node, + plist.node_list); +} + +static inline struct plist_node *prev_prio(struct plist_node *iter) +{ + return list_entry(iter->plist.prio_list.prev, struct plist_node, + plist.prio_list); +} + +static inline struct plist_node *next_prio(struct plist_node *iter) +{ + return list_entry(iter->plist.prio_list.next, struct plist_node, + plist.prio_list); +} + /** * plist_add - add @node to @head * @@ -81,8 +107,7 @@ void plist_add(struct plist_node *node, if (node->prio < iter->prio) goto lt_prio; else if (node->prio == iter->prio) { - iter = list_entry(iter->plist.prio_list.next, - struct plist_node, plist.prio_list); + iter = next_prio(iter); goto eq_prio; } } @@ -116,3 +141,44 @@ void plist_del(struct plist_node *node, plist_check_head(head); } + +void plist_head_splice(struct plist_head *src, struct plist_head *dst) +{ + struct plist_node *src_iter_first, *src_iter_last, *dst_iter; + struct plist_node *tail = container_of(dst, struct plist_node, plist); + + dst_iter = next_prio(tail); + + while (!plist_head_empty(src) && dst_iter != tail) { + src_iter_first = plist_first(src); + + src_iter_last = next_prio(src_iter_first); + src_iter_last = prev_node(src_iter_last); + + WARN_ON(src_iter_first->prio != src_iter_last->prio); + WARN_ON(list_empty(&src_iter_first->plist.prio_list)); + + while (src_iter_first->prio > dst_iter->prio) { + dst_iter = next_prio(dst_iter); + if (dst_iter == tail) + goto tail; + } + + list_del_init(&src_iter_first->plist.prio_list); + + if (src_iter_first->prio < dst_iter->prio) { + list_add_tail(&src_iter_first->plist.prio_list, + &dst_iter->plist.prio_list); + } else if (src_iter_first->prio == dst_iter->prio) { + dst_iter = next_prio(dst_iter); + } else BUG(); + + list_splice2_tail(&src_iter_first->plist.node_list, + &src_iter_last->plist.node_list, + &dst_iter->plist.node_list); + } + +tail: + list_splice_tail_init(&src->prio_list, &dst->prio_list); + list_splice_tail_init(&src->node_list, &dst->node_list); +} Index: linux-2.6.24-rc8-rt1/lib/rwsem-spinlock.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/lib/rwsem-spinlock.c 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/lib/rwsem-spinlock.c 2008-01-16 22:28:31.000000000 -0500 @@ -20,7 +20,7 @@ struct rwsem_waiter { /* * initialise the semaphore */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, +void __compat_init_rwsem(struct compat_rw_semaphore *sem, const char *name, struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -44,8 +44,8 @@ void __init_rwsem(struct rw_semaphore *s * - woken process blocks are discarded from the list after having task zeroed * - writers are only woken if wakewrite is non-zero */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) +static inline struct compat_rw_semaphore * +__rwsem_do_wake(struct compat_rw_semaphore *sem, int wakewrite) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -103,8 +103,8 @@ __rwsem_do_wake(struct rw_semaphore *sem /* * wake a single writer */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) +static inline struct compat_rw_semaphore * +__rwsem_wake_one_writer(struct compat_rw_semaphore *sem) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -125,7 +125,7 @@ __rwsem_wake_one_writer(struct rw_semaph /* * get a read lock on the semaphore */ -void fastcall __sched __down_read(struct rw_semaphore *sem) +void fastcall __sched __down_read(struct compat_rw_semaphore *sem) { struct rwsem_waiter waiter; struct task_struct *tsk; @@ -168,7 +168,7 @@ void fastcall __sched __down_read(struct /* * trylock for reading -- returns 1 if successful, 0 if contention */ -int fastcall __down_read_trylock(struct rw_semaphore *sem) +int fastcall __down_read_trylock(struct compat_rw_semaphore *sem) { unsigned long flags; int ret = 0; @@ -191,7 +191,8 @@ int fastcall __down_read_trylock(struct * get a write lock on the semaphore * - we increment the waiting count anyway to indicate an exclusive lock */ -void fastcall __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +void fastcall __sched +__down_write_nested(struct compat_rw_semaphore *sem, int subclass) { struct rwsem_waiter waiter; struct task_struct *tsk; @@ -231,7 +232,7 @@ void fastcall __sched __down_write_neste ; } -void fastcall __sched __down_write(struct rw_semaphore *sem) +void fastcall __sched __down_write(struct compat_rw_semaphore *sem) { __down_write_nested(sem, 0); } @@ -239,7 +240,7 @@ void fastcall __sched __down_write(struc /* * trylock for writing -- returns 1 if successful, 0 if contention */ -int fastcall __down_write_trylock(struct rw_semaphore *sem) +int fastcall __down_write_trylock(struct compat_rw_semaphore *sem) { unsigned long flags; int ret = 0; @@ -260,7 +261,7 @@ int fastcall __down_write_trylock(struct /* * release a read lock on the semaphore */ -void fastcall __up_read(struct rw_semaphore *sem) +void fastcall __up_read(struct compat_rw_semaphore *sem) { unsigned long flags; @@ -275,7 +276,7 @@ void fastcall __up_read(struct rw_semaph /* * release a write lock on the semaphore */ -void fastcall __up_write(struct rw_semaphore *sem) +void fastcall __up_write(struct compat_rw_semaphore *sem) { unsigned long flags; @@ -292,7 +293,7 @@ void fastcall __up_write(struct rw_semap * downgrade a write lock into a read lock * - just wake up any readers at the front of the queue */ -void fastcall __downgrade_write(struct rw_semaphore *sem) +void fastcall __downgrade_write(struct compat_rw_semaphore *sem) { unsigned long flags; @@ -305,7 +306,7 @@ void fastcall __downgrade_write(struct r spin_unlock_irqrestore(&sem->wait_lock, flags); } -EXPORT_SYMBOL(__init_rwsem); +EXPORT_SYMBOL(__compat_init_rwsem); EXPORT_SYMBOL(__down_read); EXPORT_SYMBOL(__down_read_trylock); EXPORT_SYMBOL(__down_write_nested); Index: linux-2.6.24-rc8-rt1/lib/rwsem.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/lib/rwsem.c 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/lib/rwsem.c 2008-01-16 22:28:31.000000000 -0500 @@ -11,8 +11,8 @@ /* * Initialize an rwsem: */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) +void __compat_init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -26,7 +26,7 @@ void __init_rwsem(struct rw_semaphore *s INIT_LIST_HEAD(&sem->wait_list); } -EXPORT_SYMBOL(__init_rwsem); +EXPORT_SYMBOL(__compat_init_rwsem); struct rwsem_waiter { struct list_head list; Index: linux-2.6.24-rc8-rt1/lib/semaphore-sleepers.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/lib/semaphore-sleepers.c 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/lib/semaphore-sleepers.c 2008-01-16 22:28:31.000000000 -0500 @@ -15,6 +15,7 @@ #include #include #include +#include #include /* @@ -48,12 +49,12 @@ * we cannot lose wakeup events. */ -fastcall void __up(struct semaphore *sem) +fastcall void __compat_up(struct compat_semaphore *sem) { wake_up(&sem->wait); } -fastcall void __sched __down(struct semaphore * sem) +fastcall void __sched __compat_down(struct compat_semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -90,7 +91,7 @@ fastcall void __sched __down(struct sema tsk->state = TASK_RUNNING; } -fastcall int __sched __down_interruptible(struct semaphore * sem) +fastcall int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -153,7 +154,7 @@ fastcall int __sched __down_interruptibl * single "cmpxchg" without failure cases, * but then it wouldn't work on a 386. */ -fastcall int __down_trylock(struct semaphore * sem) +fastcall int __compat_down_trylock(struct compat_semaphore * sem) { int sleepers; unsigned long flags; @@ -174,3 +175,10 @@ fastcall int __down_trylock(struct semap spin_unlock_irqrestore(&sem->wait.lock, flags); return 1; } + +int fastcall compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} + +EXPORT_SYMBOL(compat_sem_is_locked); Index: linux-2.6.24-rc8-rt1/lib/spinlock_debug.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/lib/spinlock_debug.c 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/lib/spinlock_debug.c 2008-01-16 22:28:31.000000000 -0500 @@ -13,8 +13,8 @@ #include #include -void __spin_lock_init(spinlock_t *lock, const char *name, - struct lock_class_key *key) +void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -23,16 +23,16 @@ void __spin_lock_init(spinlock_t *lock, debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); #endif - lock->raw_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + lock->raw_lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; lock->magic = SPINLOCK_MAGIC; lock->owner = SPINLOCK_OWNER_INIT; lock->owner_cpu = -1; } -EXPORT_SYMBOL(__spin_lock_init); +EXPORT_SYMBOL(__raw_spin_lock_init); -void __rwlock_init(rwlock_t *lock, const char *name, - struct lock_class_key *key) +void __raw_rwlock_init(raw_rwlock_t *lock, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -41,15 +41,15 @@ void __rwlock_init(rwlock_t *lock, const debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); #endif - lock->raw_lock = (raw_rwlock_t) __RAW_RW_LOCK_UNLOCKED; + lock->raw_lock = (__raw_rwlock_t) __RAW_RW_LOCK_UNLOCKED; lock->magic = RWLOCK_MAGIC; lock->owner = SPINLOCK_OWNER_INIT; lock->owner_cpu = -1; } -EXPORT_SYMBOL(__rwlock_init); +EXPORT_SYMBOL(__raw_rwlock_init); -static void spin_bug(spinlock_t *lock, const char *msg) +static void spin_bug(raw_spinlock_t *lock, const char *msg) { struct task_struct *owner = NULL; @@ -73,7 +73,7 @@ static void spin_bug(spinlock_t *lock, c #define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg) static inline void -debug_spin_lock_before(spinlock_t *lock) +debug_spin_lock_before(raw_spinlock_t *lock) { SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); SPIN_BUG_ON(lock->owner == current, lock, "recursion"); @@ -81,13 +81,13 @@ debug_spin_lock_before(spinlock_t *lock) lock, "cpu recursion"); } -static inline void debug_spin_lock_after(spinlock_t *lock) +static inline void debug_spin_lock_after(raw_spinlock_t *lock) { lock->owner_cpu = raw_smp_processor_id(); lock->owner = current; } -static inline void debug_spin_unlock(spinlock_t *lock) +static inline void debug_spin_unlock(raw_spinlock_t *lock) { SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); SPIN_BUG_ON(!spin_is_locked(lock), lock, "already unlocked"); @@ -98,7 +98,7 @@ static inline void debug_spin_unlock(spi lock->owner_cpu = -1; } -static void __spin_lock_debug(spinlock_t *lock) +static void __spin_lock_debug(raw_spinlock_t *lock) { u64 i; u64 loops = loops_per_jiffy * HZ; @@ -125,7 +125,7 @@ static void __spin_lock_debug(spinlock_t } } -void _raw_spin_lock(spinlock_t *lock) +void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) { debug_spin_lock_before(lock); if (unlikely(!__raw_spin_trylock(&lock->raw_lock))) @@ -133,7 +133,7 @@ void _raw_spin_lock(spinlock_t *lock) debug_spin_lock_after(lock); } -int _raw_spin_trylock(spinlock_t *lock) +int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) { int ret = __raw_spin_trylock(&lock->raw_lock); @@ -148,13 +148,13 @@ int _raw_spin_trylock(spinlock_t *lock) return ret; } -void _raw_spin_unlock(spinlock_t *lock) +void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { debug_spin_unlock(lock); __raw_spin_unlock(&lock->raw_lock); } -static void rwlock_bug(rwlock_t *lock, const char *msg) +static void rwlock_bug(raw_rwlock_t *lock, const char *msg) { if (!debug_locks_off()) return; @@ -167,8 +167,8 @@ static void rwlock_bug(rwlock_t *lock, c #define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) -#if 0 /* __write_lock_debug() can lock up - maybe this can too? */ -static void __read_lock_debug(rwlock_t *lock) +#if 1 /* __write_lock_debug() can lock up - maybe this can too? */ +static void __raw_read_lock_debug(raw_rwlock_t *lock) { u64 i; u64 loops = loops_per_jiffy * HZ; @@ -193,13 +193,13 @@ static void __read_lock_debug(rwlock_t * } #endif -void _raw_read_lock(rwlock_t *lock) +void __lockfunc _raw_read_lock(raw_rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - __raw_read_lock(&lock->raw_lock); + __raw_read_lock_debug(lock); } -int _raw_read_trylock(rwlock_t *lock) +int __lockfunc _raw_read_trylock(raw_rwlock_t *lock) { int ret = __raw_read_trylock(&lock->raw_lock); @@ -212,13 +212,13 @@ int _raw_read_trylock(rwlock_t *lock) return ret; } -void _raw_read_unlock(rwlock_t *lock) +void __lockfunc _raw_read_unlock(raw_rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); __raw_read_unlock(&lock->raw_lock); } -static inline void debug_write_lock_before(rwlock_t *lock) +static inline void debug_write_lock_before(raw_rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); @@ -226,13 +226,13 @@ static inline void debug_write_lock_befo lock, "cpu recursion"); } -static inline void debug_write_lock_after(rwlock_t *lock) +static inline void debug_write_lock_after(raw_rwlock_t *lock) { lock->owner_cpu = raw_smp_processor_id(); lock->owner = current; } -static inline void debug_write_unlock(rwlock_t *lock) +static inline void debug_write_unlock(raw_rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); @@ -242,8 +242,8 @@ static inline void debug_write_unlock(rw lock->owner_cpu = -1; } -#if 0 /* This can cause lockups */ -static void __write_lock_debug(rwlock_t *lock) +#if 1 /* This can cause lockups */ +static void __raw_write_lock_debug(raw_rwlock_t *lock) { u64 i; u64 loops = loops_per_jiffy * HZ; @@ -268,14 +268,14 @@ static void __write_lock_debug(rwlock_t } #endif -void _raw_write_lock(rwlock_t *lock) +void __lockfunc _raw_write_lock(raw_rwlock_t *lock) { debug_write_lock_before(lock); - __raw_write_lock(&lock->raw_lock); + __raw_write_lock_debug(lock); debug_write_lock_after(lock); } -int _raw_write_trylock(rwlock_t *lock) +int __lockfunc _raw_write_trylock(raw_rwlock_t *lock) { int ret = __raw_write_trylock(&lock->raw_lock); @@ -290,7 +290,7 @@ int _raw_write_trylock(rwlock_t *lock) return ret; } -void _raw_write_unlock(rwlock_t *lock) +void __lockfunc _raw_write_unlock(raw_rwlock_t *lock) { debug_write_unlock(lock); __raw_write_unlock(&lock->raw_lock); Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/apm_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/apm_32.c 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/apm_32.c 2008-01-16 22:28:32.000000000 -0500 @@ -783,7 +783,7 @@ static int apm_do_idle(void) */ smp_mb(); } - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { idled = 1; ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax); } Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/i386_ksyms_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/i386_ksyms_32.c 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/i386_ksyms_32.c 2008-01-16 22:28:32.000000000 -0500 @@ -4,10 +4,12 @@ #include #include -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_ASM_SEMAPHORES +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); @@ -22,7 +24,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(strstr); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_ASM_SEMAPHORES) extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); EXPORT_SYMBOL(__write_lock_failed); Index: linux-2.6.24-rc8-rt1/arch/x86/lib/semaphore_32.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/lib/semaphore_32.S 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/lib/semaphore_32.S 2008-01-16 22:28:32.000000000 -0500 @@ -30,7 +30,7 @@ * value or just clobbered.. */ .section .sched.text -ENTRY(__down_failed) +ENTRY(__compat_down_failed) CFI_STARTPROC FRAME pushl %edx @@ -39,7 +39,7 @@ ENTRY(__down_failed) pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx,0 - call __down + call __compat_down popl %ecx CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE ecx @@ -49,9 +49,9 @@ ENTRY(__down_failed) ENDFRAME ret CFI_ENDPROC - END(__down_failed) + END(__compat_down_failed) -ENTRY(__down_failed_interruptible) +ENTRY(__compat_down_failed_interruptible) CFI_STARTPROC FRAME pushl %edx @@ -60,7 +60,7 @@ ENTRY(__down_failed_interruptible) pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx,0 - call __down_interruptible + call __compat_down_interruptible popl %ecx CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE ecx @@ -70,9 +70,9 @@ ENTRY(__down_failed_interruptible) ENDFRAME ret CFI_ENDPROC - END(__down_failed_interruptible) + END(__compat_down_failed_interruptible) -ENTRY(__down_failed_trylock) +ENTRY(__compat_down_failed_trylock) CFI_STARTPROC FRAME pushl %edx @@ -81,7 +81,7 @@ ENTRY(__down_failed_trylock) pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx,0 - call __down_trylock + call __compat_down_trylock popl %ecx CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE ecx @@ -91,9 +91,9 @@ ENTRY(__down_failed_trylock) ENDFRAME ret CFI_ENDPROC - END(__down_failed_trylock) + END(__compat_down_failed_trylock) -ENTRY(__up_wakeup) +ENTRY(__compat_up_wakeup) CFI_STARTPROC FRAME pushl %edx @@ -102,7 +102,7 @@ ENTRY(__up_wakeup) pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx,0 - call __up + call __compat_up popl %ecx CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE ecx @@ -112,7 +112,7 @@ ENTRY(__up_wakeup) ENDFRAME ret CFI_ENDPROC - END(__up_wakeup) + END(__compat_up_wakeup) /* * rw spinlock fallbacks Index: linux-2.6.24-rc8-rt1/include/asm-x86/rwsem.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/rwsem.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/rwsem.h 2008-01-16 22:28:32.000000000 -0500 @@ -44,15 +44,15 @@ struct rwsem_waiter; -extern struct rw_semaphore *FASTCALL(rwsem_down_read_failed(struct rw_semaphore *sem)); -extern struct rw_semaphore *FASTCALL(rwsem_down_write_failed(struct rw_semaphore *sem)); -extern struct rw_semaphore *FASTCALL(rwsem_wake(struct rw_semaphore *)); -extern struct rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct rw_semaphore *sem)); +extern struct compat_rw_semaphore *FASTCALL(rwsem_down_read_failed(struct compat_rw_semaphore *sem)); +extern struct compat_rw_semaphore *FASTCALL(rwsem_down_write_failed(struct compat_rw_semaphore *sem)); +extern struct compat_rw_semaphore *FASTCALL(rwsem_wake(struct compat_rw_semaphore *)); +extern struct compat_rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct compat_rw_semaphore *sem)); /* * the semaphore definition */ -struct rw_semaphore { +struct compat_rw_semaphore { signed long count; #define RWSEM_UNLOCKED_VALUE 0x00000000 #define RWSEM_ACTIVE_BIAS 0x00000001 @@ -78,23 +78,23 @@ struct rw_semaphore { { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define COMPAT_DECLARE_RWSEM(name) \ + struct compat_rw_semaphore name = __RWSEM_INITIALIZER(name) -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, +extern void __compat_init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key); -#define init_rwsem(sem) \ +#define compat_init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __compat_init_rwsem((sem), #sem, &__key); \ } while (0) /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct compat_rw_semaphore *sem) { __asm__ __volatile__( "# beginning down_read\n\t" @@ -111,7 +111,7 @@ LOCK_PREFIX " incl (%%eax)\n\t" /* /* * trylock for reading -- returns 1 if successful, 0 if contention */ -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct compat_rw_semaphore *sem) { __s32 result, tmp; __asm__ __volatile__( @@ -134,7 +134,8 @@ LOCK_PREFIX " cmpxchgl %2,%0\n\t" /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void +__down_write_nested(struct compat_rw_semaphore *sem, int subclass) { int tmp; @@ -160,7 +161,7 @@ static inline void __down_write(struct r /* * trylock for writing -- returns 1 if successful, 0 if contention */ -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct compat_rw_semaphore *sem) { signed long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, @@ -173,7 +174,7 @@ static inline int __down_write_trylock(s /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct compat_rw_semaphore *sem) { __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; __asm__ __volatile__( @@ -191,7 +192,7 @@ LOCK_PREFIX " xadd %%edx,(%%eax)\n /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct compat_rw_semaphore *sem) { __asm__ __volatile__( "# beginning __up_write\n\t" @@ -209,7 +210,7 @@ LOCK_PREFIX " xaddl %%edx,(%%eax)\n /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct compat_rw_semaphore *sem) { __asm__ __volatile__( "# beginning __downgrade_write\n\t" @@ -226,7 +227,7 @@ LOCK_PREFIX " addl %2,(%%eax)\n\t" /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem) { __asm__ __volatile__( LOCK_PREFIX "addl %1,%0" @@ -237,7 +238,7 @@ LOCK_PREFIX "addl %1,%0" /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem) { int tmp = delta; @@ -249,7 +250,7 @@ LOCK_PREFIX "xadd %0,%1" return tmp+delta; } -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int compat_rwsem_is_locked(struct rw_semaphore *sem) { return (sem->count != 0); } Index: linux-2.6.24-rc8-rt1/include/asm-x86/semaphore_32.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/semaphore_32.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/semaphore_32.h 2008-01-16 22:28:32.000000000 -0500 @@ -3,8 +3,6 @@ #include -#ifdef __KERNEL__ - /* * SMP- and interrupt-safe semaphores.. * @@ -41,29 +39,39 @@ #include #include -struct semaphore { +/* + * On !PREEMPT_RT all semaphores are compat: + */ +#ifndef CONFIG_PREEMPT_RT +# define compat_semaphore semaphore +#endif + +struct compat_semaphore { atomic_t count; int sleepers; wait_queue_head_t wait; }; -#define __SEMAPHORE_INITIALIZER(name, n) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .sleepers = 0, \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ } -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) +#define __COMPAT_MUTEX_INITIALIZER(name) \ + __COMPAT_SEMAPHORE_INITIALIZER(name,1) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -static inline void sema_init (struct semaphore *sem, int val) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1) + +static inline void compat_sema_init (struct compat_semaphore *sem, int val) { /* - * *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val); + * *sem = (struct compat_semaphore)__SEMAPHORE_INITIALIZER((*sem),val); * * i'd rather use the more flexible initialization above, but sadly * GCC 2.7.2.3 emits a bogus warning. EGCS doesn't. Oh well. @@ -73,27 +81,27 @@ static inline void sema_init (struct sem init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX (struct semaphore *sem) +static inline void compat_init_MUTEX (struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 1); } -static inline void init_MUTEX_LOCKED (struct semaphore *sem) +static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem) { - sema_init(sem, 0); + compat_sema_init(sem, 0); } -fastcall void __down_failed(void /* special register calling convention */); -fastcall int __down_failed_interruptible(void /* params in registers */); -fastcall int __down_failed_trylock(void /* params in registers */); -fastcall void __up_wakeup(void /* special register calling convention */); +fastcall void __compat_down_failed(void /* special register calling convention */); +fastcall int __compat_down_failed_interruptible(void /* params in registers */); +fastcall int __compat_down_failed_trylock(void /* params in registers */); +fastcall void __compat_up_wakeup(void /* special register calling convention */); /* * This is ugly, but we want the default case to fall through. * "__down_failed" is a special asm handler that calls the C * routine that actually waits. See arch/i386/kernel/semaphore.c */ -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); __asm__ __volatile__( @@ -101,7 +109,7 @@ static inline void down(struct semaphore LOCK_PREFIX "decl %0\n\t" /* --sem->count */ "jns 2f\n" "\tlea %0,%%eax\n\t" - "call __down_failed\n" + "call __compat_down_failed\n" "2:" :"+m" (sem->count) : @@ -112,7 +120,7 @@ static inline void down(struct semaphore * Interruptible try to acquire a semaphore. If we obtained * it, return zero. If we were interrupted, returns -EINTR */ -static inline int down_interruptible(struct semaphore * sem) +static inline int compat_down_interruptible(struct compat_semaphore * sem) { int result; @@ -123,7 +131,7 @@ static inline int down_interruptible(str LOCK_PREFIX "decl %1\n\t" /* --sem->count */ "jns 2f\n\t" "lea %1,%%eax\n\t" - "call __down_failed_interruptible\n" + "call __compat_down_failed_interruptible\n" "2:" :"=&a" (result), "+m" (sem->count) : @@ -135,7 +143,7 @@ static inline int down_interruptible(str * Non-blockingly attempt to down() a semaphore. * Returns zero if we acquired it */ -static inline int down_trylock(struct semaphore * sem) +static inline int compat_down_trylock(struct compat_semaphore * sem) { int result; @@ -145,7 +153,7 @@ static inline int down_trylock(struct se LOCK_PREFIX "decl %1\n\t" /* --sem->count */ "jns 2f\n\t" "lea %1,%%eax\n\t" - "call __down_failed_trylock\n\t" + "call __compat_down_failed_trylock\n\t" "2:\n" :"=&a" (result), "+m" (sem->count) : @@ -157,19 +165,24 @@ static inline int down_trylock(struct se * Note! This is subtle. We jump to wake people up only if * the semaphore was negative (== somebody was waiting on it). */ -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { __asm__ __volatile__( "# atomic up operation\n\t" LOCK_PREFIX "incl %0\n\t" /* ++sem->count */ "jg 1f\n\t" "lea %0,%%eax\n\t" - "call __up_wakeup\n" + "call __compat_up_wakeup\n" "1:" :"+m" (sem->count) : :"memory","ax"); } -#endif +extern int FASTCALL(compat_sem_is_locked(struct compat_semaphore *sem)); + +#define compat_sema_count(sem) atomic_read(&(sem)->count) + +#include + #endif Index: linux-2.6.24-rc8-rt1/include/asm-x86/spinlock_32.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/spinlock_32.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/spinlock_32.h 2008-01-16 22:28:32.000000000 -0500 @@ -27,12 +27,12 @@ * (the type definitions are in asm/spinlock_types.h) */ -static inline int __raw_spin_is_locked(raw_spinlock_t *x) +static inline int __raw_spin_is_locked(__raw_spinlock_t *x) { return *(volatile signed char *)(&(x)->slock) <= 0; } -static inline void __raw_spin_lock(raw_spinlock_t *lock) +static inline void __raw_spin_lock(__raw_spinlock_t *lock) { asm volatile("\n1:\t" LOCK_PREFIX " ; decb %0\n\t" @@ -55,7 +55,7 @@ static inline void __raw_spin_lock(raw_s * irq-traced, but on CONFIG_TRACE_IRQFLAGS we never use this variant. */ #ifndef CONFIG_PROVE_LOCKING -static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) +static inline void __raw_spin_lock_flags(__raw_spinlock_t *lock, unsigned long flags) { asm volatile( "\n1:\t" @@ -84,7 +84,7 @@ static inline void __raw_spin_lock_flags } #endif -static inline int __raw_spin_trylock(raw_spinlock_t *lock) +static inline int __raw_spin_trylock(__raw_spinlock_t *lock) { char oldval; asm volatile( @@ -103,14 +103,14 @@ static inline int __raw_spin_trylock(raw #if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE) -static inline void __raw_spin_unlock(raw_spinlock_t *lock) +static inline void __raw_spin_unlock(__raw_spinlock_t *lock) { asm volatile("movb $1,%0" : "+m" (lock->slock) :: "memory"); } #else -static inline void __raw_spin_unlock(raw_spinlock_t *lock) +static inline void __raw_spin_unlock(__raw_spinlock_t *lock) { char oldval = 1; @@ -121,7 +121,7 @@ static inline void __raw_spin_unlock(raw #endif -static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) +static inline void __raw_spin_unlock_wait(__raw_spinlock_t *lock) { while (__raw_spin_is_locked(lock)) cpu_relax(); @@ -152,7 +152,7 @@ static inline void __raw_spin_unlock_wai * read_can_lock - would read_trylock() succeed? * @lock: the rwlock in question. */ -static inline int __raw_read_can_lock(raw_rwlock_t *x) +static inline int __raw_read_can_lock(__raw_rwlock_t *x) { return (int)(x)->lock > 0; } @@ -161,12 +161,12 @@ static inline int __raw_read_can_lock(ra * write_can_lock - would write_trylock() succeed? * @lock: the rwlock in question. */ -static inline int __raw_write_can_lock(raw_rwlock_t *x) +static inline int __raw_write_can_lock(__raw_rwlock_t *x) { return (x)->lock == RW_LOCK_BIAS; } -static inline void __raw_read_lock(raw_rwlock_t *rw) +static inline void __raw_read_lock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" "jns 1f\n" @@ -175,7 +175,7 @@ static inline void __raw_read_lock(raw_r ::"a" (rw) : "memory"); } -static inline void __raw_write_lock(raw_rwlock_t *rw) +static inline void __raw_write_lock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX " subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" "jz 1f\n" @@ -184,7 +184,7 @@ static inline void __raw_write_lock(raw_ ::"a" (rw) : "memory"); } -static inline int __raw_read_trylock(raw_rwlock_t *lock) +static inline int __raw_read_trylock(__raw_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; atomic_dec(count); @@ -194,7 +194,7 @@ static inline int __raw_read_trylock(raw return 0; } -static inline int __raw_write_trylock(raw_rwlock_t *lock) +static inline int __raw_write_trylock(__raw_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; if (atomic_sub_and_test(RW_LOCK_BIAS, count)) @@ -203,19 +203,19 @@ static inline int __raw_write_trylock(ra return 0; } -static inline void __raw_read_unlock(raw_rwlock_t *rw) +static inline void __raw_read_unlock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); } -static inline void __raw_write_unlock(raw_rwlock_t *rw) +static inline void __raw_write_unlock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX "addl $" RW_LOCK_BIAS_STR ", %0" : "+m" (rw->lock) : : "memory"); } -#define _raw_spin_relax(lock) cpu_relax() -#define _raw_read_relax(lock) cpu_relax() -#define _raw_write_relax(lock) cpu_relax() +#define __raw_spin_relax(lock) cpu_relax() +#define __raw_read_relax(lock) cpu_relax() +#define __raw_write_relax(lock) cpu_relax() #endif /* __ASM_SPINLOCK_H */ Index: linux-2.6.24-rc8-rt1/include/asm-x86/spinlock_types.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/spinlock_types.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/spinlock_types.h 2008-01-16 22:28:32.000000000 -0500 @@ -7,13 +7,13 @@ typedef struct { unsigned int slock; -} raw_spinlock_t; +} __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { 1 } typedef struct { unsigned int lock; -} raw_rwlock_t; +} __raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } Index: linux-2.6.24-rc8-rt1/include/asm-x86/thread_info_32.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/thread_info_32.h 2008-01-16 22:27:38.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/thread_info_32.h 2008-01-16 22:28:32.000000000 -0500 @@ -132,15 +132,18 @@ static inline struct thread_info *curren #define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */ #define TIF_SECCOMP 7 /* secure computing */ #define TIF_RESTORE_SIGMASK 8 /* restore signal mask in do_signal() */ +#define TIF_NEED_RESCHED_DELAYED 10 /* reschedule on return to userspace */ #define TIF_MEMDIE 16 #define TIF_DEBUG 17 /* uses debug registers */ #define TIF_IO_BITMAP 18 /* uses I/O bitmap */ #define TIF_FREEZE 19 /* is freezing for suspend */ #define TIF_NOTSC 20 /* TSC is not accessible in userland */ + #define _TIF_SYSCALL_TRACE (1<counter) : "Ir" (i), "m" (v->counter) : "memory"); - } else { + } +#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT) + else { unsigned long flags; raw_local_irq_save(flags); @@ -180,6 +182,7 @@ static __inline__ int atomic_add_return( v->counter = result; raw_local_irq_restore(flags); } +#endif smp_llsc_mb(); @@ -223,7 +226,9 @@ static __inline__ int atomic_sub_return( : "=&r" (result), "=&r" (temp), "=m" (v->counter) : "Ir" (i), "m" (v->counter) : "memory"); - } else { + } +#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT) + else { unsigned long flags; raw_local_irq_save(flags); @@ -232,6 +237,7 @@ static __inline__ int atomic_sub_return( v->counter = result; raw_local_irq_restore(flags); } +#endif smp_llsc_mb(); @@ -291,7 +297,9 @@ static __inline__ int atomic_sub_if_posi : "=&r" (result), "=&r" (temp), "=m" (v->counter) : "Ir" (i), "m" (v->counter) : "memory"); - } else { + } +#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT) + else { unsigned long flags; raw_local_irq_save(flags); @@ -301,6 +309,7 @@ static __inline__ int atomic_sub_if_posi v->counter = result; raw_local_irq_restore(flags); } +#endif smp_llsc_mb(); @@ -552,7 +561,9 @@ static __inline__ long atomic64_add_retu : "=&r" (result), "=&r" (temp), "=m" (v->counter) : "Ir" (i), "m" (v->counter) : "memory"); - } else { + } +#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT) + else { unsigned long flags; raw_local_irq_save(flags); @@ -561,6 +572,7 @@ static __inline__ long atomic64_add_retu v->counter = result; raw_local_irq_restore(flags); } +#endif smp_llsc_mb(); @@ -604,7 +616,9 @@ static __inline__ long atomic64_sub_retu : "=&r" (result), "=&r" (temp), "=m" (v->counter) : "Ir" (i), "m" (v->counter) : "memory"); - } else { + } +#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT) + else { unsigned long flags; raw_local_irq_save(flags); @@ -682,6 +696,7 @@ static __inline__ long atomic64_sub_if_p v->counter = result; raw_local_irq_restore(flags); } +#endif smp_llsc_mb(); Index: linux-2.6.24-rc8-rt1/include/asm-mips/semaphore.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-mips/semaphore.h 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-mips/semaphore.h 2008-01-16 22:28:41.000000000 -0500 @@ -24,12 +24,20 @@ #ifdef __KERNEL__ -#include -#include #include #include -struct semaphore { +/* + * On !PREEMPT_RT all semaphores are compat: + */ +#ifndef CONFIG_PREEMPT_RT +# define compat_semaphore semaphore +#endif + +#include +#include + +struct compat_semaphore { /* * Note that any negative value of count is equivalent to 0, * but additionally indicates that some process(es) might be @@ -39,38 +47,41 @@ struct semaphore { wait_queue_head_t wait; }; -#define __SEMAPHORE_INITIALIZER(name, n) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ } -#define __DECLARE_SEMAPHORE_GENERIC(name, count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name, count) +#define __COMPAT_MUTEX_INITIALIZER(name) \ + __COMPAT_SEMAPHORE_INITIALIZER(name, 1) + +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name, 1) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 1) -static inline void sema_init(struct semaphore *sem, int val) +static inline void compat_sema_init (struct compat_semaphore *sem, int val) { atomic_set(&sem->count, val); init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX(struct semaphore *sem) +static inline void compat_init_MUTEX (struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 1); } -static inline void init_MUTEX_LOCKED(struct semaphore *sem) +static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem) { - sema_init(sem, 0); + compat_sema_init(sem, 0); } -extern void __down(struct semaphore * sem); -extern int __down_interruptible(struct semaphore * sem); -extern void __up(struct semaphore * sem); +extern void __compat_down(struct compat_semaphore * sem); +extern int __compat_down_interruptible(struct compat_semaphore * sem); +extern void __compat_up(struct compat_semaphore * sem); -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); @@ -78,31 +89,37 @@ static inline void down(struct semaphore * Try to get the semaphore, take the slow path if we fail. */ if (unlikely(atomic_dec_return(&sem->count) < 0)) - __down(sem); + __compat_down(sem); } -static inline int down_interruptible(struct semaphore * sem) +static inline int compat_down_interruptible(struct compat_semaphore * sem) { int ret = 0; might_sleep(); if (unlikely(atomic_dec_return(&sem->count) < 0)) - ret = __down_interruptible(sem); + ret = __compat_down_interruptible(sem); return ret; } -static inline int down_trylock(struct semaphore * sem) +static inline int compat_down_trylock(struct compat_semaphore * sem) { return atomic_dec_if_positive(&sem->count) < 0; } -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { if (unlikely(atomic_inc_return(&sem->count) <= 0)) - __up(sem); + __compat_up(sem); } +extern int compat_sem_is_locked(struct compat_semaphore *sem); + +#define compat_sema_count(sem) atomic_read(&(sem)->count) + +#include + #endif /* __KERNEL__ */ #endif /* __ASM_SEMAPHORE_H */ Index: linux-2.6.24-rc8-rt1/arch/powerpc/kernel/Makefile =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/kernel/Makefile 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/kernel/Makefile 2008-01-16 22:28:32.000000000 -0500 @@ -10,11 +10,12 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif -obj-y := semaphore.o cputable.o ptrace.o syscalls.o \ +obj-y := cputable.o ptrace.o syscalls.o \ irq.o align.o signal_32.o pmc.o vdso.o \ init_task.o process.o systbl.o idle.o \ signal.o obj-y += vdso32/ +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-$(CONFIG_PPC64) += setup_64.o binfmt_elf32.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o cpu_setup_ppc970.o \ Index: linux-2.6.24-rc8-rt1/arch/powerpc/kernel/semaphore.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/kernel/semaphore.c 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/kernel/semaphore.c 2008-01-16 22:28:32.000000000 -0500 @@ -31,7 +31,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -50,7 +50,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -63,7 +63,7 @@ void __up(struct semaphore *sem) __sem_update_count(sem, 1); wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -73,7 +73,7 @@ EXPORT_SYMBOL(__up); * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. */ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -101,9 +101,9 @@ void __sched __down(struct semaphore *se */ wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore *sem) { int retval = 0; struct task_struct *tsk = current; @@ -132,4 +132,10 @@ int __sched __down_interruptible(struct wake_up(&sem->wait); return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} +EXPORT_SYMBOL(compat_sem_is_locked); Index: linux-2.6.24-rc8-rt1/arch/powerpc/lib/locks.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/lib/locks.c 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/lib/locks.c 2008-01-16 22:29:15.000000000 -0500 @@ -25,7 +25,7 @@ #include #include -void __spin_yield(raw_spinlock_t *lock) +void __spin_yield(__raw_spinlock_t *lock) { unsigned int lock_value, holder_cpu, yield_count; @@ -55,7 +55,7 @@ void __spin_yield(raw_spinlock_t *lock) * This turns out to be the same for read and write locks, since * we only know the holder if it is write-locked. */ -void __rw_yield(raw_rwlock_t *rw) +void __rw_yield(__raw_rwlock_t *rw) { int lock_value; unsigned int holder_cpu, yield_count; @@ -82,7 +82,7 @@ void __rw_yield(raw_rwlock_t *rw) } #endif -void __raw_spin_unlock_wait(raw_spinlock_t *lock) +void __raw_spin_unlock_wait(__raw_spinlock_t *lock) { while (lock->slock) { HMT_low(); Index: linux-2.6.24-rc8-rt1/arch/ppc/Kconfig =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/Kconfig 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/Kconfig 2008-01-16 22:28:32.000000000 -0500 @@ -16,13 +16,6 @@ config GENERIC_HARDIRQS bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config ARCH_HAS_ILOG2_U32 bool default y @@ -979,6 +972,18 @@ config ARCH_POPULATES_NODE_MAP source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "mm/Kconfig" source "fs/Kconfig.binfmt" Index: linux-2.6.24-rc8-rt1/arch/ppc/kernel/entry.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/kernel/entry.S 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/kernel/entry.S 2008-01-16 22:28:32.000000000 -0500 @@ -892,7 +892,7 @@ global_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -906,7 +906,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,18 lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. r0,r9,_TIF_SIGPENDING beq restore_user Index: linux-2.6.24-rc8-rt1/arch/ppc/kernel/semaphore.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/kernel/semaphore.c 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/kernel/semaphore.c 2008-01-16 22:28:32.000000000 -0500 @@ -29,7 +29,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -48,7 +48,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -70,7 +70,7 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. */ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -100,7 +100,7 @@ void __sched __down(struct semaphore *se wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -129,3 +129,8 @@ int __sched __down_interruptible(struct wake_up(&sem->wait); return retval; } + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} Index: linux-2.6.24-rc8-rt1/arch/ppc/lib/locks.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/lib/locks.c 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/lib/locks.c 2008-01-16 22:28:32.000000000 -0500 @@ -42,7 +42,7 @@ static inline unsigned long __spin_trylo return ret; } -void _raw_spin_lock(spinlock_t *lock) +void __raw_spin_lock(raw_spinlock_t *lock) { int cpu = smp_processor_id(); unsigned int stuck = INIT_STUCK; @@ -62,9 +62,9 @@ void _raw_spin_lock(spinlock_t *lock) lock->owner_pc = (unsigned long)__builtin_return_address(0); lock->owner_cpu = cpu; } -EXPORT_SYMBOL(_raw_spin_lock); +EXPORT_SYMBOL(__raw_spin_lock); -int _raw_spin_trylock(spinlock_t *lock) +int __raw_spin_trylock(raw_spinlock_t *lock) { if (__spin_trylock(&lock->lock)) return 0; @@ -72,9 +72,9 @@ int _raw_spin_trylock(spinlock_t *lock) lock->owner_pc = (unsigned long)__builtin_return_address(0); return 1; } -EXPORT_SYMBOL(_raw_spin_trylock); +EXPORT_SYMBOL(__raw_spin_trylock); -void _raw_spin_unlock(spinlock_t *lp) +void __raw_spin_unlock(raw_spinlock_t *lp) { if ( !lp->lock ) printk("_spin_unlock(%p): no lock cpu %d curr PC %p %s/%d\n", @@ -88,13 +88,13 @@ void _raw_spin_unlock(spinlock_t *lp) wmb(); lp->lock = 0; } -EXPORT_SYMBOL(_raw_spin_unlock); +EXPORT_SYMBOL(__raw_spin_unlock); /* * For rwlocks, zero is unlocked, -1 is write-locked, * positive is read-locked. */ -static __inline__ int __read_trylock(rwlock_t *rw) +static __inline__ int __read_trylock(raw_rwlock_t *rw) { signed int tmp; @@ -114,13 +114,13 @@ static __inline__ int __read_trylock(rwl return tmp; } -int _raw_read_trylock(rwlock_t *rw) +int __raw_read_trylock(raw_rwlock_t *rw) { return __read_trylock(rw) > 0; } -EXPORT_SYMBOL(_raw_read_trylock); +EXPORT_SYMBOL(__raw_read_trylock); -void _raw_read_lock(rwlock_t *rw) +void __raw_read_lock(rwlock_t *rw) { unsigned int stuck; @@ -135,9 +135,9 @@ void _raw_read_lock(rwlock_t *rw) } } } -EXPORT_SYMBOL(_raw_read_lock); +EXPORT_SYMBOL(__raw_read_lock); -void _raw_read_unlock(rwlock_t *rw) +void __raw_read_unlock(raw_rwlock_t *rw) { if ( rw->lock == 0 ) printk("_read_unlock(): %s/%d (nip %08lX) lock %d\n", @@ -146,9 +146,9 @@ void _raw_read_unlock(rwlock_t *rw) wmb(); atomic_dec((atomic_t *) &(rw)->lock); } -EXPORT_SYMBOL(_raw_read_unlock); +EXPORT_SYMBOL(__raw_read_unlock); -void _raw_write_lock(rwlock_t *rw) +void __raw_write_lock(raw_rwlock_t *rw) { unsigned int stuck; @@ -164,18 +164,18 @@ void _raw_write_lock(rwlock_t *rw) } wmb(); } -EXPORT_SYMBOL(_raw_write_lock); +EXPORT_SYMBOL(__raw_write_lock); -int _raw_write_trylock(rwlock_t *rw) +int __raw_write_trylock(raw_rwlock_t *rw) { if (cmpxchg(&rw->lock, 0, -1) != 0) return 0; wmb(); return 1; } -EXPORT_SYMBOL(_raw_write_trylock); +EXPORT_SYMBOL(__raw_write_trylock); -void _raw_write_unlock(rwlock_t *rw) +void __raw_write_unlock(raw_rwlock_t *rw) { if (rw->lock >= 0) printk("_write_lock(): %s/%d (nip %08lX) lock %d\n", @@ -184,6 +184,6 @@ void _raw_write_unlock(rwlock_t *rw) wmb(); rw->lock = 0; } -EXPORT_SYMBOL(_raw_write_unlock); +EXPORT_SYMBOL(__raw_write_unlock); #endif Index: linux-2.6.24-rc8-rt1/drivers/macintosh/adb.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/macintosh/adb.c 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/macintosh/adb.c 2008-01-16 22:28:32.000000000 -0500 @@ -250,6 +250,8 @@ adb_probe_task(void *x) { strcpy(current->comm, "kadbprobe"); + down(&adb_probe_mutex); + printk(KERN_INFO "adb: starting probe task...\n"); do_adb_reset_bus(); printk(KERN_INFO "adb: finished probe task...\n"); @@ -276,7 +278,9 @@ adb_reset_bus(void) return 0; } - down(&adb_probe_mutex); + if (adb_got_sleep) + return 0; + schedule_work(&adb_reset_work); return 0; } @@ -339,9 +343,8 @@ adb_notify_sleep(struct pmu_sleep_notifi { switch (when) { case PBOOK_SLEEP_REQUEST: + /* Signal to discontiue probing */ adb_got_sleep = 1; - /* We need to get a lock on the probe thread */ - down(&adb_probe_mutex); /* Stop autopoll */ if (adb_controller->autopoll) adb_controller->autopoll(0); @@ -350,7 +353,6 @@ adb_notify_sleep(struct pmu_sleep_notifi break; case PBOOK_WAKE: adb_got_sleep = 0; - up(&adb_probe_mutex); adb_reset_bus(); break; } Index: linux-2.6.24-rc8-rt1/include/asm-powerpc/rwsem.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-powerpc/rwsem.h 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-powerpc/rwsem.h 2008-01-16 22:28:32.000000000 -0500 @@ -21,7 +21,7 @@ /* * the semaphore definition */ -struct rw_semaphore { +struct compat_rw_semaphore { /* XXX this should be able to be an atomic_t -- paulus */ signed int count; #define RWSEM_UNLOCKED_VALUE 0x00000000 @@ -30,7 +30,7 @@ struct rw_semaphore { #define RWSEM_WAITING_BIAS (-0x00010000) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - spinlock_t wait_lock; + raw_spinlock_t wait_lock; struct list_head wait_list; }; @@ -38,15 +38,15 @@ struct rw_semaphore { { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ LIST_HEAD_INIT((name).wait_list) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define COMPAT_DECLARE_RWSEM(name) \ + struct compat_rw_semaphore name = __RWSEM_INITIALIZER(name) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_down_read_failed(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_down_write_failed(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_wake(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_downgrade_wake(struct compat_rw_semaphore *sem); -static inline void init_rwsem(struct rw_semaphore *sem) +static inline void compat_init_rwsem(struct compat_rw_semaphore *sem) { sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); @@ -56,13 +56,13 @@ static inline void init_rwsem(struct rw_ /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct compat_rw_semaphore *sem) { if (unlikely(atomic_inc_return((atomic_t *)(&sem->count)) <= 0)) rwsem_down_read_failed(sem); } -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct compat_rw_semaphore *sem) { int tmp; @@ -78,7 +78,7 @@ static inline int __down_read_trylock(st /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write(struct compat_rw_semaphore *sem) { int tmp; @@ -88,7 +88,7 @@ static inline void __down_write(struct r rwsem_down_write_failed(sem); } -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct compat_rw_semaphore *sem) { int tmp; @@ -100,7 +100,7 @@ static inline int __down_write_trylock(s /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct compat_rw_semaphore *sem) { int tmp; @@ -112,7 +112,7 @@ static inline void __up_read(struct rw_s /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct compat_rw_semaphore *sem) { if (unlikely(atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, (atomic_t *)(&sem->count)) < 0)) @@ -122,7 +122,7 @@ static inline void __up_write(struct rw_ /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem) { atomic_add(delta, (atomic_t *)(&sem->count)); } @@ -130,7 +130,7 @@ static inline void rwsem_atomic_add(int /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct compat_rw_semaphore *sem) { int tmp; @@ -142,12 +142,12 @@ static inline void __downgrade_write(str /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem) { return atomic_add_return(delta, (atomic_t *)(&sem->count)); } -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int compat_rwsem_is_locked(struct compat_rw_semaphore *sem) { return (sem->count != 0); } Index: linux-2.6.24-rc8-rt1/include/asm-powerpc/semaphore.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-powerpc/semaphore.h 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-powerpc/semaphore.h 2008-01-16 22:28:32.000000000 -0500 @@ -15,48 +15,58 @@ #include #include -struct semaphore { +/* + * On !PREEMPT_RT all sempahores are compat + */ +#ifndef CONFIG_PREEMPT_RT +# define compat_semaphore semaphore +#endif + +struct compat_semaphore { /* * Note that any negative value of count is equivalent to 0, * but additionally indicates that some process(es) might be * sleeping on `wait'. */ atomic_t count; + int sleepers; wait_queue_head_t wait; }; -#define __SEMAPHORE_INITIALIZER(name, n) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ } -#define __DECLARE_SEMAPHORE_GENERIC(name, count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name, 1) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 1) -static inline void sema_init (struct semaphore *sem, int val) +static inline void compat_sema_init (struct compat_semaphore *sem, int val) { atomic_set(&sem->count, val); init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX (struct semaphore *sem) +static inline void compat_init_MUTEX (struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 1); } -static inline void init_MUTEX_LOCKED (struct semaphore *sem) +static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem) { - sema_init(sem, 0); + compat_sema_init(sem, 0); } -extern void __down(struct semaphore * sem); -extern int __down_interruptible(struct semaphore * sem); -extern void __up(struct semaphore * sem); +extern void __compat_down(struct compat_semaphore * sem); +extern int __compat_down_interruptible(struct compat_semaphore * sem); +extern void __compat_up(struct compat_semaphore * sem); + +extern int compat_sem_is_locked(struct compat_semaphore *sem); -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); @@ -64,31 +74,35 @@ static inline void down(struct semaphore * Try to get the semaphore, take the slow path if we fail. */ if (unlikely(atomic_dec_return(&sem->count) < 0)) - __down(sem); + __compat_down(sem); } -static inline int down_interruptible(struct semaphore * sem) +static inline int compat_down_interruptible(struct compat_semaphore * sem) { int ret = 0; might_sleep(); if (unlikely(atomic_dec_return(&sem->count) < 0)) - ret = __down_interruptible(sem); + ret = __compat_down_interruptible(sem); return ret; } -static inline int down_trylock(struct semaphore * sem) +static inline int compat_down_trylock(struct compat_semaphore * sem) { return atomic_dec_if_positive(&sem->count) < 0; } -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { if (unlikely(atomic_inc_return(&sem->count) <= 0)) - __up(sem); + __compat_up(sem); } +#define compat_sema_count(sem) atomic_read(&(sem)->count) + +#include + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_SEMAPHORE_H */ Index: linux-2.6.24-rc8-rt1/include/asm-powerpc/spinlock.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-powerpc/spinlock.h 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-powerpc/spinlock.h 2008-01-16 22:28:45.000000000 -0500 @@ -53,7 +53,7 @@ * This returns the old value in the lock, so we succeeded * in getting the lock if the return value is 0. */ -static __inline__ unsigned long __spin_trylock(raw_spinlock_t *lock) +static __inline__ unsigned long ___raw_spin_trylock(__raw_spinlock_t *lock) { unsigned long tmp, token; @@ -72,10 +72,10 @@ static __inline__ unsigned long __spin_t return tmp; } -static int __inline__ __raw_spin_trylock(raw_spinlock_t *lock) +static int __inline__ __raw_spin_trylock(__raw_spinlock_t *lock) { CLEAR_IO_SYNC; - return __spin_trylock(lock) == 0; + return ___raw_spin_trylock(lock) == 0; } /* @@ -95,19 +95,19 @@ static int __inline__ __raw_spin_trylock #if defined(CONFIG_PPC_SPLPAR) || defined(CONFIG_PPC_ISERIES) /* We only yield to the hypervisor if we are in shared processor mode */ #define SHARED_PROCESSOR (get_lppaca()->shared_proc) -extern void __spin_yield(raw_spinlock_t *lock); -extern void __rw_yield(raw_rwlock_t *lock); +extern void __spin_yield(__raw_spinlock_t *lock); +extern void __rw_yield(__raw_rwlock_t *lock); #else /* SPLPAR || ISERIES */ #define __spin_yield(x) barrier() #define __rw_yield(x) barrier() #define SHARED_PROCESSOR 0 #endif -static void __inline__ __raw_spin_lock(raw_spinlock_t *lock) +static void __inline__ __raw_spin_lock(__raw_spinlock_t *lock) { CLEAR_IO_SYNC; while (1) { - if (likely(__spin_trylock(lock) == 0)) + if (likely(___raw_spin_trylock(lock) == 0)) break; do { HMT_low(); @@ -118,13 +118,13 @@ static void __inline__ __raw_spin_lock(r } } -static void __inline__ __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) +static void __inline__ __raw_spin_lock_flags(__raw_spinlock_t *lock, unsigned long flags) { unsigned long flags_dis; CLEAR_IO_SYNC; while (1) { - if (likely(__spin_trylock(lock) == 0)) + if (likely(___raw_spin_trylock(lock) == 0)) break; local_save_flags(flags_dis); local_irq_restore(flags); @@ -138,7 +138,7 @@ static void __inline__ __raw_spin_lock_f } } -static __inline__ void __raw_spin_unlock(raw_spinlock_t *lock) +static __inline__ void __raw_spin_unlock(__raw_spinlock_t *lock) { SYNC_IO; __asm__ __volatile__("# __raw_spin_unlock\n\t" @@ -147,7 +147,7 @@ static __inline__ void __raw_spin_unlock } #ifdef CONFIG_PPC64 -extern void __raw_spin_unlock_wait(raw_spinlock_t *lock); +extern void __raw_spin_unlock_wait(__raw_spinlock_t *lock); #else #define __raw_spin_unlock_wait(lock) \ do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0) @@ -179,7 +179,7 @@ extern void __raw_spin_unlock_wait(raw_s * This returns the old value in the lock + 1, * so we got a read lock if the return value is > 0. */ -static long __inline__ __read_trylock(raw_rwlock_t *rw) +static long __inline__ ___raw_read_trylock(__raw_rwlock_t *rw) { long tmp; @@ -203,7 +203,7 @@ static long __inline__ __read_trylock(ra * This returns the old value in the lock, * so we got the write lock if the return value is 0. */ -static __inline__ long __write_trylock(raw_rwlock_t *rw) +static __inline__ long ___raw_write_trylock(__raw_rwlock_t *rw) { long tmp, token; @@ -223,10 +223,10 @@ static __inline__ long __write_trylock(r return tmp; } -static void __inline__ __raw_read_lock(raw_rwlock_t *rw) +static void __inline__ __raw_read_lock(__raw_rwlock_t *rw) { while (1) { - if (likely(__read_trylock(rw) > 0)) + if (likely(___raw_read_trylock(rw) > 0)) break; do { HMT_low(); @@ -237,10 +237,10 @@ static void __inline__ __raw_read_lock(r } } -static void __inline__ __raw_write_lock(raw_rwlock_t *rw) +static void __inline__ __raw_write_lock(__raw_rwlock_t *rw) { while (1) { - if (likely(__write_trylock(rw) == 0)) + if (likely(___raw_write_trylock(rw) == 0)) break; do { HMT_low(); @@ -251,17 +251,17 @@ static void __inline__ __raw_write_lock( } } -static int __inline__ __raw_read_trylock(raw_rwlock_t *rw) +static int __inline__ __raw_read_trylock(__raw_rwlock_t *rw) { - return __read_trylock(rw) > 0; + return ___raw_read_trylock(rw) > 0; } -static int __inline__ __raw_write_trylock(raw_rwlock_t *rw) +static int __inline__ __raw_write_trylock(__raw_rwlock_t *rw) { - return __write_trylock(rw) == 0; + return ___raw_write_trylock(rw) == 0; } -static void __inline__ __raw_read_unlock(raw_rwlock_t *rw) +static void __inline__ __raw_read_unlock(__raw_rwlock_t *rw) { long tmp; @@ -278,7 +278,7 @@ static void __inline__ __raw_read_unlock : "cr0", "memory"); } -static __inline__ void __raw_write_unlock(raw_rwlock_t *rw) +static __inline__ void __raw_write_unlock(__raw_rwlock_t *rw) { __asm__ __volatile__("# write_unlock\n\t" LWSYNC_ON_SMP: : :"memory"); @@ -289,5 +289,9 @@ static __inline__ void __raw_write_unloc #define _raw_read_relax(lock) __rw_yield(lock) #define _raw_write_relax(lock) __rw_yield(lock) +#define __raw_spin_relax(lock) cpu_relax() +#define __raw_read_relax(lock) cpu_relax() +#define __raw_write_relax(lock) cpu_relax() + #endif /* __KERNEL__ */ #endif /* __ASM_SPINLOCK_H */ Index: linux-2.6.24-rc8-rt1/include/asm-powerpc/spinlock_types.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-powerpc/spinlock_types.h 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-powerpc/spinlock_types.h 2008-01-16 22:28:32.000000000 -0500 @@ -7,13 +7,13 @@ typedef struct { volatile unsigned int slock; -} raw_spinlock_t; +} __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { 0 } typedef struct { volatile signed int lock; -} raw_rwlock_t; +} __raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { 0 } Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/x8664_ksyms_64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/x8664_ksyms_64.c 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/x8664_ksyms_64.c 2008-01-16 22:28:33.000000000 -0500 @@ -11,10 +11,12 @@ EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); Index: linux-2.6.24-rc8-rt1/include/asm-x86/semaphore_64.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/semaphore_64.h 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/semaphore_64.h 2008-01-16 22:28:33.000000000 -0500 @@ -5,6 +5,10 @@ #ifdef __KERNEL__ +#ifndef CONFIG_PREEMPT_RT +# define compat_semaphore semaphore +#endif + /* * SMP- and interrupt-safe semaphores.. * @@ -43,28 +47,33 @@ #include #include -struct semaphore { +struct compat_semaphore { atomic_t count; int sleepers; wait_queue_head_t wait; }; -#define __SEMAPHORE_INITIALIZER(name, n) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .sleepers = 0, \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ } -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) +#define __COMPAT_MUTEX_INITIALIZER(name) \ + __COMPAT_SEMAPHORE_INITIALIZER(name,1) + +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1) -static inline void sema_init (struct semaphore *sem, int val) +#define compat_sema_count(sem) atomic_read(&(sem)->count) + +static inline void compat_sema_init (struct compat_semaphore *sem, int val) { /* - * *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val); + * *sem = (struct compat_semaphore)__SEMAPHORE_INITIALIZER((*sem),val); * * i'd rather use the more flexible initialization above, but sadly * GCC 2.7.2.3 emits a bogus warning. EGCS doesn't. Oh well. @@ -74,32 +83,33 @@ static inline void sema_init (struct sem init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX (struct semaphore *sem) +static inline void compat_init_MUTEX (struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 1); } -static inline void init_MUTEX_LOCKED (struct semaphore *sem) +static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem) { - sema_init(sem, 0); + compat_sema_init(sem, 0); } -asmlinkage void __down_failed(void /* special register calling convention */); -asmlinkage int __down_failed_interruptible(void /* params in registers */); -asmlinkage int __down_failed_trylock(void /* params in registers */); -asmlinkage void __up_wakeup(void /* special register calling convention */); +asmlinkage void __compat_down_failed(void /* special register calling convention */); +asmlinkage int __compat_down_failed_interruptible(void /* params in registers */); +asmlinkage int __compat_down_failed_trylock(void /* params in registers */); +asmlinkage void __compat_up_wakeup(void /* special register calling convention */); -asmlinkage void __down(struct semaphore * sem); -asmlinkage int __down_interruptible(struct semaphore * sem); -asmlinkage int __down_trylock(struct semaphore * sem); -asmlinkage void __up(struct semaphore * sem); +asmlinkage void __compat_down(struct compat_semaphore * sem); +asmlinkage int __compat_down_interruptible(struct compat_semaphore * sem); +asmlinkage int __compat_down_trylock(struct compat_semaphore * sem); +asmlinkage void __compat_up(struct compat_semaphore * sem); +asmlinkage int compat_sem_is_locked(struct compat_semaphore *sem); /* * This is ugly, but we want the default case to fall through. * "__down_failed" is a special asm handler that calls the C * routine that actually waits. See arch/x86_64/kernel/semaphore.c */ -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); @@ -107,7 +117,7 @@ static inline void down(struct semaphore "# atomic down operation\n\t" LOCK_PREFIX "decl %0\n\t" /* --sem->count */ "jns 1f\n\t" - "call __down_failed\n" + "call __compat_down_failed\n" "1:" :"=m" (sem->count) :"D" (sem) @@ -118,7 +128,7 @@ static inline void down(struct semaphore * Interruptible try to acquire a semaphore. If we obtained * it, return zero. If we were interrupted, returns -EINTR */ -static inline int down_interruptible(struct semaphore * sem) +static inline int compat_down_interruptible(struct compat_semaphore * sem) { int result; @@ -129,7 +139,7 @@ static inline int down_interruptible(str "xorl %0,%0\n\t" LOCK_PREFIX "decl %1\n\t" /* --sem->count */ "jns 2f\n\t" - "call __down_failed_interruptible\n" + "call __compat_down_failed_interruptible\n" "2:\n" :"=&a" (result), "=m" (sem->count) :"D" (sem) @@ -141,7 +151,7 @@ static inline int down_interruptible(str * Non-blockingly attempt to down() a semaphore. * Returns zero if we acquired it */ -static inline int down_trylock(struct semaphore * sem) +static inline int compat_down_trylock(struct compat_semaphore * sem) { int result; @@ -150,7 +160,7 @@ static inline int down_trylock(struct se "xorl %0,%0\n\t" LOCK_PREFIX "decl %1\n\t" /* --sem->count */ "jns 2f\n\t" - "call __down_failed_trylock\n\t" + "call __compat_down_failed_trylock\n\t" "2:\n" :"=&a" (result), "=m" (sem->count) :"D" (sem) @@ -164,17 +174,20 @@ static inline int down_trylock(struct se * The default case (no contention) will result in NO * jumps for both down() and up(). */ -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { __asm__ __volatile__( "# atomic up operation\n\t" LOCK_PREFIX "incl %0\n\t" /* ++sem->count */ "jg 1f\n\t" - "call __up_wakeup\n" + "call __compat_up_wakeup\n" "1:" :"=m" (sem->count) :"D" (sem) :"memory"); } + +#include + #endif /* __KERNEL__ */ #endif Index: linux-2.6.24-rc8-rt1/include/asm-x86/spinlock_64.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/spinlock_64.h 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/spinlock_64.h 2008-01-16 22:28:42.000000000 -0500 @@ -17,12 +17,12 @@ * (the type definitions are in asm/spinlock_types.h) */ -static inline int __raw_spin_is_locked(raw_spinlock_t *lock) +static inline int __raw_spin_is_locked(__raw_spinlock_t *lock) { return *(volatile signed int *)(&(lock)->slock) <= 0; } -static inline void __raw_spin_lock(raw_spinlock_t *lock) +static inline void __raw_spin_lock(__raw_spinlock_t *lock) { asm volatile( "\n1:\t" @@ -40,7 +40,7 @@ static inline void __raw_spin_lock(raw_s * Same as __raw_spin_lock, but reenable interrupts during spinning. */ #ifndef CONFIG_PROVE_LOCKING -static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) +static inline void __raw_spin_lock_flags(__raw_spinlock_t *lock, unsigned long flags) { asm volatile( "\n1:\t" @@ -65,7 +65,7 @@ static inline void __raw_spin_lock_flags } #endif -static inline int __raw_spin_trylock(raw_spinlock_t *lock) +static inline int __raw_spin_trylock(__raw_spinlock_t *lock) { int oldval; @@ -77,12 +77,12 @@ static inline int __raw_spin_trylock(raw return oldval > 0; } -static inline void __raw_spin_unlock(raw_spinlock_t *lock) +static inline void __raw_spin_unlock(__raw_spinlock_t *lock) { asm volatile("movl $1,%0" :"=m" (lock->slock) :: "memory"); } -static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) +static inline void __raw_spin_unlock_wait(__raw_spinlock_t *lock) { while (__raw_spin_is_locked(lock)) cpu_relax(); @@ -102,17 +102,17 @@ static inline void __raw_spin_unlock_wai * with the high bit (sign) being the "contended" bit. */ -static inline int __raw_read_can_lock(raw_rwlock_t *lock) +static inline int __raw_read_can_lock(__raw_rwlock_t *lock) { return (int)(lock)->lock > 0; } -static inline int __raw_write_can_lock(raw_rwlock_t *lock) +static inline int __raw_write_can_lock(__raw_rwlock_t *lock) { return (lock)->lock == RW_LOCK_BIAS; } -static inline void __raw_read_lock(raw_rwlock_t *rw) +static inline void __raw_read_lock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX "subl $1,(%0)\n\t" "jns 1f\n" @@ -121,7 +121,7 @@ static inline void __raw_read_lock(raw_r ::"D" (rw), "i" (RW_LOCK_BIAS) : "memory"); } -static inline void __raw_write_lock(raw_rwlock_t *rw) +static inline void __raw_write_lock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX "subl %1,(%0)\n\t" "jz 1f\n" @@ -130,7 +130,7 @@ static inline void __raw_write_lock(raw_ ::"D" (rw), "i" (RW_LOCK_BIAS) : "memory"); } -static inline int __raw_read_trylock(raw_rwlock_t *lock) +static inline int __raw_read_trylock(__raw_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; atomic_dec(count); @@ -140,7 +140,7 @@ static inline int __raw_read_trylock(raw return 0; } -static inline int __raw_write_trylock(raw_rwlock_t *lock) +static inline int __raw_write_trylock(__raw_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; if (atomic_sub_and_test(RW_LOCK_BIAS, count)) @@ -149,19 +149,19 @@ static inline int __raw_write_trylock(ra return 0; } -static inline void __raw_read_unlock(raw_rwlock_t *rw) +static inline void __raw_read_unlock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX " ; incl %0" :"=m" (rw->lock) : : "memory"); } -static inline void __raw_write_unlock(raw_rwlock_t *rw) +static inline void __raw_write_unlock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX " ; addl $" RW_LOCK_BIAS_STR ",%0" : "=m" (rw->lock) : : "memory"); } -#define _raw_spin_relax(lock) cpu_relax() -#define _raw_read_relax(lock) cpu_relax() -#define _raw_write_relax(lock) cpu_relax() +#define __raw_spin_relax(lock) cpu_relax() +#define __raw_read_relax(lock) cpu_relax() +#define __raw_write_relax(lock) cpu_relax() #endif /* __ASM_SPINLOCK_H */ Index: linux-2.6.24-rc8-rt1/include/asm-x86/thread_info_64.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/thread_info_64.h 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/thread_info_64.h 2008-01-16 22:28:33.000000000 -0500 @@ -111,6 +111,7 @@ static inline struct thread_info *stack_ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ #define TIF_IRET 5 /* force IRET */ +#define TIF_NEED_RESCHED_DELAYED 6 /* reschedul on return to userspace */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ @@ -133,6 +134,7 @@ static inline struct thread_info *stack_ #define _TIF_SECCOMP (1<wait); } +EXPORT_SYMBOL(__compat_up); + static DEFINE_SPINLOCK(semaphore_lock); -void __sched __down(struct semaphore * sem) +fastcall void __attribute_used__ __sched __compat_down(struct compat_semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -89,7 +91,9 @@ void __sched __down(struct semaphore * s wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +EXPORT_SYMBOL(__compat_down); + +fastcall int __attribute_used__ __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -140,6 +144,8 @@ int __sched __down_interruptible(struct return retval; } +EXPORT_SYMBOL(__compat_down_interruptible); + /* * Trylock failed - make sure we correct for * having decremented the count. @@ -148,7 +154,7 @@ int __sched __down_interruptible(struct * single "cmpxchg" without failure cases, * but then it wouldn't work on a 386. */ -int __down_trylock(struct semaphore * sem) +fastcall int __attribute_used__ __sched __compat_down_trylock(struct compat_semaphore * sem) { int sleepers; unsigned long flags; @@ -168,6 +174,15 @@ int __down_trylock(struct semaphore * se return 1; } +EXPORT_SYMBOL(__compat_down_trylock); + +fastcall int __sched compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} + +EXPORT_SYMBOL(compat_sem_is_locked); + /* * The semaphore operations have a special calling sequence that * allow us to do a simpler in-line version of them. These routines @@ -185,7 +200,7 @@ asm(" .section .sched.text,\"ax\",%progb __down_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down \n\ + bl __compat_down \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ .align 5 \n\ @@ -193,7 +208,7 @@ __down_failed: \n\ __down_interruptible_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down_interruptible \n\ + bl __compat_down_interruptible \n\ mov ip, r0 \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ @@ -202,7 +217,7 @@ __down_interruptible_failed: \n\ __down_trylock_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down_trylock \n\ + bl __compat_down_trylock \n\ mov ip, r0 \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ @@ -211,7 +226,7 @@ __down_trylock_failed: \n\ __up_wakeup: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __up \n\ + bl __compat_up \n\ ldmfd sp!, {r0 - r4, pc} \n\ "); Index: linux-2.6.24-rc8-rt1/include/asm-arm/semaphore.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-arm/semaphore.h 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-arm/semaphore.h 2008-01-16 22:28:33.000000000 -0500 @@ -5,45 +5,65 @@ #define __ASM_ARM_SEMAPHORE_H #include + +#ifdef CONFIG_PREEMPT_RT +# include +#endif + #include #include #include +/* + * On !PREEMPT_RT all semaphores are compat: + */ +#ifndef CONFIG_PREEMPT_RT +# define semaphore compat_semaphore +#endif + #include #include -struct semaphore { +struct compat_semaphore { atomic_t count; int sleepers; wait_queue_head_t wait; }; -#define __SEMAPHORE_INIT(name, cnt) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, cnt) \ { \ .count = ATOMIC_INIT(cnt), \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait), \ } -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INIT(name,count) +#define __COMPAT_MUTEX_INITIALIZER(name) \ + __COMPAT_SEMAPHORE_INITIALIZER(name,1) + +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1) -static inline void sema_init(struct semaphore *sem, int val) +static inline void compat_sema_init(struct compat_semaphore *sem, int val) { atomic_set(&sem->count, val); sem->sleepers = 0; init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX(struct semaphore *sem) +static inline void compat_init_MUTEX(struct compat_semaphore *sem) +{ + compat_sema_init(sem, 1); +} + +static inline void compat_init_MUTEX_LOCKED(struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 0); } -static inline void init_MUTEX_LOCKED(struct semaphore *sem) +static inline int compat_sema_count(struct compat_semaphore *sem) { - sema_init(sem, 0); + return atomic_read(&sem->count); } /* @@ -54,16 +74,18 @@ asmlinkage int __down_interruptible_fai asmlinkage int __down_trylock_failed(void); asmlinkage void __up_wakeup(void); -extern void __down(struct semaphore * sem); -extern int __down_interruptible(struct semaphore * sem); -extern int __down_trylock(struct semaphore * sem); -extern void __up(struct semaphore * sem); +extern void __compat_up(struct compat_semaphore *sem); +extern int __compat_down_interruptible(struct compat_semaphore * sem); +extern int __compat_down_trylock(struct compat_semaphore * sem); +extern void __compat_down(struct compat_semaphore * sem); + +extern int compat_sem_is_locked(struct compat_semaphore *sem); /* * This is ugly, but we want the default case to fall through. * "__down" is the actual routine that waits... */ -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); __down_op(sem, __down_failed); @@ -73,13 +95,13 @@ static inline void down(struct semaphore * This is ugly, but we want the default case to fall through. * "__down_interruptible" is the actual routine that waits... */ -static inline int down_interruptible (struct semaphore * sem) +static inline int compat_down_interruptible (struct compat_semaphore * sem) { might_sleep(); return __down_op_ret(sem, __down_interruptible_failed); } -static inline int down_trylock(struct semaphore *sem) +static inline int compat_down_trylock(struct compat_semaphore *sem) { return __down_op_ret(sem, __down_trylock_failed); } @@ -90,9 +112,10 @@ static inline int down_trylock(struct se * The default case (no contention) will result in NO * jumps for both down() and up(). */ -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { __up_op(sem, __up_wakeup); } +#include #endif Index: linux-2.6.24-rc8-rt1/include/asm-arm/thread_info.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-arm/thread_info.h 2008-01-16 22:27:37.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-arm/thread_info.h 2008-01-16 22:28:33.000000000 -0500 @@ -141,6 +141,7 @@ extern void iwmmxt_task_switch(struct th */ #define TIF_SIGPENDING 0 #define TIF_NEED_RESCHED 1 +#define TIF_NEED_RESCHED_DELAYED 3 #define TIF_SYSCALL_TRACE 8 #define TIF_POLLING_NRFLAG 16 #define TIF_USING_IWMMXT 17 @@ -149,6 +150,7 @@ extern void iwmmxt_task_switch(struct th #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_NEED_RESCHED_DELAYED (1<cpu = cpu; tlb->mm = mm; /* Use fast mode if only one CPU is online */ @@ -91,7 +94,7 @@ tlb_finish_mmu(struct mmu_gather *tlb, u /* keep the page table cache within bounds */ check_pgt_cache(); - put_cpu_var(mmu_gathers); + put_cpu_var_locked(mmu_gathers, tlb->cpu); } /* tlb_remove_page Index: linux-2.6.24-rc8-rt1/include/asm-x86/percpu_32.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/percpu_32.h 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/percpu_32.h 2008-01-16 22:28:34.000000000 -0500 @@ -51,6 +51,10 @@ extern unsigned long __per_cpu_offset[]; /* Separate out the type, so (int[3], foo) works. */ #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name +#define DECLARE_PER_CPU_LOCKED(type, name) \ + extern spinlock_t per_cpu_lock__##name##_locked; \ + extern __typeof__(type) per_cpu__##name##_locked + #define DEFINE_PER_CPU(type, name) \ __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name @@ -59,6 +63,10 @@ extern unsigned long __per_cpu_offset[]; __typeof__(type) per_cpu__##name \ ____cacheline_aligned_in_smp +#define DEFINE_PER_CPU_LOCKED(type, name) \ + __attribute__((__section__(".data.percpu"))) __DEFINE_SPINLOCK(per_cpu_lock__##name##_locked); \ + __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name##_locked + /* We can use this directly for local CPU (faster). */ DECLARE_PER_CPU(unsigned long, this_cpu_off); @@ -74,6 +82,15 @@ DECLARE_PER_CPU(unsigned long, this_cpu_ #define __get_cpu_var(var) __raw_get_cpu_var(var) +#define per_cpu_lock(var, cpu) \ + (*RELOC_HIDE(&per_cpu_lock__##var##_locked, __per_cpu_offset[cpu])) +#define per_cpu_var_locked(var, cpu) \ + (*RELOC_HIDE(&per_cpu__##var##_locked, __per_cpu_offset[cpu])) +#define __get_cpu_lock(var, cpu) \ + per_cpu_lock(var, cpu) +#define __get_cpu_var_locked(var, cpu) \ + per_cpu_var_locked(var, cpu) + /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ do { \ @@ -85,6 +102,8 @@ do { \ #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) +#define EXPORT_PER_CPU_LOCKED_SYMBOL(var) EXPORT_SYMBOL(per_cpu_lock__##var##_locked); EXPORT_SYMBOL(per_cpu__##var##_locked) +#define EXPORT_PER_CPU_LOCKED_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu_lock__##var##_locked); EXPORT_SYMBOL_GPL(per_cpu__##var##_locked) /* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */ #define __percpu_seg "%%fs:" Index: linux-2.6.24-rc8-rt1/include/asm-x86/percpu_64.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/percpu_64.h 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/percpu_64.h 2008-01-16 22:28:34.000000000 -0500 @@ -19,6 +19,9 @@ /* Separate out the type, so (int[3], foo) works. */ #define DEFINE_PER_CPU(type, name) \ __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name +#define DEFINE_PER_CPU_LOCKED(type, name) \ + __attribute__((__section__(".data.percpu"))) __DEFINE_SPINLOCK(per_cpu_lock__##name##_locked); \ + __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name##_locked #define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ __attribute__((__section__(".data.percpu.shared_aligned"))) \ @@ -36,6 +39,15 @@ extern int simple_identifier_##var(void); \ RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()); })) +#define per_cpu_lock(var, cpu) \ + (*RELOC_HIDE(&per_cpu_lock__##var##_locked, __per_cpu_offset(cpu))) +#define per_cpu_var_locked(var, cpu) \ + (*RELOC_HIDE(&per_cpu__##var##_locked, __per_cpu_offset(cpu))) +#define __get_cpu_lock(var, cpu) \ + per_cpu_lock(var, cpu) +#define __get_cpu_var_locked(var, cpu) \ + per_cpu_var_locked(var, cpu) + /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ do { \ @@ -54,15 +66,28 @@ extern void setup_per_cpu_areas(void); #define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ DEFINE_PER_CPU(type, name) +#define DEFINE_PER_CPU_LOCKED(type, name) \ + spinlock_t per_cpu_lock__##name##_locked = SPIN_LOCK_UNLOCKED; \ + __typeof__(type) per_cpu__##name##_locked + #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) +#define per_cpu_var_locked(var, cpu) (*((void)(cpu), &per_cpu__##var##_locked)) #define __get_cpu_var(var) per_cpu__##var #define __raw_get_cpu_var(var) per_cpu__##var +#define __get_cpu_lock(var, cpu) per_cpu_lock__##var##_locked +#define __get_cpu_var_locked(var, cpu) per_cpu__##var##_locked #endif /* SMP */ #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name +#define DECLARE_PER_CPU_LOCKED(type, name) \ + extern spinlock_t per_cpu_lock__##name##_locked; \ + extern __typeof__(type) per_cpu__##name##_locked + #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) +#define EXPORT_PER_CPU_LOCKED_SYMBOL(var) EXPORT_SYMBOL(per_cpu_lock__##var##_locked); EXPORT_SYMBOL(per_cpu__##var##_locked) +#define EXPORT_PER_CPU_LOCKED_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu_lock__##var##_locked); EXPORT_SYMBOL_GPL(per_cpu__##var##_locked) #endif /* _ASM_X8664_PERCPU_H_ */ Index: linux-2.6.24-rc8-rt1/include/linux/percpu.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/percpu.h 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/percpu.h 2008-01-16 22:28:34.000000000 -0500 @@ -31,6 +31,29 @@ &__get_cpu_var(var); })) #define put_cpu_var(var) preempt_enable() +/* + * Per-CPU data structures with an additional lock - useful for + * PREEMPT_RT code that wants to reschedule but also wants + * per-CPU data structures. + * + * 'cpu' gets updated with the CPU the task is currently executing on. + * + * NOTE: on normal !PREEMPT_RT kernels these per-CPU variables + * are the same as the normal per-CPU variables, so there no + * runtime overhead. + */ +#define get_cpu_var_locked(var, cpuptr) \ +(*({ \ + int __cpu = raw_smp_processor_id(); \ + \ + *(cpuptr) = __cpu; \ + spin_lock(&__get_cpu_lock(var, __cpu)); \ + &__get_cpu_var_locked(var, __cpu); \ +})) + +#define put_cpu_var_locked(var, cpu) \ + do { (void)cpu; spin_unlock(&__get_cpu_lock(var, cpu)); } while (0) + #ifdef CONFIG_SMP struct percpu_data { Index: linux-2.6.24-rc8-rt1/mm/swap.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/mm/swap.c 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/mm/swap.c 2008-01-16 22:29:26.000000000 -0500 @@ -33,10 +33,64 @@ /* How many pages do we try to swap or page in/out together? */ int page_cluster; +/* + * On PREEMPT_RT we don't want to disable preemption for cpu variables. + * We grab a cpu and then use that cpu to lock the variables accordingly. + */ +#ifdef CONFIG_PREEMPT_RT +static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_add_pvecs) = { 0, }; +static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_add_active_pvecs) = { 0, }; +static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_rotate_pvecs) = { 0, }; + +#define swap_get_cpu_var_irq_save(var, flags, cpu) \ + ({ \ + (void)flags; \ + &get_cpu_var_locked(var, &cpu); \ + }) +#define swap_put_cpu_var_irq_restore(var, flags, cpu) \ + put_cpu_var_locked(var, cpu) +#define swap_get_cpu_var(var, cpu) \ + &get_cpu_var_locked(var, &cpu) +#define swap_put_cpu_var(var, cpu) \ + put_cpu_var_locked(var, cpu) +#define swap_per_cpu_lock(var, cpu) \ + ({ \ + spin_lock(&__get_cpu_lock(var, cpu)); \ + &__get_cpu_var_locked(var, cpu); \ + }) +#define swap_per_cpu_unlock(var, cpu) \ + spin_unlock(&__get_cpu_lock(var, cpu)); +#define swap_get_cpu() raw_smp_processor_id(); +#define swap_put_cpu() +#else static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; +#define swap_get_cpu_var_irq_save(var, flags, cpu) \ + ({ \ + (void)cpu; \ + local_irq_save(flags); \ + &__get_cpu_var(var); \ + }) +#define swap_put_cpu_var_irq_restore(var, flags, cpu) \ + local_irq_restore(flags) +#define swap_get_cpu_var(var, cpu) \ + ({ (void)cpu; &get_cpu_var(var); }) +#define swap_put_cpu_var(var, cpu) \ + do { \ + (void)cpu; \ + put_cpu_var(var); \ + } while(0) +#define swap_per_cpu_lock(var, cpu) \ + &per_cpu(lru_add_pvecs, cpu) +#define swap_per_cpu_unlock(var, cpu) \ + do { } while(0) +#define swap_get_cpu() get_cpu() +#define swap_put_cpu() put_cpu(); + +#endif /* CONFIG_PREEMPT_RT */ + /* * This path almost never happens for VM activity - pages are normally * freed via pagevecs. But it gets used by networking. @@ -139,6 +193,7 @@ int rotate_reclaimable_page(struct page { struct pagevec *pvec; unsigned long flags; + int cpu; if (PageLocked(page)) return 1; @@ -150,11 +205,10 @@ int rotate_reclaimable_page(struct page return 1; page_cache_get(page); - local_irq_save(flags); - pvec = &__get_cpu_var(lru_rotate_pvecs); + pvec = swap_get_cpu_var_irq_save(lru_rotate_pvecs, flags, cpu); if (!pagevec_add(pvec, page)) pagevec_move_tail(pvec); - local_irq_restore(flags); + swap_put_cpu_var_irq_restore(lru_rotate_pvecs, flags, cpu); if (!test_clear_page_writeback(page)) BUG(); @@ -204,22 +258,24 @@ EXPORT_SYMBOL(mark_page_accessed); */ void fastcall lru_cache_add(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + int cpu; + struct pagevec *pvec = swap_get_cpu_var(lru_add_pvecs, cpu); page_cache_get(page); if (!pagevec_add(pvec, page)) __pagevec_lru_add(pvec); - put_cpu_var(lru_add_pvecs); + swap_put_cpu_var(lru_add_pvecs, cpu); } void fastcall lru_cache_add_active(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); + int cpu; + struct pagevec *pvec = swap_get_cpu_var(lru_add_active_pvecs, cpu); page_cache_get(page); if (!pagevec_add(pvec, page)) __pagevec_lru_add_active(pvec); - put_cpu_var(lru_add_active_pvecs); + swap_put_cpu_var(lru_add_active_pvecs, cpu); } /* @@ -231,33 +287,38 @@ static void drain_cpu_pagevecs(int cpu) { struct pagevec *pvec; - pvec = &per_cpu(lru_add_pvecs, cpu); + pvec = swap_per_cpu_lock(lru_add_pvecs, cpu); if (pagevec_count(pvec)) __pagevec_lru_add(pvec); + swap_per_cpu_unlock(lru_add_pvecs, cpu); - pvec = &per_cpu(lru_add_active_pvecs, cpu); + pvec = swap_per_cpu_lock(lru_add_active_pvecs, cpu); if (pagevec_count(pvec)) __pagevec_lru_add_active(pvec); + swap_per_cpu_unlock(lru_add_active_pvecs, cpu); - pvec = &per_cpu(lru_rotate_pvecs, cpu); + pvec = swap_per_cpu_lock(lru_rotate_pvecs, cpu); if (pagevec_count(pvec)) { unsigned long flags; /* No harm done if a racing interrupt already did this */ - local_irq_save(flags); + local_irq_save_nort(flags); pagevec_move_tail(pvec); - local_irq_restore(flags); + local_irq_restore_nort(flags); } + swap_per_cpu_unlock(lru_rotate_pvecs, cpu); } void lru_add_drain(void) { - drain_cpu_pagevecs(get_cpu()); - put_cpu(); + int cpu; + cpu = swap_get_cpu(); + drain_cpu_pagevecs(cpu); + swap_put_cpu(); } #ifdef CONFIG_NUMA -static void lru_add_drain_per_cpu(struct work_struct *dummy) +static void lru_add_drain_per_cpu(void *info) { lru_add_drain(); } @@ -267,7 +328,7 @@ static void lru_add_drain_per_cpu(struct */ int lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL, 0, 1); } #else Index: linux-2.6.24-rc8-rt1/net/ipv4/netfilter/arp_tables.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/net/ipv4/netfilter/arp_tables.c 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/net/ipv4/netfilter/arp_tables.c 2008-01-16 22:28:34.000000000 -0500 @@ -241,7 +241,7 @@ unsigned int arpt_do_table(struct sk_buf read_lock_bh(&table->lock); private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; + table_base = (void *)private->entries[raw_smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -951,7 +951,7 @@ static int do_add_counters(void __user * i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[smp_processor_id()]; + loc_cpu_entry = private->entries[raw_smp_processor_id()]; ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, Index: linux-2.6.24-rc8-rt1/net/ipv4/netfilter/ip_tables.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/net/ipv4/netfilter/ip_tables.c 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/net/ipv4/netfilter/ip_tables.c 2008-01-16 22:28:34.000000000 -0500 @@ -346,7 +346,7 @@ ipt_do_table(struct sk_buff *skb, read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; + table_base = (void *)private->entries[raw_smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); /* For return from builtin chain */ Index: linux-2.6.24-rc8-rt1/include/net/netfilter/nf_conntrack_ecache.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/net/netfilter/nf_conntrack_ecache.h 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/net/netfilter/nf_conntrack_ecache.h 2008-01-16 22:28:35.000000000 -0500 @@ -15,16 +15,15 @@ struct nf_conntrack_ecache { struct nf_conn *ct; unsigned int events; }; -DECLARE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache); - -#define CONNTRACK_ECACHE(x) (__get_cpu_var(nf_conntrack_ecache).x) +DECLARE_PER_CPU_LOCKED(struct nf_conntrack_ecache, nf_conntrack_ecache); extern struct atomic_notifier_head nf_conntrack_chain; extern int nf_conntrack_register_notifier(struct notifier_block *nb); extern int nf_conntrack_unregister_notifier(struct notifier_block *nb); extern void nf_ct_deliver_cached_events(const struct nf_conn *ct); -extern void __nf_ct_event_cache_init(struct nf_conn *ct); +extern void __nf_ct_event_cache_init(struct nf_conntrack_ecache *ecache, + struct nf_conn *ct); extern void nf_ct_event_cache_flush(void); static inline void @@ -33,12 +32,14 @@ nf_conntrack_event_cache(enum ip_conntra { struct nf_conn *ct = (struct nf_conn *)skb->nfct; struct nf_conntrack_ecache *ecache; + int cpu; local_bh_disable(); - ecache = &__get_cpu_var(nf_conntrack_ecache); + ecache = &get_cpu_var_locked(nf_conntrack_ecache, &cpu); if (ct != ecache->ct) - __nf_ct_event_cache_init(ct); + __nf_ct_event_cache_init(ecache, ct); ecache->events |= event; + put_cpu_var_locked(nf_conntrack_ecache, cpu); local_bh_enable(); } Index: linux-2.6.24-rc8-rt1/net/netfilter/nf_conntrack_ecache.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/net/netfilter/nf_conntrack_ecache.c 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/net/netfilter/nf_conntrack_ecache.c 2008-01-16 22:28:35.000000000 -0500 @@ -29,8 +29,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_chain); ATOMIC_NOTIFIER_HEAD(nf_ct_expect_chain); EXPORT_SYMBOL_GPL(nf_ct_expect_chain); -DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache); -EXPORT_PER_CPU_SYMBOL_GPL(nf_conntrack_ecache); +DEFINE_PER_CPU_LOCKED(struct nf_conntrack_ecache, nf_conntrack_ecache); +EXPORT_PER_CPU_LOCKED_SYMBOL_GPL(nf_conntrack_ecache); /* deliver cached events and clear cache entry - must be called with locally * disabled softirqs */ @@ -52,22 +52,22 @@ __nf_ct_deliver_cached_events(struct nf_ void nf_ct_deliver_cached_events(const struct nf_conn *ct) { struct nf_conntrack_ecache *ecache; + int cpu; local_bh_disable(); - ecache = &__get_cpu_var(nf_conntrack_ecache); + ecache = &get_cpu_var_locked(nf_conntrack_ecache, &cpu); if (ecache->ct == ct) __nf_ct_deliver_cached_events(ecache); + put_cpu_var_locked(nf_conntrack_ecache, cpu); local_bh_enable(); } EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); /* Deliver cached events for old pending events, if current conntrack != old */ -void __nf_ct_event_cache_init(struct nf_conn *ct) +void +__nf_ct_event_cache_init(struct nf_conntrack_ecache *ecache, struct nf_conn *ct) { - struct nf_conntrack_ecache *ecache; - /* take care of delivering potentially old events */ - ecache = &__get_cpu_var(nf_conntrack_ecache); BUG_ON(ecache->ct == ct); if (ecache->ct) __nf_ct_deliver_cached_events(ecache); @@ -85,7 +85,7 @@ void nf_ct_event_cache_flush(void) int cpu; for_each_possible_cpu(cpu) { - ecache = &per_cpu(nf_conntrack_ecache, cpu); + ecache = &__get_cpu_var_locked(nf_conntrack_ecache, cpu); if (ecache->ct) nf_ct_put(ecache->ct); } Index: linux-2.6.24-rc8-rt1/arch/powerpc/mm/init_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/mm/init_32.c 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/mm/init_32.c 2008-01-16 22:28:35.000000000 -0500 @@ -54,7 +54,7 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long total_memory; unsigned long total_lowmem; Index: linux-2.6.24-rc8-rt1/arch/powerpc/mm/tlb_64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/mm/tlb_64.c 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/mm/tlb_64.c 2008-01-16 22:29:28.000000000 -0500 @@ -30,13 +30,14 @@ #include #include #include +#include DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); /* This is declared as we are using the more or less generic * include/asm-powerpc/tlb.h file -- tgall */ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); unsigned long pte_freelist_forced_free; @@ -91,8 +92,11 @@ static void pte_free_submit(struct pte_f void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) { - /* This is safe since tlb_gather_mmu has disabled preemption */ - cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id()); + /* + * This is safe since tlb_gather_mmu has disabled preemption. + * tlb->cpu is set by tlb_gather_mmu as well. + */ + cpumask_t local_cpumask = cpumask_of_cpu(tlb->cpu); struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); if (atomic_read(&tlb->mm->mm_users) < 2 || @@ -127,7 +131,7 @@ void pgtable_free_tlb(struct mmu_gather void hpte_need_flush(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pte, int huge) { - struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); + struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch); unsigned long vsid, vaddr; unsigned int psize; int ssize; @@ -178,6 +182,7 @@ void hpte_need_flush(struct mm_struct *m */ if (!batch->active) { flush_hash_page(vaddr, rpte, psize, ssize, 0); + put_cpu_var(ppc64_tlb_batch); return; } @@ -204,8 +209,22 @@ void hpte_need_flush(struct mm_struct *m batch->pte[i] = rpte; batch->vaddr[i] = vaddr; batch->index = ++i; + +#ifdef CONFIG_PREEMPT_RT + /* + * Since flushing tlb needs expensive hypervisor call(s) on celleb, + * always flush it on RT to reduce scheduling latency. + */ + if (machine_is(celleb)) { + __flush_tlb_pending(batch); + put_cpu_var(ppc64_tlb_batch); + return; + } +#endif /* CONFIG_PREEMPT_RT */ + if (i >= PPC64_TLB_BATCH_NR) __flush_tlb_pending(batch); + put_cpu_var(ppc64_tlb_batch); } /* Index: linux-2.6.24-rc8-rt1/include/asm-powerpc/percpu.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-powerpc/percpu.h 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-powerpc/percpu.h 2008-01-16 22:29:28.000000000 -0500 @@ -19,6 +19,9 @@ /* Separate out the type, so (int[3], foo) works. */ #define DEFINE_PER_CPU(type, name) \ __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name +#define DEFINE_PER_CPU_LOCKED(type, name) \ + __attribute__((__section__(".data.percpu"))) __DEFINE_SPINLOCK(per_cpu_lock__##name##_locked); \ + __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name##_locked #define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ __attribute__((__section__(".data.percpu.shared_aligned"))) \ @@ -30,6 +33,15 @@ #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) #define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, local_paca->data_offset)) +#define per_cpu_lock(var, cpu) \ + (*RELOC_HIDE(&per_cpu_lock__##var##_locked, __per_cpu_offset(cpu))) +#define per_cpu_var_locked(var, cpu) \ + (*RELOC_HIDE(&per_cpu__##var##_locked, __per_cpu_offset(cpu))) +#define __get_cpu_lock(var, cpu) \ + per_cpu_lock(var, cpu) +#define __get_cpu_var_locked(var, cpu) \ + per_cpu_var_locked(var, cpu) + /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ do { \ @@ -47,17 +59,29 @@ extern void setup_per_cpu_areas(void); __typeof__(type) per_cpu__##name #define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ DEFINE_PER_CPU(type, name) +#define DEFINE_PER_CPU_LOCKED(type, name) \ + __DEFINE_SPINLOCK(per_cpu_lock__##name##_locked); \ + __typeof__(type) per_cpu__##name##_locked #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) +#define per_cpu_var_locked(var, cpu) (*((void)(cpu), &per_cpu__##var##_locked)) + #define __get_cpu_var(var) per_cpu__##var #define __raw_get_cpu_var(var) per_cpu__##var +#define __get_cpu_lock(var, cpu) per_cpu_lock__##var##_locked +#define __get_cpu_var_locked(var, cpu) per_cpu__##var##_locked #endif /* SMP */ #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name +#define DECLARE_PER_CPU_LOCKED(type, name) \ + extern spinlock_t per_cpu_lock__##name##_locked; \ + extern __typeof__(type) per_cpu__##name##_locked #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) +#define EXPORT_PER_CPU_LOCKED_SYMBOL(var) EXPORT_SYMBOL(per_cpu_lock__##var##_locked); EXPORT_SYMBOL(per_cpu__##var##_locked) +#define EXPORT_PER_CPU_LOCKED_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu_lock__##var##_locked); EXPORT_SYMBOL_GPL(per_cpu__##var##_locked) #else #include Index: linux-2.6.24-rc8-rt1/net/core/dev.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/net/core/dev.c 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/net/core/dev.c 2008-01-16 22:29:28.000000000 -0500 @@ -1692,11 +1692,10 @@ gso: Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { - int cpu = smp_processor_id(); /* ok because BHs are off */ - if (dev->xmit_lock_owner != cpu) { + if (dev->xmit_lock_owner != (void *)current) { - HARD_TX_LOCK(dev, cpu); + HARD_TX_LOCK(dev); if (!netif_queue_stopped(dev) && !netif_subqueue_stopped(dev, skb)) { @@ -1801,8 +1800,8 @@ int netif_rx_ni(struct sk_buff *skb) { int err; - preempt_disable(); err = netif_rx(skb); + preempt_disable(); if (local_softirq_pending()) do_softirq(); preempt_enable(); @@ -1830,7 +1829,8 @@ static inline struct net_device *skb_bon static void net_tx_action(struct softirq_action *h) { - struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct softnet_data *sd = &per_cpu(softnet_data, + raw_smp_processor_id()); if (sd->completion_queue) { struct sk_buff *clist; @@ -1846,6 +1846,11 @@ static void net_tx_action(struct softirq BUG_TRAP(!atomic_read(&skb->users)); __kfree_skb(skb); + /* + * Safe to reschedule - the list is private + * at this point. + */ + cond_resched_softirq_context(); } } @@ -1864,12 +1869,27 @@ static void net_tx_action(struct softirq smp_mb__before_clear_bit(); clear_bit(__LINK_STATE_SCHED, &dev->state); + /* + * We are executing in softirq context here, and + * if softirqs are preemptible, we must avoid + * infinite reactivation of the softirq by + * either the tx handler, or by netif_schedule(). + * (it would result in an infinitely looping + * softirq context) + * So we take the spinlock unconditionally. + */ +#ifdef CONFIG_PREEMPT_SOFTIRQS + spin_lock(&dev->queue_lock); + qdisc_run(dev); + spin_unlock(&dev->queue_lock); +#else if (spin_trylock(&dev->queue_lock)) { qdisc_run(dev); spin_unlock(&dev->queue_lock); } else { netif_schedule(dev); } +#endif } } } @@ -2037,7 +2057,7 @@ int netif_receive_skb(struct sk_buff *sk if (!orig_dev) return NET_RX_DROP; - __get_cpu_var(netdev_rx_stat).total++; + per_cpu(netdev_rx_stat, raw_smp_processor_id()).total++; skb_reset_network_header(skb); skb_reset_transport_header(skb); @@ -2104,9 +2124,10 @@ out: static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; - struct softnet_data *queue = &__get_cpu_var(softnet_data); + struct softnet_data *queue; unsigned long start_time = jiffies; + queue = &per_cpu(softnet_data, raw_smp_processor_id()); napi->weight = weight_p; do { struct sk_buff *skb; @@ -2144,7 +2165,7 @@ void fastcall __napi_schedule(struct nap local_irq_save(flags); list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list); - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + raise_softirq_irqoff(NET_RX_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(__napi_schedule); @@ -2238,7 +2259,7 @@ out: softnet_break: __get_cpu_var(netdev_rx_stat).time_squeeze++; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + raise_softirq_irqoff(NET_RX_SOFTIRQ); goto out; } @@ -3605,7 +3626,7 @@ int register_netdevice(struct net_device spin_lock_init(&dev->queue_lock); spin_lock_init(&dev->_xmit_lock); netdev_set_lockdep_class(&dev->_xmit_lock, dev->type); - dev->xmit_lock_owner = -1; + dev->xmit_lock_owner = (void *)-1; spin_lock_init(&dev->ingress_lock); dev->iflink = -1; Index: linux-2.6.24-rc8-rt1/fs/buffer.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/fs/buffer.c 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/fs/buffer.c 2008-01-16 22:29:02.000000000 -0500 @@ -40,7 +40,6 @@ #include #include #include -#include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -348,7 +347,7 @@ void invalidate_bdev(struct block_device { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0) + if (mapping_nrpages(mapping) == 0) return; invalidate_bh_lrus(); @@ -403,8 +402,7 @@ static void end_buffer_async_read(struct * decide that the page is now completely done. */ first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; @@ -417,8 +415,7 @@ static void end_buffer_async_read(struct } tmp = tmp->b_this_page; } while (tmp != bh); - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); /* * If none of the buffers had errors and they are all @@ -430,8 +427,7 @@ static void end_buffer_async_read(struct return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); return; } @@ -466,8 +462,7 @@ static void end_buffer_async_write(struc } first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_write(bh); unlock_buffer(bh); @@ -479,14 +474,12 @@ static void end_buffer_async_write(struc } tmp = tmp->b_this_page; } - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); end_page_writeback(page); return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); return; } @@ -704,8 +697,9 @@ static int __set_page_dirty(struct page if (TestSetPageDirty(page)) return 0; - write_lock_irq(&mapping->tree_lock); + lock_page_ref_irq(page); if (page->mapping) { /* Race with truncate? */ + DEFINE_RADIX_TREE_CONTEXT(ctx, &mapping->page_tree); WARN_ON_ONCE(warn && !PageUptodate(page)); if (mapping_cap_account_dirty(mapping)) { @@ -714,10 +708,12 @@ static int __set_page_dirty(struct page BDI_RECLAIMABLE); task_io_account_write(PAGE_CACHE_SIZE); } - radix_tree_tag_set(&mapping->page_tree, + radix_tree_lock(&ctx); + radix_tree_tag_set(ctx.tree, page_index(page), PAGECACHE_TAG_DIRTY); + radix_tree_unlock(&ctx); } - write_unlock_irq(&mapping->tree_lock); + unlock_page_ref_irq(page); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return 1; @@ -3173,6 +3169,8 @@ struct buffer_head *alloc_buffer_head(gf set_migrateflags(gfp_flags, __GFP_RECLAIMABLE)); if (ret) { INIT_LIST_HEAD(&ret->b_assoc_buffers); + spin_lock_init(&ret->b_uptodate_lock); + spin_lock_init(&ret->b_state_lock); get_cpu_var(bh_accounting).nr++; recalc_bh_state(); put_cpu_var(bh_accounting); @@ -3184,6 +3182,8 @@ EXPORT_SYMBOL(alloc_buffer_head); void free_buffer_head(struct buffer_head *bh) { BUG_ON(!list_empty(&bh->b_assoc_buffers)); + BUG_ON(spin_is_locked(&bh->b_uptodate_lock)); + BUG_ON(spin_is_locked(&bh->b_state_lock)); kmem_cache_free(bh_cachep, bh); get_cpu_var(bh_accounting).nr--; recalc_bh_state(); Index: linux-2.6.24-rc8-rt1/fs/ntfs/aops.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/fs/ntfs/aops.c 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/fs/ntfs/aops.c 2008-01-16 22:29:24.000000000 -0500 @@ -103,8 +103,7 @@ static void ntfs_end_buffer_async_read(s "0x%llx.", (unsigned long long)bh->b_blocknr); } first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; @@ -119,8 +118,7 @@ static void ntfs_end_buffer_async_read(s } tmp = tmp->b_this_page; } while (tmp != bh); - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); /* * If none of the buffers had errors then we can set the page uptodate, * but we first have to perform the post read mst fixups, if the @@ -141,13 +139,13 @@ static void ntfs_end_buffer_async_read(s recs = PAGE_CACHE_SIZE / rec_size; /* Should have been verified before we got here... */ BUG_ON(!recs); - local_irq_save(flags); + local_irq_save_nort(flags); kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ); for (i = 0; i < recs; i++) post_read_mst_fixup((NTFS_RECORD*)(kaddr + i * rec_size), rec_size); kunmap_atomic(kaddr, KM_BIO_SRC_IRQ); - local_irq_restore(flags); + local_irq_restore_nort(flags); flush_dcache_page(page); if (likely(page_uptodate && !PageError(page))) SetPageUptodate(page); @@ -155,8 +153,7 @@ static void ntfs_end_buffer_async_read(s unlock_page(page); return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); return; } Index: linux-2.6.24-rc8-rt1/include/linux/buffer_head.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/buffer_head.h 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/buffer_head.h 2008-01-16 22:28:36.000000000 -0500 @@ -21,10 +21,6 @@ enum bh_state_bits { BH_Dirty, /* Is dirty */ BH_Lock, /* Is locked */ BH_Req, /* Has been submitted for I/O */ - BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise - * IO completion of other buffers in the page - */ - BH_Mapped, /* Has a disk mapping */ BH_New, /* Disk mapping was newly created by get_block */ BH_Async_Read, /* Is under end_buffer_async_read I/O */ @@ -73,6 +69,8 @@ struct buffer_head { struct address_space *b_assoc_map; /* mapping this buffer is associated with */ atomic_t b_count; /* users using this buffer_head */ + spinlock_t b_uptodate_lock; + spinlock_t b_state_lock; }; /* Index: linux-2.6.24-rc8-rt1/include/linux/jbd.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/jbd.h 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/jbd.h 2008-01-16 22:28:36.000000000 -0500 @@ -264,6 +264,15 @@ void buffer_assertion_failure(struct buf #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) #endif +/* + * For assertions that are only valid on SMP (e.g. spin_is_locked()): + */ +#ifdef CONFIG_SMP +# define J_ASSERT_JH_SMP(jh, expr) J_ASSERT_JH(jh, expr) +#else +# define J_ASSERT_JH_SMP(jh, assert) do { } while (0) +#endif + #if defined(JBD_PARANOID_IOFAIL) #define J_EXPECT(expr, why...) J_ASSERT(expr) #define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr) @@ -319,32 +328,32 @@ static inline struct journal_head *bh2jh static inline void jbd_lock_bh_state(struct buffer_head *bh) { - bit_spin_lock(BH_State, &bh->b_state); + spin_lock(&bh->b_state_lock); } static inline int jbd_trylock_bh_state(struct buffer_head *bh) { - return bit_spin_trylock(BH_State, &bh->b_state); + return spin_trylock(&bh->b_state_lock); } static inline int jbd_is_locked_bh_state(struct buffer_head *bh) { - return bit_spin_is_locked(BH_State, &bh->b_state); + return spin_is_locked(&bh->b_state_lock); } static inline void jbd_unlock_bh_state(struct buffer_head *bh) { - bit_spin_unlock(BH_State, &bh->b_state); + spin_unlock(&bh->b_state_lock); } static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) { - bit_spin_lock(BH_JournalHead, &bh->b_state); + spin_lock_irq(&bh->b_uptodate_lock); } static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) { - bit_spin_unlock(BH_JournalHead, &bh->b_state); + spin_unlock_irq(&bh->b_uptodate_lock); } struct jbd_revoke_table_s; Index: linux-2.6.24-rc8-rt1/fs/jbd/transaction.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/fs/jbd/transaction.c 2008-01-16 22:27:36.000000000 -0500 +++ linux-2.6.24-rc8-rt1/fs/jbd/transaction.c 2008-01-16 22:28:36.000000000 -0500 @@ -1514,7 +1514,7 @@ static void __journal_temp_unlink_buffer transaction_t *transaction; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); transaction = jh->b_transaction; if (transaction) assert_spin_locked(&transaction->t_journal->j_list_lock); @@ -1957,7 +1957,7 @@ void __journal_file_buffer(struct journa int was_dirty = 0; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); assert_spin_locked(&transaction->t_journal->j_list_lock); J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); @@ -2046,7 +2046,7 @@ void __journal_refile_buffer(struct jour int was_dirty; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); if (jh->b_transaction) assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); Index: linux-2.6.24-rc8-rt1/include/linux/kernel_stat.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/kernel_stat.h 2008-01-16 22:27:35.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/kernel_stat.h 2008-01-16 22:28:37.000000000 -0500 @@ -23,6 +23,8 @@ struct cpu_usage_stat { cputime64_t idle; cputime64_t iowait; cputime64_t steal; + cputime64_t user_rt; + cputime64_t system_rt; cputime64_t guest; }; Index: linux-2.6.24-rc8-rt1/include/asm-generic/bug.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-generic/bug.h 2008-01-16 22:27:35.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-generic/bug.h 2008-01-16 22:28:54.000000000 -0500 @@ -3,6 +3,10 @@ #include +#ifndef __ASSEMBLY__ +extern void __WARN_ON(const char *func, const char *file, const int line); +#endif /* __ASSEMBLY__ */ + #ifdef CONFIG_BUG #ifdef CONFIG_GENERIC_BUG @@ -76,4 +80,16 @@ struct bug_entry { # define WARN_ON_SMP(x) do { } while (0) #endif +#ifdef CONFIG_PREEMPT_RT +# define BUG_ON_RT(c) BUG_ON(c) +# define BUG_ON_NONRT(c) do { } while (0) +# define WARN_ON_RT(condition) WARN_ON(condition) +# define WARN_ON_NONRT(condition) do { } while (0) +#else +# define BUG_ON_RT(c) do { } while (0) +# define BUG_ON_NONRT(c) BUG_ON(c) +# define WARN_ON_RT(condition) do { } while (0) +# define WARN_ON_NONRT(condition) WARN_ON(condition) +#endif + #endif Index: linux-2.6.24-rc8-rt1/include/linux/posix-timers.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/posix-timers.h 2008-01-16 22:27:35.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/posix-timers.h 2008-01-16 22:28:38.000000000 -0500 @@ -115,4 +115,6 @@ void set_process_cpu_timer(struct task_s long clock_nanosleep_restart(struct restart_block *restart_block); +int posix_cpu_thread_init(void); + #endif Index: linux-2.6.24-rc8-rt1/kernel/posix-cpu-timers.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/posix-cpu-timers.c 2008-01-16 22:27:35.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/posix-cpu-timers.c 2008-01-16 22:28:38.000000000 -0500 @@ -578,7 +578,7 @@ static void arm_timer(struct k_itimer *t p->cpu_timers : p->signal->cpu_timers); head += CPUCLOCK_WHICH(timer->it_clock); - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); spin_lock(&p->sighand->siglock); listpos = head; @@ -735,7 +735,7 @@ int posix_cpu_timer_set(struct k_itimer /* * Disarm any old timer after extracting its expiry time. */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); ret = 0; spin_lock(&p->sighand->siglock); @@ -1287,28 +1287,21 @@ out: * already updated our counts. We need to check if any timers fire now. * Interrupts are disabled. */ -void run_posix_cpu_timers(struct task_struct *tsk) +void __run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; - BUG_ON(!irqs_disabled()); - -#define UNEXPIRED(clock) \ - (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ - cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) - - if (UNEXPIRED(prof) && UNEXPIRED(virt) && - (tsk->it_sched_expires == 0 || - tsk->se.sum_exec_runtime < tsk->it_sched_expires)) - return; - -#undef UNEXPIRED - /* * Double-check with locks held. */ read_lock(&tasklist_lock); + /* Make sure the task doesn't exit under us. */ + if (unlikely(tsk->exit_state)) { + read_unlock(&tasklist_lock); + return; + } + if (likely(tsk->signal != NULL)) { spin_lock(&tsk->sighand->siglock); @@ -1355,6 +1348,182 @@ void run_posix_cpu_timers(struct task_st } } +#include +#include +DEFINE_PER_CPU(struct task_struct *, posix_timer_task); +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist); + +static int posix_cpu_timers_thread(void *data) +{ + int cpu = (long)data; + + BUG_ON(per_cpu(posix_timer_task,cpu) != current); + + + while (!kthread_should_stop()) { + struct task_struct *tsk = NULL; + struct task_struct *next = NULL; + + if (cpu_is_offline(cpu)) { + goto wait_to_die; + } + + /* grab task list */ + raw_local_irq_disable(); + tsk = per_cpu(posix_timer_tasklist, cpu); + per_cpu(posix_timer_tasklist, cpu) = NULL; + raw_local_irq_enable(); + + + /* its possible the list is empty, just return */ + if (!tsk) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + continue; + } + + /* Process task list */ + while (1) { + /* save next */ + next = tsk->posix_timer_list; + + /* run the task timers, clear its ptr and + * unreference it + */ + __run_posix_cpu_timers(tsk); + tsk->posix_timer_list = NULL; + put_task_struct(tsk); + + /* check if this is the last on the list */ + if (next == tsk) + break; + tsk = next; + } + } + return 0; + +wait_to_die: + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +void run_posix_cpu_timers(struct task_struct *tsk) +{ + unsigned long cpu = smp_processor_id(); + struct task_struct *tasklist; + + BUG_ON(!irqs_disabled()); + if(!per_cpu(posix_timer_task, cpu)) + return; + + +#define UNEXPIRED(clock) \ + (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ + cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) + + if (UNEXPIRED(prof) && UNEXPIRED(virt) && + (tsk->it_sched_expires == 0 || + tsk->se.sum_exec_runtime < tsk->it_sched_expires)) + return; + +#undef UNEXPIRED + + /* get per-cpu references */ + tasklist = per_cpu(posix_timer_tasklist, cpu); + + /* check to see if we're already queued */ + if (!tsk->posix_timer_list) { + get_task_struct(tsk); + if (tasklist) { + tsk->posix_timer_list = tasklist; + } else { + /* + * The list is terminated by a self-pointing + * task_struct + */ + tsk->posix_timer_list = tsk; + } + per_cpu(posix_timer_tasklist, cpu) = tsk; + } + /* XXX signal the thread somehow */ + wake_up_process(per_cpu(posix_timer_task, cpu)); +} + + + + +/* + * posix_cpu_thread_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +static int posix_cpu_thread_call(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + struct task_struct *p; + struct sched_param param; + + switch (action) { + case CPU_UP_PREPARE: + p = kthread_create(posix_cpu_timers_thread, hcpu, + "posix_cpu_timers/%d",cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + p->flags |= PF_NOFREEZE; + kthread_bind(p, cpu); + /* Must be high prio to avoid getting starved */ + param.sched_priority = MAX_RT_PRIO-1; + sched_setscheduler(p, SCHED_FIFO, ¶m); + per_cpu(posix_timer_task,cpu) = p; + break; + case CPU_ONLINE: + /* Strictly unneccessary, as first user will wake it. */ + wake_up_process(per_cpu(posix_timer_task,cpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind it from offline cpu so it can run. Fall thru. */ + kthread_bind(per_cpu(posix_timer_task,cpu), + any_online_cpu(cpu_online_map)); + kthread_stop(per_cpu(posix_timer_task,cpu)); + per_cpu(posix_timer_task,cpu) = NULL; + break; + case CPU_DEAD: + kthread_stop(per_cpu(posix_timer_task,cpu)); + per_cpu(posix_timer_task,cpu) = NULL; + break; +#endif + } + return NOTIFY_OK; +} + +/* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else. + */ +static struct notifier_block __devinitdata posix_cpu_thread_notifier = { + .notifier_call = posix_cpu_thread_call, + .priority = 10 +}; + +int __init posix_cpu_thread_init(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + /* Start one for boot CPU. */ + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, cpu); + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, cpu); + register_cpu_notifier(&posix_cpu_thread_notifier); + return 0; +} + + + /* * Set one of the process-wide special case CPU timers. * The tasklist_lock and tsk->sighand->siglock must be held by the caller. @@ -1620,6 +1789,12 @@ static __init int init_posix_cpu_timers( .nsleep = thread_cpu_nsleep, .nsleep_restart = thread_cpu_nsleep_restart, }; + unsigned long cpu; + + /* init the per-cpu posix_timer_tasklets */ + for_each_cpu_mask(cpu, cpu_possible_map) { + per_cpu(posix_timer_tasklist, cpu) = NULL; + } register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); Index: linux-2.6.24-rc8-rt1/drivers/net/3c59x.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/net/3c59x.c 2008-01-16 22:27:35.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/net/3c59x.c 2008-01-16 22:28:38.000000000 -0500 @@ -792,9 +792,9 @@ static void poll_vortex(struct net_devic { struct vortex_private *vp = netdev_priv(dev); unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #endif @@ -1739,6 +1739,7 @@ vortex_timer(unsigned long data) int next_tick = 60*HZ; int ok = 0; int media_status, old_window; + unsigned long flags; if (vortex_debug > 2) { printk(KERN_DEBUG "%s: Media selection timer tick happened, %s.\n", @@ -1746,7 +1747,7 @@ vortex_timer(unsigned long data) printk(KERN_DEBUG "dev->watchdog_timeo=%d\n", dev->watchdog_timeo); } - disable_irq_lockdep(dev->irq); + spin_lock_irqsave(&vp->lock, flags); old_window = ioread16(ioaddr + EL3_CMD) >> 13; EL3WINDOW(4); media_status = ioread16(ioaddr + Wn4_Media); @@ -1769,9 +1770,7 @@ vortex_timer(unsigned long data) case XCVR_MII: case XCVR_NWAY: { ok = 1; - spin_lock_bh(&vp->lock); vortex_check_media(dev, 0); - spin_unlock_bh(&vp->lock); } break; default: /* Other media types handled by Tx timeouts. */ @@ -1827,7 +1826,7 @@ leave_media_alone: dev->name, media_tbl[dev->if_port].name); EL3WINDOW(old_window); - enable_irq_lockdep(dev->irq); + spin_unlock_irqrestore(&vp->lock, flags); mod_timer(&vp->timer, RUN_AT(next_tick)); if (vp->deferred) iowrite16(FakeIntr, ioaddr + EL3_CMD); @@ -1860,13 +1859,17 @@ static void vortex_tx_timeout(struct net /* * Block interrupts because vortex_interrupt does a bare spin_lock() */ +#ifndef CONFIG_PREEMPT_RT unsigned long flags; local_irq_save(flags); +#endif if (vp->full_bus_master_tx) boomerang_interrupt(dev->irq, dev); else vortex_interrupt(dev->irq, dev); +#ifndef CONFIG_PREEMPT_RT local_irq_restore(flags); +#endif } } Index: linux-2.6.24-rc8-rt1/drivers/serial/8250.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/serial/8250.c 2008-01-16 22:27:35.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/serial/8250.c 2008-01-16 22:28:39.000000000 -0500 @@ -1455,7 +1455,10 @@ static irqreturn_t serial8250_interrupt( { struct irq_info *i = dev_id; struct list_head *l, *end = NULL; - int pass_counter = 0, handled = 0; +#ifndef CONFIG_PREEMPT_RT + int pass_counter = 0; +#endif + int handled = 0; DEBUG_INTR("serial8250_interrupt(%d)...", irq); @@ -1493,12 +1496,18 @@ static irqreturn_t serial8250_interrupt( l = l->next; + /* + * On preempt-rt we can be preempted and run in our + * own thread. + */ +#ifndef CONFIG_PREEMPT_RT if (l == i->head && pass_counter++ > PASS_LIMIT) { /* If we hit this, we're dead. */ printk(KERN_ERR "serial8250: too much work for " "irq%d\n", irq); break; } +#endif } while (l != end); spin_unlock(&i->lock); @@ -2473,14 +2482,10 @@ serial8250_console_write(struct console touch_nmi_watchdog(); - local_irq_save(flags); - if (up->port.sysrq) { - /* serial8250_handle_port() already took the lock */ - locked = 0; - } else if (oops_in_progress) { - locked = spin_trylock(&up->port.lock); - } else - spin_lock(&up->port.lock); + if (up->port.sysrq || oops_in_progress) + locked = spin_trylock_irqsave(&up->port.lock, flags); + else + spin_lock_irqsave(&up->port.lock, flags); /* * First save the IER then disable the interrupts @@ -2512,8 +2517,7 @@ serial8250_console_write(struct console check_modem_status(up); if (locked) - spin_unlock(&up->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&up->port.lock, flags); } static int __init serial8250_console_setup(struct console *co, char *options) Index: linux-2.6.24-rc8-rt1/drivers/net/ibm_emac/ibm_emac_core.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/net/ibm_emac/ibm_emac_core.c 2008-01-16 22:27:35.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/net/ibm_emac/ibm_emac_core.c 2008-01-16 22:28:39.000000000 -0500 @@ -1058,6 +1058,8 @@ static inline int emac_xmit_finish(struc ++dev->stats.tx_packets; dev->stats.tx_bytes += len; + spin_unlock(&dev->tx_lock); + return 0; } @@ -1071,6 +1073,7 @@ static int emac_start_xmit(struct sk_buf u16 ctrl = EMAC_TX_CTRL_GFCS | EMAC_TX_CTRL_GP | MAL_TX_CTRL_READY | MAL_TX_CTRL_LAST | emac_tx_csum(dev, skb); + spin_lock(&dev->tx_lock); slot = dev->tx_slot++; if (dev->tx_slot == NUM_TX_BUFF) { dev->tx_slot = 0; @@ -1133,6 +1136,8 @@ static int emac_start_xmit_sg(struct sk_ if (likely(!nr_frags && len <= MAL_MAX_TX_SIZE)) return emac_start_xmit(skb, ndev); + spin_lock(&dev->tx_lock); + len -= skb->data_len; /* Note, this is only an *estimation*, we can still run out of empty @@ -1201,6 +1206,7 @@ static int emac_start_xmit_sg(struct sk_ stop_queue: netif_stop_queue(ndev); DBG2("%d: stopped TX queue" NL, dev->def->index); + spin_unlock(&dev->tx_lock); return 1; } #else @@ -1240,6 +1246,7 @@ static void emac_poll_tx(void *param) DBG2("%d: poll_tx, %d %d" NL, dev->def->index, dev->tx_cnt, dev->ack_slot); + spin_lock(&dev->tx_lock); if (dev->tx_cnt) { u16 ctrl; int slot = dev->ack_slot, n = 0; @@ -1249,6 +1256,7 @@ static void emac_poll_tx(void *param) struct sk_buff *skb = dev->tx_skb[slot]; ++n; + spin_unlock(&dev->tx_lock); if (skb) { dev_kfree_skb(skb); dev->tx_skb[slot] = NULL; @@ -1258,6 +1266,7 @@ static void emac_poll_tx(void *param) if (unlikely(EMAC_IS_BAD_TX(ctrl))) emac_parse_tx_error(dev, ctrl); + spin_lock(&dev->tx_lock); if (--dev->tx_cnt) goto again; } @@ -1270,6 +1279,7 @@ static void emac_poll_tx(void *param) DBG2("%d: tx %d pkts" NL, dev->def->index, n); } } + spin_unlock(&dev->tx_lock); } static inline void emac_recycle_rx_skb(struct ocp_enet_private *dev, int slot, @@ -1964,6 +1974,7 @@ static int __init emac_probe(struct ocp_ dev->ndev = ndev; dev->ldev = &ocpdev->dev; dev->def = ocpdev->def; + spin_lock_init(&dev->tx_lock); /* Find MAL device we are connected to */ maldev = Index: linux-2.6.24-rc8-rt1/drivers/net/ibm_emac/ibm_emac_core.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/net/ibm_emac/ibm_emac_core.h 2008-01-16 22:27:35.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/net/ibm_emac/ibm_emac_core.h 2008-01-16 22:28:39.000000000 -0500 @@ -193,6 +193,8 @@ struct ocp_enet_private { struct ibm_emac_error_stats estats; struct net_device_stats nstats; + spinlock_t tx_lock; + struct device* ldev; }; Index: linux-2.6.24-rc8-rt1/drivers/char/tty_io.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/char/tty_io.c 2008-01-16 22:27:35.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/char/tty_io.c 2008-01-16 22:29:00.000000000 -0500 @@ -242,14 +242,13 @@ int tty_paranoia_check(struct tty_struct static int check_tty_count(struct tty_struct *tty, const char *routine) { #ifdef CHECK_TTY_COUNT - struct list_head *p; + struct file *filp; int count = 0; - - file_list_lock(); - list_for_each(p, &tty->tty_files) { + + percpu_list_fold(&tty->tty_files); + lock_list_for_each_entry(filp, percpu_list_head(&tty->tty_files), f_u.fu_llist) count++; - } - file_list_unlock(); + if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_SLAVE && tty->link && tty->link->count) @@ -258,6 +257,7 @@ static int check_tty_count(struct tty_st printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) " "!= #fd's(%d) in %s\n", tty->name, tty->count, count, routine); + dump_stack(); return count; } #endif @@ -1375,9 +1375,8 @@ static void do_tty_hangup(struct work_st spin_unlock(&redirect_lock); check_tty_count(tty, "do_tty_hangup"); - file_list_lock(); /* This breaks for file handles being sent over AF_UNIX sockets ? */ - list_for_each_entry(filp, &tty->tty_files, f_u.fu_list) { + lock_list_for_each_entry(filp, percpu_list_head(&tty->tty_files), f_u.fu_llist) { if (filp->f_op->write == redirected_tty_write) cons_filp = filp; if (filp->f_op->write != tty_write) @@ -1386,7 +1385,6 @@ static void do_tty_hangup(struct work_st tty_fasync(-1, filp, 0); /* can't block */ filp->f_op = &hung_up_tty_fops; } - file_list_unlock(); /* FIXME! What are the locking issues here? This may me overdoing things.. * this question is especially important now that we've removed the irqlock. */ @@ -2267,9 +2265,9 @@ static void release_one_tty(struct tty_s tty->magic = 0; tty->driver->refcount--; - file_list_lock(); - list_del_init(&tty->tty_files); - file_list_unlock(); + percpu_list_fold(&tty->tty_files); + lock_list_del_init(percpu_list_head(&tty->tty_files)); + percpu_list_destroy(&tty->tty_files); free_tty_struct(tty); } @@ -3691,10 +3689,14 @@ void tty_flip_buffer_push(struct tty_str tty->buf.tail->commit = tty->buf.tail->used; spin_unlock_irqrestore(&tty->buf.lock, flags); +#ifndef CONFIG_PREEMPT_RT if (tty->low_latency) flush_to_ldisc(&tty->buf.work.work); else schedule_delayed_work(&tty->buf.work, 1); +#else + flush_to_ldisc(&tty->buf.work.work); +#endif } EXPORT_SYMBOL(tty_flip_buffer_push); @@ -3729,7 +3731,7 @@ static void initialize_tty_struct(struct mutex_init(&tty->atomic_read_lock); mutex_init(&tty->atomic_write_lock); spin_lock_init(&tty->read_lock); - INIT_LIST_HEAD(&tty->tty_files); + percpu_list_init(&tty->tty_files); INIT_WORK(&tty->SAK_work, do_SAK_work); } Index: linux-2.6.24-rc8-rt1/arch/arm/kernel/dma.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/kernel/dma.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/kernel/dma.c 2008-01-16 22:28:39.000000000 -0500 @@ -20,7 +20,7 @@ #include -DEFINE_SPINLOCK(dma_spin_lock); +DEFINE_RAW_SPINLOCK(dma_spin_lock); EXPORT_SYMBOL(dma_spin_lock); static dma_t dma_chan[MAX_DMA_CHANNELS]; Index: linux-2.6.24-rc8-rt1/arch/arm/kernel/signal.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/kernel/signal.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/kernel/signal.c 2008-01-16 22:28:39.000000000 -0500 @@ -623,6 +623,14 @@ static int do_signal(sigset_t *oldset, s siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif + /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux-2.6.24-rc8-rt1/arch/arm/kernel/smp.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/kernel/smp.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/kernel/smp.c 2008-01-16 22:28:39.000000000 -0500 @@ -522,7 +522,7 @@ static void ipi_call_function(unsigned i cpu_clear(cpu, data->unfinished); } -static DEFINE_SPINLOCK(stop_lock); +static DEFINE_RAW_SPINLOCK(stop_lock); /* * ipi_cpu_stop - handle IPI from smp_send_stop() Index: linux-2.6.24-rc8-rt1/arch/arm/mm/consistent.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/mm/consistent.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/mm/consistent.c 2008-01-16 22:28:39.000000000 -0500 @@ -40,7 +40,7 @@ * These are the page tables (2MB each) covering uncached, DMA consistent allocations */ static pte_t *consistent_pte[NUM_CONSISTENT_PTES]; -static DEFINE_SPINLOCK(consistent_lock); +static DEFINE_RAW_SPINLOCK(consistent_lock); /* * VM region handling support. Index: linux-2.6.24-rc8-rt1/arch/arm/mm/copypage-v6.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/mm/copypage-v6.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/mm/copypage-v6.c 2008-01-16 22:28:39.000000000 -0500 @@ -26,7 +26,7 @@ #define from_address (0xffff8000) #define to_address (0xffffc000) -static DEFINE_SPINLOCK(v6_lock); +static DEFINE_RAW_SPINLOCK(v6_lock); /* * Copy the user page. No aliasing to deal with so we can just Index: linux-2.6.24-rc8-rt1/arch/arm/mm/mmu.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/mm/mmu.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/mm/mmu.c 2008-01-16 22:28:39.000000000 -0500 @@ -25,7 +25,7 @@ #include "mm.h" -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); extern void _stext, _etext, __data_start, _end; extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; Index: linux-2.6.24-rc8-rt1/include/asm-arm/dma.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-arm/dma.h 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-arm/dma.h 2008-01-16 22:28:39.000000000 -0500 @@ -27,7 +27,7 @@ typedef unsigned int dmamode_t; #define DMA_MODE_CASCADE 2 #define DMA_AUTOINIT 4 -extern spinlock_t dma_spin_lock; +extern raw_spinlock_t dma_spin_lock; static inline unsigned long claim_dma_lock(void) { Index: linux-2.6.24-rc8-rt1/include/asm-arm/tlb.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-arm/tlb.h 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-arm/tlb.h 2008-01-16 22:28:39.000000000 -0500 @@ -36,15 +36,18 @@ struct mmu_gather { struct mm_struct *mm; unsigned int fullmm; + int cpu; }; -DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); +DECLARE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); static inline struct mmu_gather * tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) { - struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); + int cpu; + struct mmu_gather *tlb = &get_cpu_var_locked(mmu_gathers, &cpu); + tlb->cpu = cpu; tlb->mm = mm; tlb->fullmm = full_mm_flush; @@ -60,7 +63,7 @@ tlb_finish_mmu(struct mmu_gather *tlb, u /* keep the page table cache within bounds */ check_pgt_cache(); - put_cpu_var(mmu_gathers); + put_cpu_var_locked(mmu_gathers, tlb->cpu); } #define tlb_remove_tlb_entry(tlb,ptep,address) do { } while (0) Index: linux-2.6.24-rc8-rt1/arch/arm/mm/context.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/mm/context.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/mm/context.c 2008-01-16 22:28:40.000000000 -0500 @@ -14,7 +14,7 @@ #include #include -static DEFINE_SPINLOCK(cpu_asid_lock); +static DEFINE_RAW_SPINLOCK(cpu_asid_lock); unsigned int cpu_last_asid = ASID_FIRST_VERSION; /* Index: linux-2.6.24-rc8-rt1/arch/arm/mach-sa1100/badge4.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/mach-sa1100/badge4.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/mach-sa1100/badge4.c 2008-01-16 22:28:40.000000000 -0500 @@ -240,15 +240,22 @@ void badge4_set_5V(unsigned subsystem, i /* detect on->off and off->on transitions */ if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { /* was off, now on */ - printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__); GPSR = BADGE4_GPIO_PCMEN5V; } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { /* was on, now off */ - printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__); GPCR = BADGE4_GPIO_PCMEN5V; } local_irq_restore(flags); + + /* detect on->off and off->on transitions */ + if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { + /* was off, now on */ + printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__); + } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { + /* was on, now off */ + printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__); + } } EXPORT_SYMBOL(badge4_set_5V); Index: linux-2.6.24-rc8-rt1/arch/arm/mach-footbridge/netwinder-hw.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/mach-footbridge/netwinder-hw.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/mach-footbridge/netwinder-hw.c 2008-01-16 22:28:40.000000000 -0500 @@ -67,7 +67,7 @@ static inline void wb977_ww(int reg, int /* * This is a lock for accessing ports GP1_IO_BASE and GP2_IO_BASE */ -DEFINE_SPINLOCK(gpio_lock); +DEFINE_RAW_SPINLOCK(gpio_lock); static unsigned int current_gpio_op; static unsigned int current_gpio_io; Index: linux-2.6.24-rc8-rt1/arch/arm/mach-footbridge/netwinder-leds.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/mach-footbridge/netwinder-leds.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/mach-footbridge/netwinder-leds.c 2008-01-16 22:28:40.000000000 -0500 @@ -32,7 +32,7 @@ static char led_state; static char hw_led_state; static DEFINE_SPINLOCK(leds_lock); -extern spinlock_t gpio_lock; +extern raw_spinlock_t gpio_lock; static void netwinder_leds_event(led_event_t evt) { Index: linux-2.6.24-rc8-rt1/arch/arm/mach-integrator/core.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/mach-integrator/core.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/mach-integrator/core.c 2008-01-16 22:28:41.000000000 -0500 @@ -164,7 +164,7 @@ static struct amba_pl010_data integrator #define CM_CTRL IO_ADDRESS(INTEGRATOR_HDR_BASE) + INTEGRATOR_HDR_CTRL_OFFSET -static DEFINE_SPINLOCK(cm_lock); +static DEFINE_RAW_SPINLOCK(cm_lock); /** * cm_control - update the CM_CTRL register. Index: linux-2.6.24-rc8-rt1/arch/arm/mach-integrator/pci_v3.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/mach-integrator/pci_v3.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/mach-integrator/pci_v3.c 2008-01-16 22:28:41.000000000 -0500 @@ -162,7 +162,7 @@ * 7:2 register number * */ -static DEFINE_SPINLOCK(v3_lock); +static DEFINE_RAW_SPINLOCK(v3_lock); #define PCI_BUS_NONMEM_START 0x00000000 #define PCI_BUS_NONMEM_SIZE SZ_256M Index: linux-2.6.24-rc8-rt1/arch/arm/mach-ixp4xx/common-pci.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/mach-ixp4xx/common-pci.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/mach-ixp4xx/common-pci.c 2008-01-16 22:28:41.000000000 -0500 @@ -53,7 +53,7 @@ unsigned long ixp4xx_pci_reg_base = 0; * these transactions are atomic or we will end up * with corrupt data on the bus or in a driver. */ -static DEFINE_SPINLOCK(ixp4xx_pci_lock); +static DEFINE_RAW_SPINLOCK(ixp4xx_pci_lock); /* * Read from PCI config space Index: linux-2.6.24-rc8-rt1/include/asm-arm/arch-pxa/timex.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-arm/arch-pxa/timex.h 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-arm/arch-pxa/timex.h 2008-01-16 22:28:41.000000000 -0500 @@ -16,6 +16,8 @@ #define CLOCK_TICK_RATE 3686400 #elif defined(CONFIG_PXA27x) /* PXA27x timer base */ +#include +#include #ifdef CONFIG_MACH_MAINSTONE #define CLOCK_TICK_RATE 3249600 #else @@ -24,3 +26,7 @@ #else #define CLOCK_TICK_RATE 3250000 #endif + +#define mach_read_cycles() OSCR +#define mach_cycles_to_usecs(d) (((d) * ((1000000LL << 32) / CLOCK_TICK_RATE)) >> 32) +#define mach_usecs_to_cycles(d) (((d) * (((long long)CLOCK_TICK_RATE << 32) / 1000000)) >> 32) Index: linux-2.6.24-rc8-rt1/arch/arm/mach-shark/leds.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/arm/mach-shark/leds.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/arm/mach-shark/leds.c 2008-01-16 22:28:41.000000000 -0500 @@ -32,7 +32,7 @@ static char led_state; static short hw_led_state; static short saved_state; -static DEFINE_SPINLOCK(leds_lock); +static DEFINE_RAW_SPINLOCK(leds_lock); short sequoia_read(int addr) { outw(addr,0x24); Index: linux-2.6.24-rc8-rt1/arch/mips/kernel/asm-offsets.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/kernel/asm-offsets.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/kernel/asm-offsets.c 2008-01-16 22:28:41.000000000 -0500 @@ -10,9 +10,11 @@ */ #include #include +#include #include #include #include +#include #include #include Index: linux-2.6.24-rc8-rt1/arch/mips/kernel/entry.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/kernel/entry.S 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/kernel/entry.S 2008-01-16 22:28:41.000000000 -0500 @@ -30,7 +30,7 @@ .align 5 #ifndef CONFIG_PREEMPT FEXPORT(ret_from_exception) - local_irq_disable # preempt stop + raw_local_irq_disable # preempt stop b __ret_from_irq #endif FEXPORT(ret_from_irq) @@ -41,7 +41,7 @@ FEXPORT(__ret_from_irq) beqz t0, resume_kernel resume_userspace: - local_irq_disable # make sure we dont miss an + raw_local_irq_disable # make sure we dont miss an # interrupt setting need_resched # between sampling and return LONG_L a2, TI_FLAGS($28) # current->work @@ -51,7 +51,9 @@ resume_userspace: #ifdef CONFIG_PREEMPT resume_kernel: - local_irq_disable + raw_local_irq_disable + lw t0, kernel_preemption + beqz t0, restore_all lw t0, TI_PRE_COUNT($28) bnez t0, restore_all need_resched: @@ -61,7 +63,9 @@ need_resched: LONG_L t0, PT_STATUS(sp) # Interrupts off? andi t0, 1 beqz t0, restore_all + raw_local_irq_disable jal preempt_schedule_irq + sw zero, TI_PRE_COUNT($28) b need_resched #endif @@ -69,7 +73,7 @@ FEXPORT(ret_from_fork) jal schedule_tail # a0 = struct task_struct *prev FEXPORT(syscall_exit) - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work @@ -142,19 +146,21 @@ FEXPORT(restore_partial) # restore part .set at work_pending: - andi t0, a2, _TIF_NEED_RESCHED # a2 is preloaded with TI_FLAGS + # a2 is preloaded with TI_FLAGS + andi t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beqz t0, work_notifysig work_resched: + raw_local_irq_enable t0 jal schedule - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) andi t0, a2, _TIF_WORK_MASK # is there any work to be done # other than syscall tracing? beqz t0, restore_all - andi t0, a2, _TIF_NEED_RESCHED + andi t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bnez t0, work_resched work_notifysig: # deal with pending signals and @@ -170,7 +176,7 @@ syscall_exit_work: li t0, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT and t0, a2 # a2 is preloaded with TI_FLAGS beqz t0, work_pending # trace bit set? - local_irq_enable # could let do_syscall_trace() + raw_local_irq_enable # could let do_syscall_trace() # call schedule() instead move a0, sp li a1, 1 Index: linux-2.6.24-rc8-rt1/arch/mips/kernel/i8259.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/kernel/i8259.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/kernel/i8259.c 2008-01-16 22:28:41.000000000 -0500 @@ -29,7 +29,7 @@ */ static int i8259A_auto_eoi = -1; -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void disable_8259A_irq(unsigned int irq); static void enable_8259A_irq(unsigned int irq); static void mask_and_ack_8259A(unsigned int irq); Index: linux-2.6.24-rc8-rt1/arch/mips/kernel/module.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/kernel/module.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/kernel/module.c 2008-01-16 22:28:41.000000000 -0500 @@ -40,7 +40,7 @@ struct mips_hi16 { static struct mips_hi16 *mips_hi16_list; static LIST_HEAD(dbe_list); -static DEFINE_SPINLOCK(dbe_lock); +static DEFINE_RAW_SPINLOCK(dbe_lock); void *module_alloc(unsigned long size) { Index: linux-2.6.24-rc8-rt1/arch/mips/kernel/process.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/kernel/process.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/kernel/process.c 2008-01-16 22:28:41.000000000 -0500 @@ -54,7 +54,7 @@ void __noreturn cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { tick_nohz_stop_sched_tick(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { #ifdef CONFIG_SMTC_IDLE_HOOK_DEBUG extern void smtc_idle_loop_hook(void); @@ -64,9 +64,11 @@ void __noreturn cpu_idle(void) (*cpu_wait)(); } tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } Index: linux-2.6.24-rc8-rt1/arch/mips/kernel/scall32-o32.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/kernel/scall32-o32.S 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/kernel/scall32-o32.S 2008-01-16 22:28:41.000000000 -0500 @@ -73,7 +73,7 @@ stack_done: 1: sw v0, PT_R2(sp) # result o32_syscall_exit: - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return lw a2, TI_FLAGS($28) # current->work Index: linux-2.6.24-rc8-rt1/arch/mips/kernel/scall64-64.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/kernel/scall64-64.S 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/kernel/scall64-64.S 2008-01-16 22:28:41.000000000 -0500 @@ -72,7 +72,7 @@ NESTED(handle_sys64, PT_SIZE, sp) 1: sd v0, PT_R2(sp) # result n64_syscall_exit: - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work Index: linux-2.6.24-rc8-rt1/arch/mips/kernel/scall64-n32.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/kernel/scall64-n32.S 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/kernel/scall64-n32.S 2008-01-16 22:28:41.000000000 -0500 @@ -69,7 +69,7 @@ NESTED(handle_sysn32, PT_SIZE, sp) sd v0, PT_R0(sp) # set flag for syscall restarting 1: sd v0, PT_R2(sp) # result - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work Index: linux-2.6.24-rc8-rt1/arch/mips/kernel/scall64-o32.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/kernel/scall64-o32.S 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/kernel/scall64-o32.S 2008-01-16 22:28:41.000000000 -0500 @@ -98,7 +98,7 @@ NESTED(handle_sys, PT_SIZE, sp) 1: sd v0, PT_R2(sp) # result o32_syscall_exit: - local_irq_disable # make need_resched and + raw_local_irq_disable # make need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) Index: linux-2.6.24-rc8-rt1/arch/mips/kernel/semaphore.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/kernel/semaphore.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/kernel/semaphore.c 2008-01-16 22:28:41.000000000 -0500 @@ -36,7 +36,7 @@ * sem->count and sem->waking atomic. Scalability isn't an issue because * this lock is used on UP only so it's just an empty variable. */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -67,7 +67,7 @@ static inline int __sem_update_count(str : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) : "r" (incr), "m" (sem->count)); } else { - static DEFINE_SPINLOCK(semaphore_lock); + static DEFINE_RAW_SPINLOCK(semaphore_lock); unsigned long flags; spin_lock_irqsave(&semaphore_lock, flags); @@ -80,7 +80,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -94,7 +94,7 @@ void __up(struct semaphore *sem) wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -104,7 +104,7 @@ EXPORT_SYMBOL(__up); * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. */ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -133,9 +133,9 @@ void __sched __down(struct semaphore *se wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -165,4 +165,10 @@ int __sched __down_interruptible(struct return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); + +int fastcall compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} +EXPORT_SYMBOL(compat_sem_is_locked); Index: linux-2.6.24-rc8-rt1/arch/mips/kernel/signal.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/kernel/signal.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/kernel/signal.c 2008-01-16 22:28:41.000000000 -0500 @@ -629,6 +629,10 @@ static void do_signal(struct pt_regs *re siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode. Just return without doing anything Index: linux-2.6.24-rc8-rt1/arch/mips/kernel/signal32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/kernel/signal32.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/kernel/signal32.c 2008-01-16 22:28:41.000000000 -0500 @@ -655,6 +655,10 @@ static int setup_rt_frame_32(struct k_si if (err) goto give_sigsegv; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif /* * Arguments to signal handler: * Index: linux-2.6.24-rc8-rt1/arch/mips/kernel/smp.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/kernel/smp.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/kernel/smp.c 2008-01-16 22:28:41.000000000 -0500 @@ -91,7 +91,22 @@ asmlinkage __cpuinit void start_secondar cpu_idle(); } -DEFINE_SPINLOCK(smp_call_lock); +DEFINE_RAW_SPINLOCK(smp_call_lock); + +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them. + */ +void smp_send_reschedule_allbutself(void) +{ + int cpu = smp_processor_id(); + int i; + + for (i = 0; i < NR_CPUS; i++) + if (cpu_online(i) && i != cpu) + core_send_ipi(i, SMP_RESCHEDULE_YOURSELF); +} struct call_data_struct *call_data; @@ -314,6 +329,8 @@ int setup_profiling_timer(unsigned int m return 0; } +static DEFINE_RAW_SPINLOCK(tlbstate_lock); + static void flush_tlb_all_ipi(void *info) { local_flush_tlb_all(); @@ -371,6 +388,7 @@ static inline void smp_on_each_tlb(void void flush_tlb_mm(struct mm_struct *mm) { preempt_disable(); + spin_lock(&tlbstate_lock); if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { smp_on_other_tlbs(flush_tlb_mm_ipi, mm); @@ -383,6 +401,7 @@ void flush_tlb_mm(struct mm_struct *mm) if (cpu_context(cpu, mm)) cpu_context(cpu, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_mm(mm); preempt_enable(); @@ -406,6 +425,8 @@ void flush_tlb_range(struct vm_area_stru struct mm_struct *mm = vma->vm_mm; preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { struct flush_tlb_data fd = { .vma = vma, @@ -423,6 +444,7 @@ void flush_tlb_range(struct vm_area_stru if (cpu_context(cpu, mm)) cpu_context(cpu, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_range(vma, start, end); preempt_enable(); } @@ -454,6 +476,8 @@ static void flush_tlb_page_ipi(void *inf void flush_tlb_page(struct vm_area_struct *vma, unsigned long page) { preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&vma->vm_mm->mm_users) != 1) || (current->mm != vma->vm_mm)) { struct flush_tlb_data fd = { .vma = vma, @@ -470,6 +494,7 @@ void flush_tlb_page(struct vm_area_struc if (cpu_context(cpu, vma->vm_mm)) cpu_context(cpu, vma->vm_mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_page(vma, page); preempt_enable(); } Index: linux-2.6.24-rc8-rt1/arch/mips/kernel/traps.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/kernel/traps.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/kernel/traps.c 2008-01-16 22:28:41.000000000 -0500 @@ -320,7 +320,7 @@ void show_registers(const struct pt_regs printk("\n"); } -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); void __noreturn die(const char * str, const struct pt_regs * regs) { Index: linux-2.6.24-rc8-rt1/arch/mips/mm/init.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/mm/init.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/mm/init.c 2008-01-16 22:28:41.000000000 -0500 @@ -61,7 +61,7 @@ #endif /* CONFIG_MIPS_MT_SMTC */ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); /* * We have up to 8 empty zeroed pages so we can map one of the right colour Index: linux-2.6.24-rc8-rt1/arch/mips/sibyte/cfe/smp.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/sibyte/cfe/smp.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/sibyte/cfe/smp.c 2008-01-16 22:28:41.000000000 -0500 @@ -107,4 +107,8 @@ void __cpuinit prom_smp_finish(void) */ void prom_cpus_done(void) { +#ifdef CONFIG_HIGH_RES_TIMERS + extern void sync_c0_count_master(void); + sync_c0_count_master(); +#endif } Index: linux-2.6.24-rc8-rt1/arch/mips/sibyte/sb1250/irq.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/sibyte/sb1250/irq.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/sibyte/sb1250/irq.c 2008-01-16 22:28:41.000000000 -0500 @@ -82,7 +82,7 @@ static struct irq_chip sb1250_irq_type = /* Store the CPU id (not the logical number) */ int sb1250_irq_owner[SB1250_NR_IRQS]; -DEFINE_SPINLOCK(sb1250_imr_lock); +DEFINE_RAW_SPINLOCK(sb1250_imr_lock); void sb1250_mask_irq(int cpu, int irq) { @@ -316,6 +316,10 @@ void __init arch_init_irq(void) #ifdef CONFIG_KGDB imask |= STATUSF_IP6; #endif + +#ifdef CONFIG_HIGH_RES_TIMERS + imask |= STATUSF_IP7; +#endif /* Enable necessary IPs, disable the rest */ change_c0_status(ST0_IM, imask); Index: linux-2.6.24-rc8-rt1/arch/mips/sibyte/sb1250/smp.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/sibyte/sb1250/smp.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/sibyte/sb1250/smp.c 2008-01-16 22:28:41.000000000 -0500 @@ -60,7 +60,7 @@ void __cpuinit sb1250_smp_finish(void) extern void sb1250_clockevent_init(void); sb1250_clockevent_init(); - local_irq_enable(); + raw_local_irq_enable(); } /* Index: linux-2.6.24-rc8-rt1/arch/mips/sibyte/swarm/setup.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/mips/sibyte/swarm/setup.c 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/mips/sibyte/swarm/setup.c 2008-01-16 22:28:41.000000000 -0500 @@ -136,6 +136,12 @@ void __init plat_mem_setup(void) if (m41t81_probe()) swarm_rtc_type = RTC_M4LT81; +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * set the mips_hpt_frequency here + */ + mips_hpt_frequency = CONFIG_CPU_SPEED * 1000000; +#endif printk("This kernel optimized for " #ifdef CONFIG_SIMULATION "simulation" Index: linux-2.6.24-rc8-rt1/include/asm-mips/asmmacro.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-mips/asmmacro.h 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-mips/asmmacro.h 2008-01-16 22:28:41.000000000 -0500 @@ -21,7 +21,7 @@ #endif #ifdef CONFIG_MIPS_MT_SMTC - .macro local_irq_enable reg=t0 + .macro raw_local_irq_enable reg=t0 mfc0 \reg, CP0_TCSTATUS ori \reg, \reg, TCSTATUS_IXMT xori \reg, \reg, TCSTATUS_IXMT @@ -29,21 +29,21 @@ _ehb .endm - .macro local_irq_disable reg=t0 + .macro raw_local_irq_disable reg=t0 mfc0 \reg, CP0_TCSTATUS ori \reg, \reg, TCSTATUS_IXMT mtc0 \reg, CP0_TCSTATUS _ehb .endm #else - .macro local_irq_enable reg=t0 + .macro raw_local_irq_enable reg=t0 mfc0 \reg, CP0_STATUS ori \reg, \reg, 1 mtc0 \reg, CP0_STATUS irq_enable_hazard .endm - .macro local_irq_disable reg=t0 + .macro raw_local_irq_disable reg=t0 mfc0 \reg, CP0_STATUS ori \reg, \reg, 1 xori \reg, \reg, 1 Index: linux-2.6.24-rc8-rt1/include/asm-mips/bitops.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-mips/bitops.h 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-mips/bitops.h 2008-01-16 22:28:41.000000000 -0500 @@ -606,9 +606,6 @@ static inline unsigned long __ffs(unsign } /* - * fls - find last bit set. - * @word: The word to search - * * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ @@ -626,6 +623,8 @@ static inline int fls64(__u64 word) return 64 - word; } +#define __bi_local_irq_save(x) raw_local_irq_save(x) +#define __bi_local_irq_restore(x) raw_local_irq_restore(x) #else #include #endif Index: linux-2.6.24-rc8-rt1/include/asm-mips/hw_irq.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-mips/hw_irq.h 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-mips/hw_irq.h 2008-01-16 22:28:41.000000000 -0500 @@ -9,6 +9,7 @@ #define __ASM_HW_IRQ_H #include +#include extern atomic_t irq_err_count; Index: linux-2.6.24-rc8-rt1/include/asm-mips/i8259.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-mips/i8259.h 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-mips/i8259.h 2008-01-16 22:28:41.000000000 -0500 @@ -35,7 +35,7 @@ #define SLAVE_ICW4_DEFAULT 0x01 #define PIC_ICW4_AEOI 2 -extern spinlock_t i8259A_lock; +extern raw_spinlock_t i8259A_lock; extern int i8259A_irq_pending(unsigned int irq); extern void make_8259A_irq(unsigned int irq); Index: linux-2.6.24-rc8-rt1/include/asm-mips/io.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-mips/io.h 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-mips/io.h 2008-01-16 22:28:41.000000000 -0500 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include Index: linux-2.6.24-rc8-rt1/include/asm-mips/linkage.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-mips/linkage.h 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-mips/linkage.h 2008-01-16 22:28:41.000000000 -0500 @@ -3,6 +3,11 @@ #ifdef __ASSEMBLY__ #include + +/* FASTCALL stuff */ +#define FASTCALL(x) x +#define fastcall + #endif #define __weak __attribute__((weak)) Index: linux-2.6.24-rc8-rt1/include/asm-mips/m48t35.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-mips/m48t35.h 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-mips/m48t35.h 2008-01-16 22:28:41.000000000 -0500 @@ -6,7 +6,7 @@ #include -extern spinlock_t rtc_lock; +extern raw_spinlock_t rtc_lock; struct m48t35_rtc { volatile u8 pad[0x7ff8]; /* starts at 0x7ff8 */ Index: linux-2.6.24-rc8-rt1/include/asm-mips/rwsem.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/include/asm-mips/rwsem.h 2008-01-16 22:28:41.000000000 -0500 @@ -0,0 +1,176 @@ +/* + * include/asm-mips/rwsem.h: R/W semaphores for MIPS using the stuff + * in lib/rwsem.c. Adapted largely from include/asm-ppc/rwsem.h + * by john.cooper@timesys.com + */ + +#ifndef _MIPS_RWSEM_H +#define _MIPS_RWSEM_H + +#ifndef _LINUX_RWSEM_H +#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" +#endif + +#ifdef __KERNEL__ +#include +#include +#include +#include + +/* + * the semaphore definition + */ +struct compat_rw_semaphore { + /* XXX this should be able to be an atomic_t -- paulus */ + signed long count; +#define RWSEM_UNLOCKED_VALUE 0x00000000 +#define RWSEM_ACTIVE_BIAS 0x00000001 +#define RWSEM_ACTIVE_MASK 0x0000ffff +#define RWSEM_WAITING_BIAS (-0x00010000) +#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS +#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) + raw_spinlock_t wait_lock; + struct list_head wait_list; +#if RWSEM_DEBUG + int debug; +#endif +}; + +/* + * initialisation + */ +#if RWSEM_DEBUG +#define __RWSEM_DEBUG_INIT , 0 +#else +#define __RWSEM_DEBUG_INIT /* */ +#endif + +#define __COMPAT_RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ + LIST_HEAD_INIT((name).wait_list) \ + __RWSEM_DEBUG_INIT } + +#define COMPAT_DECLARE_RWSEM(name) \ + struct compat_rw_semaphore name = __COMPAT_RWSEM_INITIALIZER(name) + +extern struct compat_rw_semaphore *rwsem_down_read_failed(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_down_write_failed(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_wake(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_downgrade_wake(struct compat_rw_semaphore *sem); + +static inline void compat_init_rwsem(struct compat_rw_semaphore *sem) +{ + sem->count = RWSEM_UNLOCKED_VALUE; + spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +#if RWSEM_DEBUG + sem->debug = 0; +#endif +} + +/* + * lock for reading + */ +static inline void __down_read(struct compat_rw_semaphore *sem) +{ + if (atomic_inc_return((atomic_t *)(&sem->count)) > 0) + smp_wmb(); + else + rwsem_down_read_failed(sem); +} + +static inline int __down_read_trylock(struct compat_rw_semaphore *sem) +{ + int tmp; + + while ((tmp = sem->count) >= 0) { + if (tmp == cmpxchg(&sem->count, tmp, + tmp + RWSEM_ACTIVE_READ_BIAS)) { + smp_wmb(); + return 1; + } + } + return 0; +} + +/* + * lock for writing + */ +static inline void __down_write(struct compat_rw_semaphore *sem) +{ + int tmp; + + tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS, + (atomic_t *)(&sem->count)); + if (tmp == RWSEM_ACTIVE_WRITE_BIAS) + smp_wmb(); + else + rwsem_down_write_failed(sem); +} + +static inline int __down_write_trylock(struct compat_rw_semaphore *sem) +{ + int tmp; + + tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); + smp_wmb(); + return tmp == RWSEM_UNLOCKED_VALUE; +} + +/* + * unlock after reading + */ +static inline void __up_read(struct compat_rw_semaphore *sem) +{ + int tmp; + + smp_wmb(); + tmp = atomic_dec_return((atomic_t *)(&sem->count)); + if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0) + rwsem_wake(sem); +} + +/* + * unlock after writing + */ +static inline void __up_write(struct compat_rw_semaphore *sem) +{ + smp_wmb(); + if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, + (atomic_t *)(&sem->count)) < 0) + rwsem_wake(sem); +} + +/* + * implement atomic add functionality + */ +static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem) +{ + atomic_add(delta, (atomic_t *)(&sem->count)); +} + +/* + * downgrade write lock to read lock + */ +static inline void __downgrade_write(struct compat_rw_semaphore *sem) +{ + int tmp; + + smp_wmb(); + tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count)); + if (tmp < 0) + rwsem_downgrade_wake(sem); +} + +/* + * implement exchange and add functionality + */ +static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem) +{ + smp_mb(); + return atomic_add_return(delta, (atomic_t *)(&sem->count)); +} + +#endif /* __KERNEL__ */ +#endif /* _MIPS_RWSEM_H */ Index: linux-2.6.24-rc8-rt1/include/asm-mips/spinlock.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-mips/spinlock.h 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-mips/spinlock.h 2008-01-16 22:28:41.000000000 -0500 @@ -28,7 +28,7 @@ * We make no fairness assumptions. They have a cost. */ -static inline void __raw_spin_lock(raw_spinlock_t *lock) +static inline void __raw_spin_lock(__raw_spinlock_t *lock) { unsigned int tmp; @@ -70,7 +70,7 @@ static inline void __raw_spin_lock(raw_s smp_llsc_mb(); } -static inline void __raw_spin_unlock(raw_spinlock_t *lock) +static inline void __raw_spin_unlock(__raw_spinlock_t *lock) { smp_mb(); @@ -83,7 +83,7 @@ static inline void __raw_spin_unlock(raw : "memory"); } -static inline unsigned int __raw_spin_trylock(raw_spinlock_t *lock) +static inline unsigned int __raw_spin_trylock(__raw_spinlock_t *lock) { unsigned int temp, res; @@ -144,7 +144,7 @@ static inline unsigned int __raw_spin_tr */ #define __raw_write_can_lock(rw) (!(rw)->lock) -static inline void __raw_read_lock(raw_rwlock_t *rw) +static inline void __raw_read_lock(__raw_rwlock_t *rw) { unsigned int tmp; @@ -189,7 +189,7 @@ static inline void __raw_read_lock(raw_r /* Note the use of sub, not subu which will make the kernel die with an overflow exception if we ever try to unlock an rwlock that is already unlocked or is being held by a writer. */ -static inline void __raw_read_unlock(raw_rwlock_t *rw) +static inline void __raw_read_unlock(__raw_rwlock_t *rw) { unsigned int tmp; @@ -223,7 +223,7 @@ static inline void __raw_read_unlock(raw } } -static inline void __raw_write_lock(raw_rwlock_t *rw) +static inline void __raw_write_lock(__raw_rwlock_t *rw) { unsigned int tmp; @@ -265,7 +265,7 @@ static inline void __raw_write_lock(raw_ smp_llsc_mb(); } -static inline void __raw_write_unlock(raw_rwlock_t *rw) +static inline void __raw_write_unlock(__raw_rwlock_t *rw) { smp_mb(); @@ -277,7 +277,7 @@ static inline void __raw_write_unlock(ra : "memory"); } -static inline int __raw_read_trylock(raw_rwlock_t *rw) +static inline int __raw_read_trylock(__raw_rwlock_t *rw) { unsigned int tmp; int ret; @@ -321,7 +321,7 @@ static inline int __raw_read_trylock(raw return ret; } -static inline int __raw_write_trylock(raw_rwlock_t *rw) +static inline int __raw_write_trylock(__raw_rwlock_t *rw) { unsigned int tmp; int ret; Index: linux-2.6.24-rc8-rt1/include/asm-mips/spinlock_types.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-mips/spinlock_types.h 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-mips/spinlock_types.h 2008-01-16 22:28:41.000000000 -0500 @@ -7,13 +7,13 @@ typedef struct { volatile unsigned int lock; -} raw_spinlock_t; +} __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { 0 } typedef struct { volatile unsigned int lock; -} raw_rwlock_t; +} __raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { 0 } Index: linux-2.6.24-rc8-rt1/include/asm-mips/thread_info.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-mips/thread_info.h 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-mips/thread_info.h 2008-01-16 22:28:41.000000000 -0500 @@ -112,6 +112,7 @@ register struct thread_info *__current_t #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ #define TIF_SYSCALL_AUDIT 3 /* syscall auditing active */ #define TIF_SECCOMP 4 /* secure computing */ +#define TIF_NEED_RESCHED_DELAYED 6 /* reschedule on return to userspace */ #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ @@ -129,6 +130,7 @@ register struct thread_info *__current_t #define _TIF_NEED_RESCHED (1< #include -extern spinlock_t rtc_lock; - /* * RTC ops. By default, they point to weak no-op RTC functions. * rtc_mips_set_time - reverse the above translation and set time to RTC. Index: linux-2.6.24-rc8-rt1/include/asm-mips/timeofday.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/include/asm-mips/timeofday.h 2008-01-16 22:28:41.000000000 -0500 @@ -0,0 +1,5 @@ +#ifndef _ASM_MIPS_TIMEOFDAY_H +#define _ASM_MIPS_TIMEOFDAY_H +#include +#endif + Index: linux-2.6.24-rc8-rt1/include/asm-mips/uaccess.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-mips/uaccess.h 2008-01-16 22:27:34.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-mips/uaccess.h 2008-01-16 22:28:41.000000000 -0500 @@ -427,7 +427,6 @@ extern size_t __copy_user(void *__to, co const void *__cu_from; \ long __cu_len; \ \ - might_sleep(); \ __cu_to = (to); \ __cu_from = (from); \ __cu_len = (n); \ @@ -483,7 +482,6 @@ extern size_t __copy_user_inatomic(void const void *__cu_from; \ long __cu_len; \ \ - might_sleep(); \ __cu_to = (to); \ __cu_from = (from); \ __cu_len = (n); \ @@ -562,7 +560,6 @@ extern size_t __copy_user_inatomic(void const void __user *__cu_from; \ long __cu_len; \ \ - might_sleep(); \ __cu_to = (to); \ __cu_from = (from); \ __cu_len = (n); \ @@ -593,7 +590,6 @@ extern size_t __copy_user_inatomic(void const void __user *__cu_from; \ long __cu_len; \ \ - might_sleep(); \ __cu_to = (to); \ __cu_from = (from); \ __cu_len = (n); \ @@ -611,7 +607,6 @@ extern size_t __copy_user_inatomic(void const void __user *__cu_from; \ long __cu_len; \ \ - might_sleep(); \ __cu_to = (to); \ __cu_from = (from); \ __cu_len = (n); \ @@ -638,7 +633,6 @@ __clear_user(void __user *addr, __kernel { __kernel_size_t res; - might_sleep(); __asm__ __volatile__( "move\t$4, %1\n\t" "move\t$5, $0\n\t" @@ -687,7 +681,6 @@ __strncpy_from_user(char *__to, const ch { long res; - might_sleep(); __asm__ __volatile__( "move\t$4, %1\n\t" "move\t$5, %2\n\t" @@ -724,7 +717,6 @@ strncpy_from_user(char *__to, const char { long res; - might_sleep(); __asm__ __volatile__( "move\t$4, %1\n\t" "move\t$5, %2\n\t" @@ -743,7 +735,6 @@ static inline long __strlen_user(const c { long res; - might_sleep(); __asm__ __volatile__( "move\t$4, %1\n\t" __MODULE_JAL(__strlen_user_nocheck_asm) @@ -773,7 +764,6 @@ static inline long strlen_user(const cha { long res; - might_sleep(); __asm__ __volatile__( "move\t$4, %1\n\t" __MODULE_JAL(__strlen_user_asm) @@ -790,7 +780,6 @@ static inline long __strnlen_user(const { long res; - might_sleep(); __asm__ __volatile__( "move\t$4, %1\n\t" "move\t$5, %2\n\t" @@ -821,7 +810,6 @@ static inline long strnlen_user(const ch { long res; - might_sleep(); __asm__ __volatile__( "move\t$4, %1\n\t" "move\t$5, %2\n\t" Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/early_printk.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/early_printk.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/early_printk.c 2008-01-16 22:28:42.000000000 -0500 @@ -198,7 +198,7 @@ static int early_console_initialized = 0 void early_printk(const char *fmt, ...) { - char buf[512]; + static char buf[512]; int n; va_list ap; Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/signal_64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/signal_64.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/signal_64.c 2008-01-16 22:28:42.000000000 -0500 @@ -394,6 +394,13 @@ static void do_signal(struct pt_regs *re int signr; sigset_t *oldset; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/smp_64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/smp_64.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/smp_64.c 2008-01-16 22:29:12.000000000 -0500 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -56,7 +57,7 @@ union smp_flush_state { struct mm_struct *flush_mm; unsigned long flush_va; #define FLUSH_ALL -1ULL - spinlock_t tlbstate_lock; + raw_spinlock_t tlbstate_lock; }; char pad[SMP_CACHE_BYTES]; } ____cacheline_aligned; @@ -296,10 +297,28 @@ void smp_send_reschedule(int cpu) } /* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +void smp_send_reschedule_allbutself_cpumask(cpumask_t mask) +{ + cpu_clear(smp_processor_id(), mask); + cpus_and(mask, mask, cpu_online_map); + if (!cpus_empty(mask)) + send_IPI_mask(mask, RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); Index: linux-2.6.24-rc8-rt1/include/asm-x86/acpi_64.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/acpi_64.h 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/acpi_64.h 2008-01-16 22:28:42.000000000 -0500 @@ -51,8 +51,8 @@ #define ACPI_ASM_MACROS #define BREAKPOINT3 -#define ACPI_DISABLE_IRQS() local_irq_disable() -#define ACPI_ENABLE_IRQS() local_irq_enable() +#define ACPI_DISABLE_IRQS() local_irq_disable_nort() +#define ACPI_ENABLE_IRQS() local_irq_enable_nort() #define ACPI_FLUSH_CPU_CACHE() wbinvd() int __acpi_acquire_global_lock(unsigned int *lock); Index: linux-2.6.24-rc8-rt1/include/asm-x86/hw_irq_64.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/hw_irq_64.h 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/hw_irq_64.h 2008-01-16 22:28:42.000000000 -0500 @@ -118,7 +118,7 @@ void i8254_timer_resume(void); typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); extern void __setup_vector_irq(int cpu); -extern spinlock_t vector_lock; +extern raw_spinlock_t vector_lock; /* * Various low-level irq details needed by irq.c, process.c, Index: linux-2.6.24-rc8-rt1/include/asm-x86/io_apic_64.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/io_apic_64.h 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/io_apic_64.h 2008-01-16 22:28:42.000000000 -0500 @@ -131,7 +131,7 @@ extern int sis_apic_bug; /* dummy */ void enable_NMI_through_LVT0 (void * dummy); -extern spinlock_t i8259A_lock; +extern raw_spinlock_t i8259A_lock; extern int timer_over_8254; Index: linux-2.6.24-rc8-rt1/include/asm-x86/tlbflush_64.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/tlbflush_64.h 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/tlbflush_64.h 2008-01-16 22:28:42.000000000 -0500 @@ -8,14 +8,20 @@ static inline void __flush_tlb(void) { + preempt_disable(); write_cr3(read_cr3()); + preempt_enable(); } static inline void __flush_tlb_all(void) { - unsigned long cr4 = read_cr4(); + unsigned long cr4; + + preempt_disable(); + cr4 = read_cr4(); write_cr4(cr4 & ~X86_CR4_PGE); /* clear PGE */ write_cr4(cr4); /* write old PGE again and flush TLBs */ + preempt_enable(); } #define __flush_tlb_one(addr) \ Index: linux-2.6.24-rc8-rt1/arch/ia64/Kconfig =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/Kconfig 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/Kconfig 2008-01-16 22:28:42.000000000 -0500 @@ -44,6 +44,7 @@ config SWIOTLB config RWSEM_XCHGADD_ALGORITHM bool + depends on !PREEMPT_RT default y config ARCH_HAS_ILOG2_U32 @@ -280,6 +281,69 @@ config SMP If you don't know what to do here, say N. + +config GENERIC_TIME + bool + default y + +config HIGH_RES_TIMERS + bool "High-Resolution Timers" + help + + POSIX timers are available by default. This option enables + high-resolution POSIX timers. With this option the resolution + is at least 1 microsecond. High resolution is not free. If + enabled this option will add a small overhead each time a + timer expires that is not on a 1/HZ tick boundary. If no such + timers are used the overhead is nil. + + This option enables two additional POSIX CLOCKS, + CLOCK_REALTIME_HR and CLOCK_MONOTONIC_HR. Note that this + option does not change the resolution of CLOCK_REALTIME or + CLOCK_MONOTONIC which remain at 1/HZ resolution. + +config HIGH_RES_RESOLUTION + int "High-Resolution-Timer resolution (nanoseconds)" + depends on HIGH_RES_TIMERS + default 1000 + help + + This sets the resolution of timers accessed with + CLOCK_REALTIME_HR and CLOCK_MONOTONIC_HR. Too + fine a resolution (small a number) will usually not + be observable due to normal system latencies. For an + 800 MHZ processor about 10,000 is the recommended maximum + (smallest number). If you don't need that sort of resolution, + higher numbers may generate less overhead. + +choice + prompt "Clock source" + depends on HIGH_RES_TIMERS + default HIGH_RES_TIMER_ITC + help + This option allows you to choose the hardware source in charge + of generating high precision interruptions on your system. + On IA-64 these are: + + + ITC Interval Time Counter 1/CPU clock + HPET High Precision Event Timer ~ (XXX:have to check the spec) + + The ITC timer is available on all the ia64 computers because + it is integrated directly into the processor. However it may not + give correct results on MP machines with processors running + at different clock rates. In this case you may want to use + the HPET if available on your machine. + + +config HIGH_RES_TIMER_ITC + bool "Interval Time Counter/ITC" + +config HIGH_RES_TIMER_HPET + bool "High Precision Event Timer/HPET" + +endchoice + config NR_CPUS int "Maximum number of CPUs (2-1024)" range 2 1024 Index: linux-2.6.24-rc8-rt1/arch/ia64/kernel/asm-offsets.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/kernel/asm-offsets.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/kernel/asm-offsets.c 2008-01-16 22:28:42.000000000 -0500 @@ -257,6 +257,7 @@ void foo(void) offsetof (struct pal_min_state_area_s, pmsa_xip)); BLANK(); +#ifdef CONFIG_TIME_INTERPOLATION /* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */ DEFINE(IA64_GTOD_LOCK_OFFSET, offsetof (struct fsyscall_gtod_data_t, lock)); @@ -278,4 +279,5 @@ void foo(void) offsetof (struct itc_jitter_data_t, itc_jitter)); DEFINE(IA64_ITC_LASTCYCLE_OFFSET, offsetof (struct itc_jitter_data_t, itc_lastcycle)); +#endif } Index: linux-2.6.24-rc8-rt1/arch/ia64/kernel/entry.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/kernel/entry.S 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/kernel/entry.S 2008-01-16 22:28:42.000000000 -0500 @@ -1098,23 +1098,24 @@ skip_rbs_switch: st8 [r2]=r8 st8 [r3]=r10 .work_pending: - tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? + tbit.nz p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? +(p6) br.cond.sptk.few .needresched + tbit.z p6,p0=r31,TIF_NEED_RESCHED_DELAYED // current_thread_info()->need_resched_delayed==0? (p6) br.cond.sptk.few .notify -#ifdef CONFIG_PREEMPT -(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1 + +.needresched: + +(pKStk) br.cond.sptk.many .fromkernel ;; -(pKStk) st4 [r20]=r21 ssm psr.i // enable interrupts -#endif br.call.spnt.many rp=schedule -.ret9: cmp.eq p6,p0=r0,r0 // p6 <- 1 - rsm psr.i // disable interrupts - ;; -#ifdef CONFIG_PREEMPT -(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 +.ret9a: rsm psr.i // disable interrupts ;; -(pKStk) st4 [r20]=r0 // preempt_count() <- 0 -#endif + br.cond.sptk.many .endpreemptdep +.fromkernel: + br.call.spnt.many rp=preempt_schedule_irq +.ret9b: rsm psr.i // disable interrupts +.endpreemptdep: (pLvSys)br.cond.sptk.few .work_pending_syscall_end br.cond.sptk.many .work_processed_kernel // re-check Index: linux-2.6.24-rc8-rt1/arch/ia64/kernel/fsys.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/kernel/fsys.S 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/kernel/fsys.S 2008-01-16 22:28:42.000000000 -0500 @@ -26,6 +26,7 @@ #include "entry.h" +#ifdef CONFIG_TIME_INTERPOLATION /* * See Documentation/ia64/fsys.txt for details on fsyscalls. * @@ -349,6 +350,26 @@ ENTRY(fsys_clock_gettime) br.many .gettime END(fsys_clock_gettime) + +#else // !CONFIG_TIME_INTERPOLATION + +# define fsys_gettimeofday 0 +# define fsys_clock_gettime 0 + +.fail_einval: + mov r8 = EINVAL + mov r10 = -1 + FSYS_RETURN + +.fail_efault: + mov r8 = EFAULT + mov r10 = -1 + FSYS_RETURN + +#endif + + + /* * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize). */ Index: linux-2.6.24-rc8-rt1/arch/ia64/kernel/iosapic.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/kernel/iosapic.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/kernel/iosapic.c 2008-01-16 22:28:42.000000000 -0500 @@ -111,7 +111,7 @@ (PAGE_SIZE / sizeof(struct iosapic_rte_info)) #define RTE_PREALLOCATED (1) -static DEFINE_SPINLOCK(iosapic_lock); +static DEFINE_RAW_SPINLOCK(iosapic_lock); /* * These tables map IA-64 vectors to the IOSAPIC pin that generates this @@ -390,6 +390,34 @@ iosapic_startup_level_irq (unsigned int return 0; } +/* + * In the preemptible case mask the IRQ first then handle it and ack it. + */ +#ifdef CONFIG_PREEMPT_HARDIRQS + +static void +iosapic_ack_level_irq (unsigned int irq) +{ + ia64_vector vec = irq_to_vector(irq); + struct iosapic_rte_info *rte; + + move_irq(irq); + mask_irq(irq); + list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list) + iosapic_eoi(rte->addr, vec); +} + +static void +iosapic_end_level_irq (unsigned int irq) +{ + if (!(irq_desc[irq].status & IRQ_INPROGRESS)) + unmask_irq(irq); +} + +#else /* !CONFIG_PREEMPT_HARDIRQS */ + +#define iosapic_ack_level_irq nop + static void iosapic_end_level_irq (unsigned int irq) { @@ -411,10 +439,11 @@ iosapic_end_level_irq (unsigned int irq) } } +#endif + #define iosapic_shutdown_level_irq mask_irq #define iosapic_enable_level_irq unmask_irq #define iosapic_disable_level_irq mask_irq -#define iosapic_ack_level_irq nop static struct irq_chip irq_type_iosapic_level = { .name = "IO-SAPIC-level", Index: linux-2.6.24-rc8-rt1/arch/ia64/kernel/mca.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/kernel/mca.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/kernel/mca.c 2008-01-16 22:28:42.000000000 -0500 @@ -323,7 +323,7 @@ ia64_mca_spin(const char *func) typedef struct ia64_state_log_s { - spinlock_t isl_lock; + raw_spinlock_t isl_lock; int isl_index; unsigned long isl_count; ia64_err_rec_t *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */ Index: linux-2.6.24-rc8-rt1/arch/ia64/kernel/perfmon.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/kernel/perfmon.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/kernel/perfmon.c 2008-01-16 22:28:42.000000000 -0500 @@ -280,7 +280,7 @@ typedef struct { */ typedef struct pfm_context { - spinlock_t ctx_lock; /* context protection */ + raw_spinlock_t ctx_lock; /* context protection */ pfm_context_flags_t ctx_flags; /* bitmask of flags (block reason incl.) */ unsigned int ctx_state; /* state: active/inactive (no bitfield) */ @@ -369,7 +369,7 @@ typedef struct pfm_context { * mostly used to synchronize between system wide and per-process */ typedef struct { - spinlock_t pfs_lock; /* lock the structure */ + raw_spinlock_t pfs_lock; /* lock the structure */ unsigned int pfs_task_sessions; /* number of per task sessions */ unsigned int pfs_sys_sessions; /* number of per system wide sessions */ @@ -510,7 +510,7 @@ static pfm_intr_handler_desc_t *pfm_alt static struct proc_dir_entry *perfmon_dir; static pfm_uuid_t pfm_null_uuid = {0,}; -static spinlock_t pfm_buffer_fmt_lock; +static raw_spinlock_t pfm_buffer_fmt_lock; static LIST_HEAD(pfm_buffer_fmt_list); static pmu_config_t *pmu_conf; Index: linux-2.6.24-rc8-rt1/arch/ia64/kernel/process.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/kernel/process.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/kernel/process.c 2008-01-16 22:28:42.000000000 -0500 @@ -95,6 +95,9 @@ show_stack (struct task_struct *task, un void dump_stack (void) { + if (irqs_disabled()) { + printk("Uh oh.. entering dump_stack() with irqs disabled.\n"); + } show_stack(NULL, NULL); } @@ -200,7 +203,7 @@ void default_idle (void) { local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { if (can_do_pal_halt) { local_irq_disable(); if (!need_resched()) { @@ -288,7 +291,7 @@ cpu_idle (void) current_thread_info()->status |= TS_POLLING; } - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { void (*idle)(void); #ifdef CONFIG_SMP min_xtp(); @@ -310,10 +313,11 @@ cpu_idle (void) normal_xtp(); #endif } - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); + preempt_disable(); - check_pgt_cache(); + if (cpu_is_offline(cpu)) play_dead(); } Index: linux-2.6.24-rc8-rt1/arch/ia64/kernel/sal.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/kernel/sal.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/kernel/sal.c 2008-01-16 22:28:42.000000000 -0500 @@ -18,7 +18,7 @@ #include #include - __cacheline_aligned DEFINE_SPINLOCK(sal_lock); + __cacheline_aligned DEFINE_RAW_SPINLOCK(sal_lock); unsigned long sal_platform_features; unsigned short sal_revision; Index: linux-2.6.24-rc8-rt1/arch/ia64/kernel/salinfo.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/kernel/salinfo.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/kernel/salinfo.c 2008-01-16 22:28:42.000000000 -0500 @@ -140,7 +140,7 @@ enum salinfo_state { struct salinfo_data { cpumask_t cpu_event; /* which cpus have outstanding events */ - struct semaphore mutex; + struct compat_semaphore mutex; u8 *log_buffer; u64 log_size; u8 *oemdata; /* decoded oem data */ @@ -156,8 +156,8 @@ struct salinfo_data { static struct salinfo_data salinfo_data[ARRAY_SIZE(salinfo_log_name)]; -static DEFINE_SPINLOCK(data_lock); -static DEFINE_SPINLOCK(data_saved_lock); +static DEFINE_RAW_SPINLOCK(data_lock); +static DEFINE_RAW_SPINLOCK(data_saved_lock); /** salinfo_platform_oemdata - optional callback to decode oemdata from an error * record. Index: linux-2.6.24-rc8-rt1/arch/ia64/kernel/semaphore.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/kernel/semaphore.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/kernel/semaphore.c 2008-01-16 22:28:42.000000000 -0500 @@ -40,12 +40,12 @@ */ void -__up (struct semaphore *sem) +__up (struct compat_semaphore *sem) { wake_up(&sem->wait); } -void __sched __down (struct semaphore *sem) +void __sched __down (struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -82,7 +82,7 @@ void __sched __down (struct semaphore *s tsk->state = TASK_RUNNING; } -int __sched __down_interruptible (struct semaphore * sem) +int __sched __down_interruptible (struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -142,7 +142,7 @@ int __sched __down_interruptible (struct * count. */ int -__down_trylock (struct semaphore *sem) +__down_trylock (struct compat_semaphore *sem) { unsigned long flags; int sleepers; Index: linux-2.6.24-rc8-rt1/arch/ia64/kernel/signal.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/kernel/signal.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/kernel/signal.c 2008-01-16 22:28:42.000000000 -0500 @@ -438,6 +438,14 @@ ia64_do_signal (struct sigscratch *scr, long errno = scr->pt.r8; # define ERR_CODE(c) (IS_IA32_PROCESS(&scr->pt) ? -(c) : (c)) +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif + /* * In the ia64_leave_kernel code path, we want the common case to go fast, which * is why we may in certain cases get here from kernel mode. Just return without Index: linux-2.6.24-rc8-rt1/arch/ia64/kernel/smp.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/kernel/smp.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/kernel/smp.c 2008-01-16 22:28:42.000000000 -0500 @@ -261,6 +261,22 @@ smp_send_reschedule (int cpu) } /* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != smp_processor_id()) + platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, + IA64_IPI_DM_INT, 0); + } +} + +/* * Called with preemption disabled. */ static void Index: linux-2.6.24-rc8-rt1/arch/ia64/kernel/smpboot.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/kernel/smpboot.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/kernel/smpboot.c 2008-01-16 22:28:42.000000000 -0500 @@ -372,6 +372,8 @@ smp_setup_percpu_timer (void) { } +extern void register_itc_clockevent(void); + static void __cpuinit smp_callin (void) { @@ -450,6 +452,7 @@ smp_callin (void) #ifdef CONFIG_IA32_SUPPORT ia32_gdt_init(); #endif + register_itc_clockevent(); /* * Allow the master to continue. Index: linux-2.6.24-rc8-rt1/arch/ia64/kernel/time.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/kernel/time.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/kernel/time.c 2008-01-16 22:28:42.000000000 -0500 @@ -70,6 +70,7 @@ timer_interrupt (int irq, void *dev_id) platform_timer_interrupt(irq, dev_id); +#if 0 new_itm = local_cpu_data->itm_next; if (!time_after(ia64_get_itc(), new_itm)) @@ -77,29 +78,48 @@ timer_interrupt (int irq, void *dev_id) ia64_get_itc(), new_itm); profile_tick(CPU_PROFILING); +#endif + + if (time_after(ia64_get_itc(), local_cpu_data->itm_tick_next)) { - while (1) { - update_process_times(user_mode(get_irq_regs())); + unsigned long new_tick_itm; + new_tick_itm = local_cpu_data->itm_tick_next; - new_itm += local_cpu_data->itm_delta; + profile_tick(CPU_PROFILING, get_irq_regs()); - if (smp_processor_id() == time_keeper_id) { - /* - * Here we are in the timer irq handler. We have irqs locally - * disabled, but we don't know if the timer_bh is running on - * another CPU. We need to avoid to SMP race by acquiring the - * xtime_lock. - */ - write_seqlock(&xtime_lock); - do_timer(1); - local_cpu_data->itm_next = new_itm; - write_sequnlock(&xtime_lock); - } else - local_cpu_data->itm_next = new_itm; + while (1) { + update_process_times(user_mode(get_irq_regs())); + + new_tick_itm += local_cpu_data->itm_tick_delta; + + if (smp_processor_id() == time_keeper_id) { + /* + * Here we are in the timer irq handler. We have irqs locally + * disabled, but we don't know if the timer_bh is running on + * another CPU. We need to avoid to SMP race by acquiring the + * xtime_lock. + */ + write_seqlock(&xtime_lock); + do_timer(get_irq_regs()); + local_cpu_data->itm_tick_next = new_tick_itm; + write_sequnlock(&xtime_lock); + } else + local_cpu_data->itm_tick_next = new_tick_itm; + + if (time_after(new_tick_itm, ia64_get_itc())) + break; + } + } - if (time_after(new_itm, ia64_get_itc())) - break; + if (time_after(ia64_get_itc(), local_cpu_data->itm_timer_next)) { + if (itc_clockevent.event_handler) + itc_clockevent.event_handler(get_irq_regs()); + // FIXME, really, please + new_itm = local_cpu_data->itm_tick_next; + + if (time_after(new_itm, local_cpu_data->itm_timer_next)) + new_itm = local_cpu_data->itm_timer_next; /* * Allow IPIs to interrupt the timer loop. */ @@ -117,8 +137,8 @@ timer_interrupt (int irq, void *dev_id) * too fast (with the potentially devastating effect * of losing monotony of time). */ - while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_delta/2)) - new_itm += local_cpu_data->itm_delta; + while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_tick_delta/2)) + new_itm += local_cpu_data->itm_tick_delta; ia64_set_itm(new_itm); /* double check, in case we got hit by a (slow) PMI: */ } while (time_after_eq(ia64_get_itc(), new_itm)); @@ -137,7 +157,7 @@ ia64_cpu_local_tick (void) /* arrange for the cycle counter to generate a timer interrupt: */ ia64_set_itv(IA64_TIMER_VECTOR); - delta = local_cpu_data->itm_delta; + delta = local_cpu_data->itm_tick_delta; /* * Stagger the timer tick for each CPU so they don't occur all at (almost) the * same time: @@ -146,8 +166,8 @@ ia64_cpu_local_tick (void) unsigned long hi = 1UL << ia64_fls(cpu); shift = (2*(cpu - hi) + 1) * delta/hi/2; } - local_cpu_data->itm_next = ia64_get_itc() + delta + shift; - ia64_set_itm(local_cpu_data->itm_next); + local_cpu_data->itm_tick_next = ia64_get_itc() + delta + shift; + ia64_set_itm(local_cpu_data->itm_tick_next); } static int nojitter; @@ -205,7 +225,7 @@ ia64_init_itm (void) itc_freq = (platform_base_freq*itc_ratio.num)/itc_ratio.den; - local_cpu_data->itm_delta = (itc_freq + HZ/2) / HZ; + local_cpu_data->itm_tick_delta = (itc_freq + HZ/2) / HZ; printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%u/%u, " "ITC freq=%lu.%03luMHz", smp_processor_id(), platform_base_freq / 1000000, (platform_base_freq / 1000) % 1000, @@ -225,6 +245,7 @@ ia64_init_itm (void) local_cpu_data->nsec_per_cyc = ((NSEC_PER_SEC<mfh = 1; } - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } static inline int Index: linux-2.6.24-rc8-rt1/arch/ia64/kernel/unwind.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/kernel/unwind.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/kernel/unwind.c 2008-01-16 22:28:42.000000000 -0500 @@ -82,7 +82,7 @@ typedef unsigned long unw_word; typedef unsigned char unw_hash_index_t; static struct { - spinlock_t lock; /* spinlock for unwind data */ + raw_spinlock_t lock; /* spinlock for unwind data */ /* list of unwind tables (one per load-module) */ struct unw_table *tables; @@ -146,7 +146,7 @@ static struct { # endif } unw = { .tables = &unw.kernel_table, - .lock = __SPIN_LOCK_UNLOCKED(unw.lock), + .lock = RAW_SPIN_LOCK_UNLOCKED(unw.lock), .save_order = { UNW_REG_RP, UNW_REG_PFS, UNW_REG_PSP, UNW_REG_PR, UNW_REG_UNAT, UNW_REG_LC, UNW_REG_FPSR, UNW_REG_PRI_UNAT_GR Index: linux-2.6.24-rc8-rt1/arch/ia64/kernel/unwind_i.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/kernel/unwind_i.h 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/kernel/unwind_i.h 2008-01-16 22:28:42.000000000 -0500 @@ -154,7 +154,7 @@ struct unw_script { unsigned long ip; /* ip this script is for */ unsigned long pr_mask; /* mask of predicates script depends on */ unsigned long pr_val; /* predicate values this script is for */ - rwlock_t lock; + raw_rwlock_t lock; unsigned int flags; /* see UNW_FLAG_* in unwind.h */ unsigned short lru_chain; /* used for least-recently-used chain */ unsigned short coll_chain; /* used for hash collisions */ Index: linux-2.6.24-rc8-rt1/arch/ia64/mm/init.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/mm/init.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/mm/init.c 2008-01-16 22:28:42.000000000 -0500 @@ -37,7 +37,7 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); extern void ia64_tlb_init (void); Index: linux-2.6.24-rc8-rt1/arch/ia64/mm/tlb.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ia64/mm/tlb.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ia64/mm/tlb.c 2008-01-16 22:28:42.000000000 -0500 @@ -33,7 +33,7 @@ static struct { } purge; struct ia64_ctx ia64_ctx = { - .lock = __SPIN_LOCK_UNLOCKED(ia64_ctx.lock), + .lock = RAW_SPIN_LOCK_UNLOCKED(ia64_ctx.lock), .next = 1, .max_ctx = ~0U }; Index: linux-2.6.24-rc8-rt1/include/asm-ia64/irqflags.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24-rc8-rt1/include/asm-ia64/irqflags.h 2008-01-16 22:28:42.000000000 -0500 @@ -0,0 +1,95 @@ + +/* + * include/asm-i64/irqflags.h + * + * IRQ flags handling + * + * This file gets included from lowlevel asm headers too, to provide + * wrapped versions of the local_irq_*() APIs, based on the + * raw_local_irq_*() macros from the lowlevel headers. + */ +#ifndef _ASM_IRQFLAGS_H +#define _ASM_IRQFLAGS_H + +/* For spinlocks etc */ + +/* + * - clearing psr.i is implicitly serialized (visible by next insn) + * - setting psr.i requires data serialization + * - we need a stop-bit before reading PSR because we sometimes + * write a floating-point register right before reading the PSR + * and that writes to PSR.mfl + */ +#define __local_irq_save(x) \ +do { \ + ia64_stop(); \ + (x) = ia64_getreg(_IA64_REG_PSR); \ + ia64_stop(); \ + ia64_rsm(IA64_PSR_I); \ +} while (0) + +#define __local_irq_disable() \ +do { \ + ia64_stop(); \ + ia64_rsm(IA64_PSR_I); \ +} while (0) + +#define __local_irq_restore(x) ia64_intrin_local_irq_restore((x) & IA64_PSR_I) + +#ifdef CONFIG_IA64_DEBUG_IRQ + + extern unsigned long last_cli_ip; + +# define __save_ip() last_cli_ip = ia64_getreg(_IA64_REG_IP) + +# define raw_local_irq_save(x) \ +do { \ + unsigned long psr; \ + \ + __local_irq_save(psr); \ + if (psr & IA64_PSR_I) \ + __save_ip(); \ + (x) = psr; \ +} while (0) + +# define raw_local_irq_disable() do { unsigned long x; local_irq_save(x); } while (0) + +# define raw_local_irq_restore(x) \ +do { \ + unsigned long old_psr, psr = (x); \ + \ + local_save_flags(old_psr); \ + __local_irq_restore(psr); \ + if ((old_psr & IA64_PSR_I) && !(psr & IA64_PSR_I)) \ + __save_ip(); \ +} while (0) + +#else /* !CONFIG_IA64_DEBUG_IRQ */ +# define raw_local_irq_save(x) __local_irq_save(x) +# define raw_local_irq_disable() __local_irq_disable() +# define raw_local_irq_restore(x) __local_irq_restore(x) +#endif /* !CONFIG_IA64_DEBUG_IRQ */ + +#define raw_local_irq_enable() ({ ia64_stop(); ia64_ssm(IA64_PSR_I); ia64_srlz_d(); }) +#define raw_local_save_flags(flags) ({ ia64_stop(); (flags) = ia64_getreg(_IA64_REG_PSR); }) + +#define raw_irqs_disabled() \ +({ \ + unsigned long __ia64_id_flags; \ + local_save_flags(__ia64_id_flags); \ + (__ia64_id_flags & IA64_PSR_I) == 0; \ +}) + +#define raw_irqs_disabled_flags(flags) ((flags & IA64_PSR_I) == 0) + + +#define raw_safe_halt() ia64_pal_halt_light() /* PAL_HALT_LIGHT */ + +/* TBD... */ +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +# define TRACE_IRQS_ON_STR +# define TRACE_IRQS_OFF_STR + +#endif + Index: linux-2.6.24-rc8-rt1/include/asm-ia64/mmu_context.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-ia64/mmu_context.h 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-ia64/mmu_context.h 2008-01-16 22:28:42.000000000 -0500 @@ -32,7 +32,7 @@ #include struct ia64_ctx { - spinlock_t lock; + raw_spinlock_t lock; unsigned int next; /* next context number to use */ unsigned int limit; /* available free range */ unsigned int max_ctx; /* max. context value supported by all CPUs */ Index: linux-2.6.24-rc8-rt1/include/asm-ia64/percpu.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-ia64/percpu.h 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-ia64/percpu.h 2008-01-16 22:28:42.000000000 -0500 @@ -24,10 +24,17 @@ #define DECLARE_PER_CPU(type, name) \ extern __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name +#define DECLARE_PER_CPU_LOCKED(type, name) \ + extern spinlock_t per_cpu_lock__##name##_locked; \ + extern __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name##_locked + /* Separate out the type, so (int[3], foo) works. */ #define DEFINE_PER_CPU(type, name) \ - __attribute__((__section__(".data.percpu"))) \ - __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name + __attribute__((__section__(".data.percpu"))) __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name + +#define DEFINE_PER_CPU_LOCKED(type, name) \ + __attribute__((__section__(".data.percpu"))) __SMALL_ADDR_AREA __DEFINE_SPINLOCK(per_cpu_lock__##name##_locked); \ + __attribute__((__section__(".data.percpu"))) __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name##_locked #ifdef CONFIG_SMP #define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ @@ -55,6 +62,16 @@ DECLARE_PER_CPU(unsigned long, local_per #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __ia64_per_cpu_var(local_per_cpu_offset))) #define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __ia64_per_cpu_var(local_per_cpu_offset))) +#define per_cpu_lock(var, cpu) \ + (*RELOC_HIDE(&per_cpu_lock__##var##_locked, __per_cpu_offset[cpu])) +#define per_cpu_var_locked(var, cpu) \ + (*RELOC_HIDE(&per_cpu__##var##_locked, __per_cpu_offset[cpu])) +#define __get_cpu_lock(var, cpu) \ + per_cpu_lock(var, cpu) +#define __get_cpu_var_locked(var, cpu) \ + per_cpu_var_locked(var, cpu) + + extern void percpu_modcopy(void *pcpudst, const void *src, unsigned long size); extern void setup_per_cpu_areas (void); extern void *per_cpu_init(void); Index: linux-2.6.24-rc8-rt1/include/asm-ia64/processor.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-ia64/processor.h 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-ia64/processor.h 2008-01-16 22:28:42.000000000 -0500 @@ -124,8 +124,10 @@ struct ia64_psr { */ struct cpuinfo_ia64 { __u32 softirq_pending; - __u64 itm_delta; /* # of clock cycles between clock ticks */ - __u64 itm_next; /* interval timer mask value to use for next clock tick */ + __u64 itm_tick_delta; /* # of clock cycles between clock ticks */ + __u64 itm_tick_next; /* interval timer mask value to use for next clock tick */ + __u64 itm_timer_next; + __u64 __itm_next; __u64 nsec_per_cyc; /* (1000000000<count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); @@ -70,7 +70,7 @@ init_rwsem (struct rw_semaphore *sem) * lock for reading */ static inline void -__down_read (struct rw_semaphore *sem) +__down_read (struct compat_rw_semaphore *sem) { long result = ia64_fetchadd8_acq((unsigned long *)&sem->count, 1); @@ -82,7 +82,7 @@ __down_read (struct rw_semaphore *sem) * lock for writing */ static inline void -__down_write (struct rw_semaphore *sem) +__down_write (struct compat_rw_semaphore *sem) { long old, new; @@ -99,7 +99,7 @@ __down_write (struct rw_semaphore *sem) * unlock after reading */ static inline void -__up_read (struct rw_semaphore *sem) +__up_read (struct compat_rw_semaphore *sem) { long result = ia64_fetchadd8_rel((unsigned long *)&sem->count, -1); @@ -111,7 +111,7 @@ __up_read (struct rw_semaphore *sem) * unlock after writing */ static inline void -__up_write (struct rw_semaphore *sem) +__up_write (struct compat_rw_semaphore *sem) { long old, new; @@ -128,7 +128,7 @@ __up_write (struct rw_semaphore *sem) * trylock for reading -- returns 1 if successful, 0 if contention */ static inline int -__down_read_trylock (struct rw_semaphore *sem) +__down_read_trylock (struct compat_rw_semaphore *sem) { long tmp; while ((tmp = sem->count) >= 0) { @@ -143,7 +143,7 @@ __down_read_trylock (struct rw_semaphore * trylock for writing -- returns 1 if successful, 0 if contention */ static inline int -__down_write_trylock (struct rw_semaphore *sem) +__down_write_trylock (struct compat_rw_semaphore *sem) { long tmp = cmpxchg_acq(&sem->count, RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); @@ -154,7 +154,7 @@ __down_write_trylock (struct rw_semaphor * downgrade write lock to read lock */ static inline void -__downgrade_write (struct rw_semaphore *sem) +__downgrade_write (struct compat_rw_semaphore *sem) { long old, new; @@ -174,7 +174,7 @@ __downgrade_write (struct rw_semaphore * #define rwsem_atomic_add(delta, sem) atomic64_add(delta, (atomic64_t *)(&(sem)->count)) #define rwsem_atomic_update(delta, sem) atomic64_add_return(delta, (atomic64_t *)(&(sem)->count)) -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int compat_rwsem_is_locked(struct compat_rw_semaphore *sem) { return (sem->count != 0); } Index: linux-2.6.24-rc8-rt1/include/asm-ia64/sal.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-ia64/sal.h 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-ia64/sal.h 2008-01-16 22:28:42.000000000 -0500 @@ -43,7 +43,7 @@ #include #include -extern spinlock_t sal_lock; +extern raw_spinlock_t sal_lock; /* SAL spec _requires_ eight args for each call. */ #define __IA64_FW_CALL(entry,result,a0,a1,a2,a3,a4,a5,a6,a7) \ Index: linux-2.6.24-rc8-rt1/include/asm-ia64/semaphore.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-ia64/semaphore.h 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-ia64/semaphore.h 2008-01-16 22:28:42.000000000 -0500 @@ -11,53 +11,64 @@ #include -struct semaphore { +/* + * On !PREEMPT_RT all semaphores are compat: + */ +#ifndef CONFIG_PREEMPT_RT +# define compat_semaphore semaphore +#endif + +struct compat_semaphore { atomic_t count; int sleepers; wait_queue_head_t wait; }; -#define __SEMAPHORE_INITIALIZER(name, n) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .sleepers = 0, \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ } -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name, count) +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name, count) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name, 1) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 1) + +#define compat_sema_count(sem) atomic_read(&(sem)->count) + +asmlinkage int compat_sem_is_locked(struct compat_semaphore *sem); static inline void -sema_init (struct semaphore *sem, int val) +compat_sema_init (struct compat_semaphore *sem, int val) { - *sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val); + *sem = (struct compat_semaphore) __COMPAT_SEMAPHORE_INITIALIZER(*sem, val); } static inline void -init_MUTEX (struct semaphore *sem) +compat_init_MUTEX (struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 1); } static inline void -init_MUTEX_LOCKED (struct semaphore *sem) +compat_init_MUTEX_LOCKED (struct compat_semaphore *sem) { - sema_init(sem, 0); + compat_sema_init(sem, 0); } -extern void __down (struct semaphore * sem); -extern int __down_interruptible (struct semaphore * sem); -extern int __down_trylock (struct semaphore * sem); -extern void __up (struct semaphore * sem); +extern void __down (struct compat_semaphore * sem); +extern int __down_interruptible (struct compat_semaphore * sem); +extern int __down_trylock (struct compat_semaphore * sem); +extern void __up (struct compat_semaphore * sem); /* * Atomically decrement the semaphore's count. If it goes negative, * block the calling thread in the TASK_UNINTERRUPTIBLE state. */ static inline void -down (struct semaphore *sem) +compat_down (struct compat_semaphore *sem) { might_sleep(); if (ia64_fetchadd(-1, &sem->count.counter, acq) < 1) @@ -69,7 +80,7 @@ down (struct semaphore *sem) * block the calling thread in the TASK_INTERRUPTIBLE state. */ static inline int -down_interruptible (struct semaphore * sem) +compat_down_interruptible (struct compat_semaphore * sem) { int ret = 0; @@ -80,7 +91,7 @@ down_interruptible (struct semaphore * s } static inline int -down_trylock (struct semaphore *sem) +compat_down_trylock (struct compat_semaphore *sem) { int ret = 0; @@ -90,10 +101,12 @@ down_trylock (struct semaphore *sem) } static inline void -up (struct semaphore * sem) +compat_up (struct compat_semaphore * sem) { if (ia64_fetchadd(1, &sem->count.counter, rel) <= -1) __up(sem); } +#include + #endif /* _ASM_IA64_SEMAPHORE_H */ Index: linux-2.6.24-rc8-rt1/include/asm-ia64/spinlock.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-ia64/spinlock.h 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-ia64/spinlock.h 2008-01-16 22:28:42.000000000 -0500 @@ -17,8 +17,6 @@ #include #include -#define __raw_spin_lock_init(x) ((x)->lock = 0) - #ifdef ASM_SUPPORTED /* * Try to get the lock. If we fail to get the lock, make a non-standard call to @@ -30,7 +28,7 @@ #define IA64_SPINLOCK_CLOBBERS "ar.ccv", "ar.pfs", "p14", "p15", "r27", "r28", "r29", "r30", "b6", "memory" static inline void -__raw_spin_lock_flags (raw_spinlock_t *lock, unsigned long flags) +__raw_spin_lock_flags (__raw_spinlock_t *lock, unsigned long flags) { register volatile unsigned int *ptr asm ("r31") = &lock->lock; @@ -89,7 +87,7 @@ __raw_spin_lock_flags (raw_spinlock_t *l #define __raw_spin_lock(lock) __raw_spin_lock_flags(lock, 0) /* Unlock by doing an ordered store and releasing the cacheline with nta */ -static inline void __raw_spin_unlock(raw_spinlock_t *x) { +static inline void __raw_spin_unlock(__raw_spinlock_t *x) { barrier(); asm volatile ("st4.rel.nta [%0] = r0\n\t" :: "r"(x)); } @@ -109,7 +107,7 @@ do { \ } while (ia64_spinlock_val); \ } \ } while (0) -#define __raw_spin_unlock(x) do { barrier(); ((raw_spinlock_t *) x)->lock = 0; } while (0) +#define __raw_spin_unlock(x) do { barrier(); ((__raw_spinlock_t *) x)->lock = 0; } while (0) #endif /* !ASM_SUPPORTED */ #define __raw_spin_is_locked(x) ((x)->lock != 0) @@ -122,7 +120,7 @@ do { \ #define __raw_read_lock(rw) \ do { \ - raw_rwlock_t *__read_lock_ptr = (rw); \ + __raw_rwlock_t *__read_lock_ptr = (rw); \ \ while (unlikely(ia64_fetchadd(1, (int *) __read_lock_ptr, acq) < 0)) { \ ia64_fetchadd(-1, (int *) __read_lock_ptr, rel); \ @@ -133,7 +131,7 @@ do { \ #define __raw_read_unlock(rw) \ do { \ - raw_rwlock_t *__read_lock_ptr = (rw); \ + __raw_rwlock_t *__read_lock_ptr = (rw); \ ia64_fetchadd(-1, (int *) __read_lock_ptr, rel); \ } while (0) @@ -165,7 +163,7 @@ do { \ (result == 0); \ }) -static inline void __raw_write_unlock(raw_rwlock_t *x) +static inline void __raw_write_unlock(__raw_rwlock_t *x) { u8 *y = (u8 *)x; barrier(); @@ -193,7 +191,7 @@ static inline void __raw_write_unlock(ra (ia64_val == 0); \ }) -static inline void __raw_write_unlock(raw_rwlock_t *x) +static inline void __raw_write_unlock(__raw_rwlock_t *x) { barrier(); x->write_lock = 0; @@ -201,10 +199,10 @@ static inline void __raw_write_unlock(ra #endif /* !ASM_SUPPORTED */ -static inline int __raw_read_trylock(raw_rwlock_t *x) +static inline int __raw_read_trylock(__raw_rwlock_t *x) { union { - raw_rwlock_t lock; + __raw_rwlock_t lock; __u32 word; } old, new; old.lock = new.lock = *x; @@ -213,8 +211,8 @@ static inline int __raw_read_trylock(raw return (u32)ia64_cmpxchg4_acq((__u32 *)(x), new.word, old.word) == old.word; } -#define _raw_spin_relax(lock) cpu_relax() -#define _raw_read_relax(lock) cpu_relax() -#define _raw_write_relax(lock) cpu_relax() +#define __raw_spin_relax(lock) cpu_relax() +#define __raw_read_relax(lock) cpu_relax() +#define __raw_write_relax(lock) cpu_relax() #endif /* _ASM_IA64_SPINLOCK_H */ Index: linux-2.6.24-rc8-rt1/include/asm-ia64/spinlock_types.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-ia64/spinlock_types.h 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-ia64/spinlock_types.h 2008-01-16 22:28:42.000000000 -0500 @@ -7,14 +7,14 @@ typedef struct { volatile unsigned int lock; -} raw_spinlock_t; +} __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { 0 } typedef struct { volatile unsigned int read_counter : 31; volatile unsigned int write_lock : 1; -} raw_rwlock_t; +} __raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { 0, 0 } Index: linux-2.6.24-rc8-rt1/include/asm-ia64/system.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-ia64/system.h 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-ia64/system.h 2008-01-16 22:28:42.000000000 -0500 @@ -106,81 +106,16 @@ extern struct ia64_boot_param { */ #define set_mb(var, value) do { (var) = (value); mb(); } while (0) -#define safe_halt() ia64_pal_halt_light() /* PAL_HALT_LIGHT */ /* * The group barrier in front of the rsm & ssm are necessary to ensure * that none of the previous instructions in the same group are * affected by the rsm/ssm. */ -/* For spinlocks etc */ -/* - * - clearing psr.i is implicitly serialized (visible by next insn) - * - setting psr.i requires data serialization - * - we need a stop-bit before reading PSR because we sometimes - * write a floating-point register right before reading the PSR - * and that writes to PSR.mfl - */ -#define __local_irq_save(x) \ -do { \ - ia64_stop(); \ - (x) = ia64_getreg(_IA64_REG_PSR); \ - ia64_stop(); \ - ia64_rsm(IA64_PSR_I); \ -} while (0) - -#define __local_irq_disable() \ -do { \ - ia64_stop(); \ - ia64_rsm(IA64_PSR_I); \ -} while (0) - -#define __local_irq_restore(x) ia64_intrin_local_irq_restore((x) & IA64_PSR_I) - -#ifdef CONFIG_IA64_DEBUG_IRQ - extern unsigned long last_cli_ip; - -# define __save_ip() last_cli_ip = ia64_getreg(_IA64_REG_IP) - -# define local_irq_save(x) \ -do { \ - unsigned long psr; \ - \ - __local_irq_save(psr); \ - if (psr & IA64_PSR_I) \ - __save_ip(); \ - (x) = psr; \ -} while (0) - -# define local_irq_disable() do { unsigned long x; local_irq_save(x); } while (0) - -# define local_irq_restore(x) \ -do { \ - unsigned long old_psr, psr = (x); \ - \ - local_save_flags(old_psr); \ - __local_irq_restore(psr); \ - if ((old_psr & IA64_PSR_I) && !(psr & IA64_PSR_I)) \ - __save_ip(); \ -} while (0) +#include -#else /* !CONFIG_IA64_DEBUG_IRQ */ -# define local_irq_save(x) __local_irq_save(x) -# define local_irq_disable() __local_irq_disable() -# define local_irq_restore(x) __local_irq_restore(x) -#endif /* !CONFIG_IA64_DEBUG_IRQ */ - -#define local_irq_enable() ({ ia64_stop(); ia64_ssm(IA64_PSR_I); ia64_srlz_d(); }) -#define local_save_flags(flags) ({ ia64_stop(); (flags) = ia64_getreg(_IA64_REG_PSR); }) - -#define irqs_disabled() \ -({ \ - unsigned long __ia64_id_flags; \ - local_save_flags(__ia64_id_flags); \ - (__ia64_id_flags & IA64_PSR_I) == 0; \ -}) #ifdef __KERNEL__ Index: linux-2.6.24-rc8-rt1/include/asm-ia64/thread_info.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-ia64/thread_info.h 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-ia64/thread_info.h 2008-01-16 22:28:42.000000000 -0500 @@ -91,6 +91,7 @@ struct thread_info { #define TIF_MCA_INIT 18 /* this task is processing MCA or INIT */ #define TIF_DB_DISABLED 19 /* debug trap disabled for fsyscall */ #define TIF_FREEZE 20 /* is freezing for suspend */ +#define TIF_NEED_RESCHED_DELAYED 20 /* reschedule on return to userspace */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) Index: linux-2.6.24-rc8-rt1/include/asm-ia64/tlb.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-ia64/tlb.h 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-ia64/tlb.h 2008-01-16 22:28:42.000000000 -0500 @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -61,11 +62,12 @@ struct mmu_gather { unsigned char need_flush; /* really unmapped some PTEs? */ unsigned long start_addr; unsigned long end_addr; + int cpu; struct page *pages[FREE_PTE_NR]; }; /* Users of the generic TLB shootdown code must declare this storage space. */ -DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); +DECLARE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); /* * Flush the TLB for address range START to END and, if not in fast mode, release the @@ -127,8 +129,10 @@ ia64_tlb_flush_mmu (struct mmu_gather *t static inline struct mmu_gather * tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush) { - struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); + int cpu; + struct mmu_gather *tlb = &get_cpu_var_locked(mmu_gathers, &cpu); + tlb->cpu = cpu; tlb->mm = mm; /* * Use fast mode if only 1 CPU is online. @@ -165,7 +169,7 @@ tlb_finish_mmu (struct mmu_gather *tlb, /* keep the page table cache within bounds */ check_pgt_cache(); - put_cpu_var(mmu_gathers); + put_cpu_var_locked(mmu_gathers, tlb->cpu); } /* Index: linux-2.6.24-rc8-rt1/arch/powerpc/kernel/idle.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/kernel/idle.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/kernel/idle.c 2008-01-16 22:28:44.000000000 -0500 @@ -61,7 +61,8 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { tick_nohz_stop_sched_tick(); - while (!need_resched() && !cpu_should_die()) { + while (!need_resched() && !need_resched_delayed() && + !cpu_should_die()) { ppc64_runlatch_off(); if (ppc_md.power_save) { @@ -74,7 +75,9 @@ void cpu_idle(void) local_irq_disable(); /* check again after disabling irqs */ - if (!need_resched() && !cpu_should_die()) + if (!need_resched() && + !need_resched_delayed() && + !cpu_should_die()) ppc_md.power_save(); local_irq_enable(); @@ -95,7 +98,7 @@ void cpu_idle(void) tick_nohz_restart_sched_tick(); if (cpu_should_die()) cpu_die(); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); schedule(); preempt_disable(); } Index: linux-2.6.24-rc8-rt1/arch/powerpc/kernel/smp.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/kernel/smp.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/kernel/smp.c 2008-01-16 22:28:43.000000000 -0500 @@ -126,6 +126,16 @@ void smp_send_reschedule(int cpu) smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + smp_ops->message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE); +} + #ifdef CONFIG_DEBUGGER void smp_send_debugger_break(int cpu) { @@ -157,7 +167,7 @@ static void stop_this_cpu(void *dummy) * static memory requirements. It also looks cleaner. * Stolen from the i386 version. */ -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock); +static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(call_lock); static struct call_data_struct { void (*func) (void *info); Index: linux-2.6.24-rc8-rt1/arch/powerpc/kernel/traps.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/kernel/traps.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/kernel/traps.c 2008-01-16 22:28:43.000000000 -0500 @@ -97,11 +97,11 @@ static inline void pmac_backlight_unblan int die(const char *str, struct pt_regs *regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = __SPIN_LOCK_UNLOCKED(die.lock), + .lock = _RAW_SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; @@ -188,6 +188,11 @@ void _exception(int signr, struct pt_reg addr, regs->nip, regs->link, code); } +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + memset(&info, 0, sizeof(info)); info.si_signo = signr; info.si_code = code; Index: linux-2.6.24-rc8-rt1/arch/powerpc/platforms/cell/smp.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/platforms/cell/smp.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/platforms/cell/smp.c 2008-01-16 22:28:43.000000000 -0500 @@ -134,7 +134,7 @@ static void __devinit smp_iic_setup_cpu( iic_setup_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase = 0; static void __devinit cell_give_timebase(void) Index: linux-2.6.24-rc8-rt1/arch/powerpc/platforms/chrp/smp.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/platforms/chrp/smp.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/platforms/chrp/smp.c 2008-01-16 22:28:43.000000000 -0500 @@ -42,7 +42,7 @@ static void __devinit smp_chrp_setup_cpu mpic_setup_this_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; void __devinit smp_chrp_give_timebase(void) Index: linux-2.6.24-rc8-rt1/arch/powerpc/platforms/chrp/time.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/platforms/chrp/time.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/platforms/chrp/time.c 2008-01-16 22:29:30.000000000 -0500 @@ -83,7 +83,12 @@ int chrp_set_rtc_time(struct rtc_time *t unsigned char save_control, save_freq_select; struct rtc_time tm = *tmarg; +#if CONFIG_PREEMPT_RT + if (!spin_trylock(&rtc_lock)) + return -1; +#else spin_lock(&rtc_lock); +#endif save_control = chrp_cmos_clock_read(RTC_CONTROL); /* tell the clock it's being set */ Index: linux-2.6.24-rc8-rt1/arch/powerpc/platforms/powermac/feature.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/platforms/powermac/feature.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/platforms/powermac/feature.c 2008-01-16 22:28:43.000000000 -0500 @@ -59,7 +59,7 @@ extern struct device_node *k2_skiplist[2 * We use a single global lock to protect accesses. Each driver has * to take care of its own locking */ -DEFINE_SPINLOCK(feature_lock); +DEFINE_RAW_SPINLOCK(feature_lock); #define LOCK(flags) spin_lock_irqsave(&feature_lock, flags); #define UNLOCK(flags) spin_unlock_irqrestore(&feature_lock, flags); Index: linux-2.6.24-rc8-rt1/arch/powerpc/platforms/powermac/nvram.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/platforms/powermac/nvram.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/platforms/powermac/nvram.c 2008-01-16 22:28:43.000000000 -0500 @@ -80,7 +80,7 @@ static int is_core_99; static int core99_bank = 0; static int nvram_partitions[3]; // XXX Turn that into a sem -static DEFINE_SPINLOCK(nv_lock); +static DEFINE_RAW_SPINLOCK(nv_lock); static int (*core99_write_bank)(int bank, u8* datas); static int (*core99_erase_bank)(int bank); Index: linux-2.6.24-rc8-rt1/arch/powerpc/platforms/powermac/pic.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/platforms/powermac/pic.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/platforms/powermac/pic.c 2008-01-16 22:28:43.000000000 -0500 @@ -63,7 +63,7 @@ static int max_irqs; static int max_real_irqs; static u32 level_mask[4]; -static DEFINE_SPINLOCK(pmac_pic_lock); +static DEFINE_RAW_SPINLOCK(pmac_pic_lock); #define NR_MASK_WORDS ((NR_IRQS + 31) / 32) static unsigned long ppc_lost_interrupts[NR_MASK_WORDS]; Index: linux-2.6.24-rc8-rt1/arch/powerpc/platforms/pseries/smp.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/platforms/pseries/smp.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/platforms/pseries/smp.c 2008-01-16 22:28:43.000000000 -0500 @@ -154,7 +154,7 @@ static void __devinit smp_xics_setup_cpu } #endif /* CONFIG_XICS */ -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase = 0; static void __devinit pSeries_give_timebase(void) Index: linux-2.6.24-rc8-rt1/arch/ppc/8260_io/enet.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/8260_io/enet.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/8260_io/enet.c 2008-01-16 22:28:43.000000000 -0500 @@ -115,7 +115,7 @@ struct scc_enet_private { scc_t *sccp; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); Index: linux-2.6.24-rc8-rt1/arch/ppc/8260_io/fcc_enet.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/8260_io/fcc_enet.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/8260_io/fcc_enet.c 2008-01-16 22:28:43.000000000 -0500 @@ -375,7 +375,7 @@ struct fcc_enet_private { volatile fcc_enet_t *ep; struct net_device_stats stats; uint tx_free; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; Index: linux-2.6.24-rc8-rt1/arch/ppc/8xx_io/commproc.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/8xx_io/commproc.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/8xx_io/commproc.c 2008-01-16 22:28:43.000000000 -0500 @@ -370,7 +370,7 @@ cpm_setbrg(uint brg, uint rate) /* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* * 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... Index: linux-2.6.24-rc8-rt1/arch/ppc/8xx_io/enet.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/8xx_io/enet.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/8xx_io/enet.c 2008-01-16 22:28:43.000000000 -0500 @@ -143,7 +143,7 @@ struct scc_enet_private { unsigned char *rx_vaddr[RX_RING_SIZE]; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); Index: linux-2.6.24-rc8-rt1/arch/ppc/8xx_io/fec.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/8xx_io/fec.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/8xx_io/fec.c 2008-01-16 22:28:43.000000000 -0500 @@ -164,7 +164,7 @@ struct fec_enet_private { struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; Index: linux-2.6.24-rc8-rt1/arch/ppc/kernel/smp.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/kernel/smp.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/kernel/smp.c 2008-01-16 22:28:43.000000000 -0500 @@ -136,6 +136,16 @@ void smp_send_reschedule(int cpu) smp_message_pass(cpu, PPC_MSG_RESCHEDULE); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + smp_message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE, 0, 0); +} + #ifdef CONFIG_XMON void smp_send_xmon_break(int cpu) { @@ -160,7 +170,7 @@ void smp_send_stop(void) * static memory requirements. It also looks cleaner. * Stolen from the i386 version. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); static struct call_data_struct { void (*func) (void *info); Index: linux-2.6.24-rc8-rt1/arch/ppc/kernel/traps.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/kernel/traps.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/kernel/traps.c 2008-01-16 22:28:43.000000000 -0500 @@ -72,7 +72,7 @@ void (*debugger_fault_handler)(struct pt * Trap & Exception support */ -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); int die(const char * str, struct pt_regs * fp, long err) { @@ -108,6 +108,10 @@ void _exception(int signr, struct pt_reg debugger(regs); die("Exception in kernel mode", regs, signr); } +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif info.si_signo = signr; info.si_errno = 0; info.si_code = code; Index: linux-2.6.24-rc8-rt1/arch/ppc/platforms/hdpu.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/platforms/hdpu.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/platforms/hdpu.c 2008-01-16 22:28:43.000000000 -0500 @@ -55,7 +55,7 @@ static void parse_bootinfo(unsigned long static void hdpu_set_l1pe(void); static void hdpu_cpustate_set(unsigned char new_state); #ifdef CONFIG_SMP -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; extern int smp_tb_synchronized; Index: linux-2.6.24-rc8-rt1/arch/ppc/platforms/sbc82xx.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/platforms/sbc82xx.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/platforms/sbc82xx.c 2008-01-16 22:28:43.000000000 -0500 @@ -65,7 +65,7 @@ static void sbc82xx_time_init(void) static volatile char *sbc82xx_i8259_map; static char sbc82xx_i8259_mask = 0xff; -static DEFINE_SPINLOCK(sbc82xx_i8259_lock); +static DEFINE_RAW_SPINLOCK(sbc82xx_i8259_lock); static void sbc82xx_i8259_mask_and_ack_irq(unsigned int irq_nr) { Index: linux-2.6.24-rc8-rt1/arch/ppc/syslib/cpm2_common.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/syslib/cpm2_common.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/syslib/cpm2_common.c 2008-01-16 22:28:43.000000000 -0500 @@ -114,7 +114,7 @@ cpm2_fastbrg(uint brg, uint rate, int di /* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... */ static rh_block_t cpm_boot_dpmem_rh_block[16]; Index: linux-2.6.24-rc8-rt1/arch/ppc/syslib/open_pic.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/syslib/open_pic.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/syslib/open_pic.c 2008-01-16 22:28:43.000000000 -0500 @@ -526,7 +526,7 @@ void openpic_reset_processor_phys(u_int } #if defined(CONFIG_SMP) || defined(CONFIG_PM) -static DEFINE_SPINLOCK(openpic_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic_setup_lock); #endif #ifdef CONFIG_SMP Index: linux-2.6.24-rc8-rt1/arch/ppc/syslib/open_pic2.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/ppc/syslib/open_pic2.c 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/ppc/syslib/open_pic2.c 2008-01-16 22:28:43.000000000 -0500 @@ -380,7 +380,7 @@ static void openpic2_set_spurious(u_int vec); } -static DEFINE_SPINLOCK(openpic2_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic2_setup_lock); /* * Initialize a timer interrupt (and disable it) Index: linux-2.6.24-rc8-rt1/include/asm-powerpc/hw_irq.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-powerpc/hw_irq.h 2008-01-16 22:27:33.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-powerpc/hw_irq.h 2008-01-16 22:28:44.000000000 -0500 @@ -16,7 +16,7 @@ extern void timer_interrupt(struct pt_re #ifdef CONFIG_PPC64 #include -static inline unsigned long local_get_flags(void) +static inline unsigned long raw_local_get_flags(void) { unsigned long flags; @@ -27,7 +27,7 @@ static inline unsigned long local_get_fl return flags; } -static inline unsigned long local_irq_disable(void) +static inline unsigned long raw_local_irq_disable(void) { unsigned long flags, zero; @@ -39,17 +39,22 @@ static inline unsigned long local_irq_di return flags; } -extern void local_irq_restore(unsigned long); + extern void iseries_handle_interrupts(void); +extern unsigned long raw_local_get_flags(void); +extern unsigned long raw_local_irq_disable(void); +extern void raw_local_irq_restore(unsigned long); + +#define raw_local_irq_enable() raw_local_irq_restore(1) +#define raw_local_save_flags(flags) ((flags) = raw_local_get_flags()) +#define raw_local_irq_save(flags) ((flags) = raw_local_irq_disable()) -#define local_irq_enable() local_irq_restore(1) -#define local_save_flags(flags) ((flags) = local_get_flags()) -#define local_irq_save(flags) ((flags) = local_irq_disable()) +#define raw_irqs_disabled() (raw_local_get_flags() == 0) +#define raw_irqs_disabled_flags(flags) ((flags) == 0) -#define irqs_disabled() (local_get_flags() == 0) -#define __hard_irq_enable() __mtmsrd(mfmsr() | MSR_EE, 1) -#define __hard_irq_disable() __mtmsrd(mfmsr() & ~MSR_EE, 1) +#define __hard_irq_enable() __mtmsrd(mfmsr() | MSR_EE, 1) +#define __hard_irq_disable() __mtmsrd(mfmsr() & ~MSR_EE, 1) #define hard_irq_disable() \ do { \ @@ -58,17 +63,17 @@ extern void iseries_handle_interrupts(vo get_paca()->hard_enabled = 0; \ } while(0) -#else +#else /* CONFIG_PPC64 */ #if defined(CONFIG_BOOKE) #define SET_MSR_EE(x) mtmsr(x) -#define local_irq_restore(flags) __asm__ __volatile__("wrtee %0" : : "r" (flags) : "memory") +#define raw_local_irq_restore(flags) __asm__ __volatile__("wrtee %0" : : "r" (flags) : "memory") #else #define SET_MSR_EE(x) mtmsr(x) -#define local_irq_restore(flags) mtmsr(flags) -#endif +#define raw_local_irq_restore(flags) mtmsr(flags) +#endif /* CONFIG_BOOKE */ -static inline void local_irq_disable(void) +static inline void raw_local_irq_disable(void) { #ifdef CONFIG_BOOKE __asm__ __volatile__("wrteei 0": : :"memory"); @@ -80,7 +85,7 @@ static inline void local_irq_disable(voi #endif } -static inline void local_irq_enable(void) +static inline void raw_local_irq_enable(void) { #ifdef CONFIG_BOOKE __asm__ __volatile__("wrteei 1": : :"memory"); @@ -92,7 +97,7 @@ static inline void local_irq_enable(void #endif } -static inline void local_irq_save_ptr(unsigned long *flags) +static inline void raw_local_irq_save_ptr(unsigned long *flags) { unsigned long msr; msr = mfmsr(); @@ -105,13 +110,16 @@ static inline void local_irq_save_ptr(un __asm__ __volatile__("": : :"memory"); } -#define local_save_flags(flags) ((flags) = mfmsr()) -#define local_irq_save(flags) local_irq_save_ptr(&flags) -#define irqs_disabled() ((mfmsr() & MSR_EE) == 0) +#define raw_local_save_flags(flags) ((flags) = mfmsr()) +#define raw_local_irq_save(flags) raw_local_irq_save_ptr(&flags) +#define raw_irqs_disabled() ((mfmsr() & MSR_EE) == 0) +#define raw_irqs_disabled_flags(flags) ((flags & MSR_EE) == 0) #define hard_irq_enable() local_irq_enable() #define hard_irq_disable() local_irq_disable() +#include + #endif /* CONFIG_PPC64 */ /* Index: linux-2.6.24-rc8-rt1/arch/powerpc/Kconfig.debug =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/Kconfig.debug 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/Kconfig.debug 2008-01-16 22:28:44.000000000 -0500 @@ -2,6 +2,10 @@ menu "Kernel hacking" source "lib/Kconfig.debug" +config TRACE_IRQFLAGS_SUPPORT + bool + default y + config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL Index: linux-2.6.24-rc8-rt1/include/asm-powerpc/pmac_feature.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-powerpc/pmac_feature.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-powerpc/pmac_feature.h 2008-01-16 22:28:44.000000000 -0500 @@ -378,7 +378,7 @@ extern struct macio_chip* macio_find(str * Those are exported by pmac feature for internal use by arch code * only like the platform function callbacks, do not use directly in drivers */ -extern spinlock_t feature_lock; +extern raw_spinlock_t feature_lock; extern struct device_node *uninorth_node; extern u32 __iomem *uninorth_base; Index: linux-2.6.24-rc8-rt1/arch/powerpc/kernel/head_64.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/kernel/head_64.S 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/kernel/head_64.S 2008-01-16 22:28:44.000000000 -0500 @@ -878,7 +878,7 @@ END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISER * handles any interrupts pending at this point. */ ld r3,SOFTE(r1) - bl .local_irq_restore + bl .raw_local_irq_restore b 11f /* Here we have a page fault that hash_page can't handle. */ Index: linux-2.6.24-rc8-rt1/arch/powerpc/kernel/rtas.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/kernel/rtas.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/kernel/rtas.c 2008-01-16 22:28:44.000000000 -0500 @@ -41,7 +41,7 @@ #include struct rtas_t rtas = { - .lock = SPIN_LOCK_UNLOCKED + .lock = RAW_SPIN_LOCK_UNLOCKED(lock) }; EXPORT_SYMBOL(rtas); Index: linux-2.6.24-rc8-rt1/arch/powerpc/mm/hash_native_64.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/mm/hash_native_64.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/mm/hash_native_64.c 2008-01-16 22:28:44.000000000 -0500 @@ -36,7 +36,7 @@ #define HPTE_LOCK_BIT 3 -static DEFINE_SPINLOCK(native_tlbie_lock); +static DEFINE_RAW_SPINLOCK(native_tlbie_lock); static inline void __tlbie(unsigned long va, int psize, int ssize) { Index: linux-2.6.24-rc8-rt1/include/asm-powerpc/rtas.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-powerpc/rtas.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-powerpc/rtas.h 2008-01-16 22:28:44.000000000 -0500 @@ -58,7 +58,7 @@ struct rtas_t { unsigned long entry; /* physical address pointer */ unsigned long base; /* physical address pointer */ unsigned long size; - spinlock_t lock; + raw_spinlock_t lock; struct rtas_args args; struct device_node *dev; /* virtual address pointer */ }; Index: linux-2.6.24-rc8-rt1/arch/powerpc/platforms/celleb/htab.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/platforms/celleb/htab.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/platforms/celleb/htab.c 2008-01-16 22:28:45.000000000 -0500 @@ -40,7 +40,7 @@ #define DBG_LOW(fmt...) do { } while(0) #endif -static DEFINE_SPINLOCK(beat_htab_lock); +static DEFINE_RAW_SPINLOCK(beat_htab_lock); static inline unsigned int beat_read_mask(unsigned hpte_group) { Index: linux-2.6.24-rc8-rt1/arch/powerpc/kernel/pmc.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/kernel/pmc.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/kernel/pmc.c 2008-01-16 22:28:46.000000000 -0500 @@ -37,7 +37,7 @@ static void dummy_perf(struct pt_regs *r } -static DEFINE_SPINLOCK(pmc_owner_lock); +static DEFINE_RAW_SPINLOCK(pmc_owner_lock); static void *pmc_owner_caller; /* mostly for debugging */ perf_irq_t perf_irq = dummy_perf; Index: linux-2.6.24-rc8-rt1/arch/powerpc/sysdev/i8259.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/sysdev/i8259.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/sysdev/i8259.c 2008-01-16 22:28:46.000000000 -0500 @@ -23,7 +23,7 @@ static unsigned char cached_8259[2] = { #define cached_A1 (cached_8259[0]) #define cached_21 (cached_8259[1]) -static DEFINE_SPINLOCK(i8259_lock); +static DEFINE_RAW_SPINLOCK(i8259_lock); static struct irq_host *i8259_host; Index: linux-2.6.24-rc8-rt1/arch/powerpc/sysdev/ipic.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/powerpc/sysdev/ipic.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/powerpc/sysdev/ipic.c 2008-01-16 22:28:46.000000000 -0500 @@ -30,7 +30,7 @@ #include "ipic.h" static struct ipic * primary_ipic; -static DEFINE_SPINLOCK(ipic_lock); +static DEFINE_RAW_SPINLOCK(ipic_lock); static struct ipic_info ipic_info[] = { [9] = { Index: linux-2.6.24-rc8-rt1/include/asm-powerpc/mpic.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-powerpc/mpic.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-powerpc/mpic.h 2008-01-16 22:28:46.000000000 -0500 @@ -275,7 +275,7 @@ struct mpic #ifdef CONFIG_MPIC_U3_HT_IRQS /* The fixup table */ struct mpic_irq_fixup *fixups; - spinlock_t fixup_lock; + raw_spinlock_t fixup_lock; #endif /* Register access method */ Index: linux-2.6.24-rc8-rt1/arch/sh/kernel/cpu/clock.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/sh/kernel/cpu/clock.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/sh/kernel/cpu/clock.c 2008-01-16 22:28:46.000000000 -0500 @@ -28,7 +28,7 @@ #include static LIST_HEAD(clock_list); -static DEFINE_SPINLOCK(clock_lock); +static DEFINE_RAW_SPINLOCK(clock_lock); static DEFINE_MUTEX(clock_list_sem); /* Index: linux-2.6.24-rc8-rt1/arch/sh/kernel/cpu/sh4/sq.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/sh/kernel/cpu/sh4/sq.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/sh/kernel/cpu/sh4/sq.c 2008-01-16 22:28:46.000000000 -0500 @@ -37,7 +37,7 @@ struct sq_mapping { }; static struct sq_mapping *sq_mapping_list; -static DEFINE_SPINLOCK(sq_mapping_lock); +static DEFINE_RAW_SPINLOCK(sq_mapping_lock); static struct kmem_cache *sq_cache; static unsigned long *sq_bitmap; Index: linux-2.6.24-rc8-rt1/arch/sh/kernel/entry-common.S =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/sh/kernel/entry-common.S 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/sh/kernel/entry-common.S 2008-01-16 22:28:46.000000000 -0500 @@ -157,7 +157,7 @@ ENTRY(resume_userspace) mov.l @(TI_FLAGS,r8), r0 ! current_thread_info->flags tst #_TIF_WORK_MASK, r0 bt/s __restore_all - tst #_TIF_NEED_RESCHED, r0 + tst #_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED, r0 .align 2 work_pending: @@ -209,10 +209,10 @@ work_resched: tst #_TIF_WORK_MASK, r0 bt __restore_all bra work_pending - tst #_TIF_NEED_RESCHED, r0 + tst #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED, r0 .align 2 -1: .long schedule +1: .long __schedule 2: .long do_notify_resume 3: .long restore_all #ifdef CONFIG_TRACE_IRQFLAGS @@ -226,7 +226,7 @@ syscall_exit_work: ! r8: current_thread_info tst #_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP, r0 bt/s work_pending - tst #_TIF_NEED_RESCHED, r0 + tst #_TIF_NEED_RESCHED| _TIF_NEED_RESCHED_DELAYED, r0 #ifdef CONFIG_TRACE_IRQFLAGS mov.l 5f, r0 jsr @r0 Index: linux-2.6.24-rc8-rt1/arch/sh/kernel/irq.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/sh/kernel/irq.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/sh/kernel/irq.c 2008-01-16 22:28:46.000000000 -0500 @@ -81,7 +81,7 @@ static union irq_ctx *hardirq_ctx[NR_CPU static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; #endif -asmlinkage int do_IRQ(unsigned int irq, struct pt_regs *regs) +asmlinkage notrace int do_IRQ(unsigned int irq, struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); #ifdef CONFIG_IRQSTACKS Index: linux-2.6.24-rc8-rt1/arch/sh/kernel/process.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/sh/kernel/process.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/sh/kernel/process.c 2008-01-16 22:28:46.000000000 -0500 @@ -65,7 +65,7 @@ void default_idle(void) clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb__after_clear_bit(); set_bl_bit(); - while (!need_resched()) + while (!need_resched() && !need_resched_delayed()) cpu_sleep(); clear_bl_bit(); set_thread_flag(TIF_POLLING_NRFLAG); @@ -86,13 +86,15 @@ void cpu_idle(void) idle = default_idle; tick_nohz_stop_sched_tick(); - while (!need_resched()) + while (!need_resched() && !need_resched_delayed()) idle(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); check_pgt_cache(); } } Index: linux-2.6.24-rc8-rt1/arch/sh/kernel/semaphore.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/sh/kernel/semaphore.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/sh/kernel/semaphore.c 2008-01-16 22:28:46.000000000 -0500 @@ -46,7 +46,7 @@ DEFINE_SPINLOCK(semaphore_wake_lock); * critical part is the inline stuff in * where we want to avoid any extra jumps and calls. */ -void __up(struct semaphore *sem) +void __attribute_used__ __compat_up(struct compat_semaphore *sem) { wake_one_more(sem); wake_up(&sem->wait); @@ -104,7 +104,7 @@ void __up(struct semaphore *sem) tsk->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __sched __down(struct semaphore * sem) +void __attribute_used__ __sched __compat_down(struct compat_semaphore * sem) { DOWN_VAR DOWN_HEAD(TASK_UNINTERRUPTIBLE) @@ -114,7 +114,7 @@ void __sched __down(struct semaphore * s DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __sched __down_interruptible(struct semaphore * sem) +int __attribute_used__ __sched __compat_down_interruptible(struct compat_semaphore * sem) { int ret = 0; DOWN_VAR @@ -133,7 +133,13 @@ int __sched __down_interruptible(struct return ret; } -int __down_trylock(struct semaphore * sem) +int __attribute_used__ __compat_down_trylock(struct compat_semaphore * sem) { return waking_non_zero_trylock(sem); } + +fastcall int __sched compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} + Index: linux-2.6.24-rc8-rt1/arch/sh/kernel/sh_ksyms.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/sh/kernel/sh_ksyms.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/sh/kernel/sh_ksyms.c 2008-01-16 22:28:46.000000000 -0500 @@ -26,7 +26,6 @@ EXPORT_SYMBOL(sh_mv); /* platform dependent support */ EXPORT_SYMBOL(dump_fpu); EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(irq_desc); EXPORT_SYMBOL(no_irq_type); EXPORT_SYMBOL(strlen); @@ -49,10 +48,10 @@ EXPORT_SYMBOL(get_vm_area); #endif /* semaphore exports */ -EXPORT_SYMBOL(__up); -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_interruptible); -EXPORT_SYMBOL(__down_trylock); +EXPORT_SYMBOL(__compat_up); +EXPORT_SYMBOL(__compat_down); +EXPORT_SYMBOL(__compat_down_interruptible); +EXPORT_SYMBOL(__compat_down_trylock); EXPORT_SYMBOL(__udelay); EXPORT_SYMBOL(__ndelay); Index: linux-2.6.24-rc8-rt1/arch/sh/kernel/signal.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/sh/kernel/signal.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/sh/kernel/signal.c 2008-01-16 22:28:46.000000000 -0500 @@ -564,6 +564,13 @@ static void do_signal(struct pt_regs *re struct k_sigaction ka; sigset_t *oldset; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + raw_local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux-2.6.24-rc8-rt1/arch/sh/kernel/time.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/sh/kernel/time.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/sh/kernel/time.c 2008-01-16 22:28:46.000000000 -0500 @@ -24,7 +24,7 @@ struct sys_timer *sys_timer; /* Move this somewhere more sensible.. */ -DEFINE_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); /* Dummy RTC ops */ Index: linux-2.6.24-rc8-rt1/arch/sh/kernel/traps.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/sh/kernel/traps.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/sh/kernel/traps.c 2008-01-16 22:28:46.000000000 -0500 @@ -77,7 +77,7 @@ static void dump_mem(const char *str, un } } -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); void die(const char * str, struct pt_regs * regs, long err) { Index: linux-2.6.24-rc8-rt1/arch/sh/mm/cache-sh4.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/sh/mm/cache-sh4.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/sh/mm/cache-sh4.c 2008-01-16 22:28:46.000000000 -0500 @@ -204,7 +204,7 @@ void flush_cache_sigtramp(unsigned long index = CACHE_IC_ADDRESS_ARRAY | (v & boot_cpu_data.icache.entry_mask); - local_irq_save(flags); + raw_local_irq_save(flags); jump_to_P2(); for (i = 0; i < boot_cpu_data.icache.ways; @@ -213,7 +213,7 @@ void flush_cache_sigtramp(unsigned long back_to_P1(); wmb(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline void flush_cache_4096(unsigned long start, @@ -229,10 +229,10 @@ static inline void flush_cache_4096(unsi (start < CACHE_OC_ADDRESS_ARRAY)) exec_offset = 0x20000000; - local_irq_save(flags); + raw_local_irq_save(flags); __flush_cache_4096(start | SH_CACHE_ASSOC, P1SEGADDR(phys), exec_offset); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* @@ -260,7 +260,7 @@ static inline void flush_icache_all(void { unsigned long flags, ccr; - local_irq_save(flags); + raw_local_irq_save(flags); jump_to_P2(); /* Flush I-cache */ @@ -274,7 +274,7 @@ static inline void flush_icache_all(void */ back_to_P1(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void flush_dcache_all(void) Index: linux-2.6.24-rc8-rt1/arch/sh/mm/init.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/sh/mm/init.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/sh/mm/init.c 2008-01-16 22:28:46.000000000 -0500 @@ -21,7 +21,7 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); pgd_t swapper_pg_dir[PTRS_PER_PGD]; void (*copy_page)(void *from, void *to); Index: linux-2.6.24-rc8-rt1/arch/sh/mm/pg-sh4.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/sh/mm/pg-sh4.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/sh/mm/pg-sh4.c 2008-01-16 22:28:46.000000000 -0500 @@ -28,9 +28,9 @@ static inline void *kmap_coherent(struct vaddr = __fix_to_virt(FIX_CMAP_END - idx); pte = mk_pte(page, PAGE_KERNEL); - local_irq_save(flags); + raw_local_irq_save(flags); flush_tlb_one(get_asid(), vaddr); - local_irq_restore(flags); + raw_local_irq_restore(flags); update_mmu_cache(NULL, vaddr, pte); Index: linux-2.6.24-rc8-rt1/arch/sh/mm/tlb-flush.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/sh/mm/tlb-flush.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/sh/mm/tlb-flush.c 2008-01-16 22:28:46.000000000 -0500 @@ -24,7 +24,7 @@ void local_flush_tlb_page(struct vm_area asid = cpu_asid(cpu, vma->vm_mm); page &= PAGE_MASK; - local_irq_save(flags); + raw_local_irq_save(flags); if (vma->vm_mm != current->mm) { saved_asid = get_asid(); set_asid(asid); @@ -32,7 +32,7 @@ void local_flush_tlb_page(struct vm_area local_flush_tlb_one(asid, page); if (saved_asid != MMU_NO_ASID) set_asid(saved_asid); - local_irq_restore(flags); + raw_local_irq_restore(flags); } } @@ -46,7 +46,7 @@ void local_flush_tlb_range(struct vm_are unsigned long flags; int size; - local_irq_save(flags); + raw_local_irq_save(flags); size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT; if (size > (MMU_NTLB_ENTRIES/4)) { /* Too many TLB to flush */ cpu_context(cpu, mm) = NO_CONTEXT; @@ -71,7 +71,7 @@ void local_flush_tlb_range(struct vm_are if (saved_asid != MMU_NO_ASID) set_asid(saved_asid); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } } @@ -81,7 +81,7 @@ void local_flush_tlb_kernel_range(unsign unsigned long flags; int size; - local_irq_save(flags); + raw_local_irq_save(flags); size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT; if (size > (MMU_NTLB_ENTRIES/4)) { /* Too many TLB to flush */ local_flush_tlb_all(); @@ -100,7 +100,7 @@ void local_flush_tlb_kernel_range(unsign } set_asid(saved_asid); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } void local_flush_tlb_mm(struct mm_struct *mm) @@ -112,11 +112,11 @@ void local_flush_tlb_mm(struct mm_struct if (cpu_context(cpu, mm) != NO_CONTEXT) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); cpu_context(cpu, mm) = NO_CONTEXT; if (mm == current->mm) activate_context(mm, cpu); - local_irq_restore(flags); + raw_local_irq_restore(flags); } } @@ -131,10 +131,10 @@ void local_flush_tlb_all(void) * TF-bit for SH-3, TI-bit for SH-4. * It's same position, bit #2. */ - local_irq_save(flags); + raw_local_irq_save(flags); status = ctrl_inl(MMUCR); status |= 0x04; ctrl_outl(status, MMUCR); ctrl_barrier(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } Index: linux-2.6.24-rc8-rt1/arch/sh/mm/tlb-sh4.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/sh/mm/tlb-sh4.c 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/sh/mm/tlb-sh4.c 2008-01-16 22:28:46.000000000 -0500 @@ -43,7 +43,7 @@ void update_mmu_cache(struct vm_area_str } #endif - local_irq_save(flags); + raw_local_irq_save(flags); /* Set PTEH register */ vpn = (address & MMU_VPN_MASK) | get_asid(); @@ -76,7 +76,7 @@ void update_mmu_cache(struct vm_area_str /* Load the TLB */ asm volatile("ldtlb": /* no output */ : /* no input */ : "memory"); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void local_flush_tlb_one(unsigned long asid, unsigned long page) Index: linux-2.6.24-rc8-rt1/include/asm-sh/atomic-irq.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-sh/atomic-irq.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-sh/atomic-irq.h 2008-01-16 22:28:46.000000000 -0500 @@ -10,29 +10,29 @@ static inline void atomic_add(int i, ato { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); *(long *)v += i; - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline void atomic_sub(int i, atomic_t *v) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); *(long *)v -= i; - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline int atomic_add_return(int i, atomic_t *v) { unsigned long temp, flags; - local_irq_save(flags); + raw_local_irq_save(flags); temp = *(long *)v; temp += i; *(long *)v = temp; - local_irq_restore(flags); + raw_local_irq_restore(flags); return temp; } @@ -41,11 +41,11 @@ static inline int atomic_sub_return(int { unsigned long temp, flags; - local_irq_save(flags); + raw_local_irq_save(flags); temp = *(long *)v; temp -= i; *(long *)v = temp; - local_irq_restore(flags); + raw_local_irq_restore(flags); return temp; } @@ -54,18 +54,18 @@ static inline void atomic_clear_mask(uns { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); *(long *)v &= ~mask; - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline void atomic_set_mask(unsigned int mask, atomic_t *v) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); *(long *)v |= mask; - local_irq_restore(flags); + raw_local_irq_restore(flags); } #endif /* __ASM_SH_ATOMIC_IRQ_H */ Index: linux-2.6.24-rc8-rt1/include/asm-sh/atomic.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-sh/atomic.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-sh/atomic.h 2008-01-16 22:28:46.000000000 -0500 @@ -49,11 +49,11 @@ static inline int atomic_cmpxchg(atomic_ int ret; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); ret = v->counter; if (likely(ret == old)) v->counter = new; - local_irq_restore(flags); + raw_local_irq_restore(flags); return ret; } @@ -65,11 +65,11 @@ static inline int atomic_add_unless(atom int ret; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); ret = v->counter; if (ret != u) v->counter += a; - local_irq_restore(flags); + raw_local_irq_restore(flags); return ret != u; } Index: linux-2.6.24-rc8-rt1/include/asm-sh/bitops.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-sh/bitops.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-sh/bitops.h 2008-01-16 22:28:46.000000000 -0500 @@ -19,9 +19,9 @@ static inline void set_bit(int nr, volat a += nr >> 5; mask = 1 << (nr & 0x1f); - local_irq_save(flags); + raw_local_irq_save(flags); *a |= mask; - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* @@ -37,9 +37,9 @@ static inline void clear_bit(int nr, vol a += nr >> 5; mask = 1 << (nr & 0x1f); - local_irq_save(flags); + raw_local_irq_save(flags); *a &= ~mask; - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline void change_bit(int nr, volatile void * addr) @@ -50,9 +50,9 @@ static inline void change_bit(int nr, vo a += nr >> 5; mask = 1 << (nr & 0x1f); - local_irq_save(flags); + raw_local_irq_save(flags); *a ^= mask; - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline int test_and_set_bit(int nr, volatile void * addr) @@ -63,10 +63,10 @@ static inline int test_and_set_bit(int n a += nr >> 5; mask = 1 << (nr & 0x1f); - local_irq_save(flags); + raw_local_irq_save(flags); retval = (mask & *a) != 0; *a |= mask; - local_irq_restore(flags); + raw_local_irq_restore(flags); return retval; } @@ -79,10 +79,10 @@ static inline int test_and_clear_bit(int a += nr >> 5; mask = 1 << (nr & 0x1f); - local_irq_save(flags); + raw_local_irq_save(flags); retval = (mask & *a) != 0; *a &= ~mask; - local_irq_restore(flags); + raw_local_irq_restore(flags); return retval; } @@ -95,10 +95,10 @@ static inline int test_and_change_bit(in a += nr >> 5; mask = 1 << (nr & 0x1f); - local_irq_save(flags); + raw_local_irq_save(flags); retval = (mask & *a) != 0; *a ^= mask; - local_irq_restore(flags); + raw_local_irq_restore(flags); return retval; } Index: linux-2.6.24-rc8-rt1/include/asm-sh/pgalloc.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-sh/pgalloc.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-sh/pgalloc.h 2008-01-16 22:28:46.000000000 -0500 @@ -13,7 +13,7 @@ static inline void pmd_populate_kernel(s set_pmd(pmd, __pmd((unsigned long)pte)); } -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, +static inline void notrace pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { set_pmd(pmd, __pmd((unsigned long)page_address(pte))); Index: linux-2.6.24-rc8-rt1/include/asm-sh/rwsem.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-sh/rwsem.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-sh/rwsem.h 2008-01-16 22:28:46.000000000 -0500 @@ -19,7 +19,7 @@ /* * the semaphore definition */ -struct rw_semaphore { +struct compat_rw_semaphore { long count; #define RWSEM_UNLOCKED_VALUE 0x00000000 #define RWSEM_ACTIVE_BIAS 0x00000001 @@ -27,7 +27,7 @@ struct rw_semaphore { #define RWSEM_WAITING_BIAS (-0x00010000) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - spinlock_t wait_lock; + raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -45,25 +45,25 @@ struct rw_semaphore { LIST_HEAD_INIT((name).wait_list) \ __RWSEM_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define COMPAT_DECLARE_RWSEM(name) \ + struct compat_rw_semaphore name = __RWSEM_INITIALIZER(name) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_down_read_failed(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_down_write_failed(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_wake(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_downgrade_wake(struct compat_rw_semaphore *sem); -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, +extern void __compat_init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key); -#define init_rwsem(sem) \ +#define compat_init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __compat_init_rwsem((sem), #sem, &__key); \ } while (0) -static inline void init_rwsem(struct rw_semaphore *sem) +static inline void compat_init_rwsem(struct rw_semaphore *sem) { sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); @@ -73,7 +73,7 @@ static inline void init_rwsem(struct rw_ /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct compat_rw_semaphore *sem) { if (atomic_inc_return((atomic_t *)(&sem->count)) > 0) smp_wmb(); @@ -81,7 +81,7 @@ static inline void __down_read(struct rw rwsem_down_read_failed(sem); } -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct compat_rw_semaphore *sem) { int tmp; @@ -98,7 +98,7 @@ static inline int __down_read_trylock(st /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write(struct compat_rw_semaphore *sem) { int tmp; @@ -110,7 +110,7 @@ static inline void __down_write(struct r rwsem_down_write_failed(sem); } -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct compat_rw_semaphore *sem) { int tmp; @@ -123,7 +123,7 @@ static inline int __down_write_trylock(s /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct compat_rw_semaphore *sem) { int tmp; @@ -136,7 +136,7 @@ static inline void __up_read(struct rw_s /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct compat_rw_semaphore *sem) { smp_wmb(); if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, @@ -147,7 +147,7 @@ static inline void __up_write(struct rw_ /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem) { atomic_add(delta, (atomic_t *)(&sem->count)); } @@ -155,7 +155,7 @@ static inline void rwsem_atomic_add(int /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct compat_rw_semaphore *sem) { int tmp; @@ -165,7 +165,7 @@ static inline void __downgrade_write(str rwsem_downgrade_wake(sem); } -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void __down_write_nested(struct compat_rw_semaphore *sem, int subclass) { __down_write(sem); } @@ -173,13 +173,13 @@ static inline void __down_write_nested(s /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem) { smp_mb(); return atomic_add_return(delta, (atomic_t *)(&sem->count)); } -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int rwsem_is_locked(struct compat_rw_semaphore *sem) { return (sem->count != 0); } Index: linux-2.6.24-rc8-rt1/include/asm-sh/semaphore-helper.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-sh/semaphore-helper.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-sh/semaphore-helper.h 2008-01-16 22:28:46.000000000 -0500 @@ -14,12 +14,12 @@ * This is trivially done with load_locked/store_cond, * which we have. Let the rest of the losers suck eggs. */ -static __inline__ void wake_one_more(struct semaphore * sem) +static __inline__ void wake_one_more(struct compat_semaphore * sem) { atomic_inc((atomic_t *)&sem->sleepers); } -static __inline__ int waking_non_zero(struct semaphore *sem) +static __inline__ int waking_non_zero(struct compat_semaphore *sem) { unsigned long flags; int ret = 0; @@ -43,7 +43,7 @@ static __inline__ int waking_non_zero(st * protected by the spinlock in order to make atomic this atomic_inc() with the * atomic_read() in wake_one_more(), otherwise we can race. -arca */ -static __inline__ int waking_non_zero_interruptible(struct semaphore *sem, +static __inline__ int waking_non_zero_interruptible(struct compat_semaphore *sem, struct task_struct *tsk) { unsigned long flags; @@ -70,7 +70,7 @@ static __inline__ int waking_non_zero_in * protected by the spinlock in order to make atomic this atomic_inc() with the * atomic_read() in wake_one_more(), otherwise we can race. -arca */ -static __inline__ int waking_non_zero_trylock(struct semaphore *sem) +static __inline__ int waking_non_zero_trylock(struct compat_semaphore *sem) { unsigned long flags; int ret = 1; Index: linux-2.6.24-rc8-rt1/include/asm-sh/semaphore.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-sh/semaphore.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-sh/semaphore.h 2008-01-16 22:28:46.000000000 -0500 @@ -20,28 +20,35 @@ #include #include -struct semaphore { +/* + * On !PREEMPT_RT all semaphores are compat: + */ +#ifndef CONFIG_PREEMPT_RT +# define compat_semaphore semaphore +#endif + +struct compat_semaphore { atomic_t count; int sleepers; wait_queue_head_t wait; }; -#define __SEMAPHORE_INITIALIZER(name, n) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .sleepers = 0, \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ } -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1) -static inline void sema_init (struct semaphore *sem, int val) +static inline void compat_sema_init (struct compat_semaphore *sem, int val) { /* - * *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val); + * *sem = (struct compat_semaphore)__SEMAPHORE_INITIALIZER((*sem),val); * * i'd rather use the more flexible initialization above, but sadly * GCC 2.7.2.3 emits a bogus warning. EGCS doesn't. Oh well. @@ -51,14 +58,14 @@ static inline void sema_init (struct sem init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX (struct semaphore *sem) +static inline void compat_init_MUTEX (struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 1); } -static inline void init_MUTEX_LOCKED (struct semaphore *sem) +static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem) { - sema_init(sem, 0); + compat_sema_init(sem, 0); } #if 0 @@ -68,36 +75,36 @@ asmlinkage int __down_failed_trylock(vo asmlinkage void __up_wakeup(void /* special register calling convention */); #endif -asmlinkage void __down(struct semaphore * sem); -asmlinkage int __down_interruptible(struct semaphore * sem); -asmlinkage int __down_trylock(struct semaphore * sem); -asmlinkage void __up(struct semaphore * sem); +asmlinkage void __compat_down(struct compat_semaphore * sem); +asmlinkage int __compat_down_interruptible(struct compat_semaphore * sem); +asmlinkage int __compat_down_trylock(struct compat_semaphore * sem); +asmlinkage void __compat_up(struct compat_semaphore * sem); extern spinlock_t semaphore_wake_lock; -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); if (atomic_dec_return(&sem->count) < 0) - __down(sem); + __compat_down(sem); } -static inline int down_interruptible(struct semaphore * sem) +static inline int compat_down_interruptible(struct compat_semaphore * sem) { int ret = 0; might_sleep(); if (atomic_dec_return(&sem->count) < 0) - ret = __down_interruptible(sem); + ret = __compat_down_interruptible(sem); return ret; } -static inline int down_trylock(struct semaphore * sem) +static inline int compat_down_trylock(struct compat_semaphore * sem) { int ret = 0; if (atomic_dec_return(&sem->count) < 0) - ret = __down_trylock(sem); + ret = __compat_down_trylock(sem); return ret; } @@ -105,11 +112,17 @@ static inline int down_trylock(struct se * Note! This is subtle. We jump to wake people up only if * the semaphore was negative (== somebody was waiting on it). */ -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { if (atomic_inc_return(&sem->count) <= 0) - __up(sem); + __compat_up(sem); } +extern int compat_sem_is_locked(struct compat_semaphore *sem); + +#define compat_sema_count(sem) atomic_read(&(sem)->count) + +#include + #endif #endif /* __ASM_SH_SEMAPHORE_H */ Index: linux-2.6.24-rc8-rt1/include/asm-sh/system.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-sh/system.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-sh/system.h 2008-01-16 22:28:46.000000000 -0500 @@ -159,10 +159,10 @@ static inline unsigned long xchg_u32(vol { unsigned long flags, retval; - local_irq_save(flags); + raw_local_irq_save(flags); retval = *m; *m = val; - local_irq_restore(flags); + raw_local_irq_restore(flags); return retval; } @@ -170,10 +170,10 @@ static inline unsigned long xchg_u8(vola { unsigned long flags, retval; - local_irq_save(flags); + raw_local_irq_save(flags); retval = *m; *m = val & 0xff; - local_irq_restore(flags); + raw_local_irq_restore(flags); return retval; } @@ -208,11 +208,11 @@ static inline unsigned long __cmpxchg_u3 __u32 retval; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); retval = *m; if (retval == old) *m = new; - local_irq_restore(flags); /* implies memory barrier */ + raw_local_irq_restore(flags); /* implies memory barrier */ return retval; } Index: linux-2.6.24-rc8-rt1/include/asm-sh/thread_info.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-sh/thread_info.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-sh/thread_info.h 2008-01-16 22:28:46.000000000 -0500 @@ -111,6 +111,7 @@ static inline struct thread_info *curren #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ #define TIF_RESTORE_SIGMASK 3 /* restore signal mask in do_signal() */ #define TIF_SINGLESTEP 4 /* singlestepping active */ +#define TIF_NEED_RESCHED_DELAYED 6 /* reschedule on return to userspace */ #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 18 @@ -121,6 +122,7 @@ static inline struct thread_info *curren #define _TIF_NEED_RESCHED (1< #include -DEFINE_SPINLOCK(i8253_lock); +DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); /* Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/microcode.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/microcode.c 2008-01-16 22:27:31.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/microcode.c 2008-01-16 22:28:47.000000000 -0500 @@ -117,7 +117,7 @@ MODULE_LICENSE("GPL"); #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) /* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); +static DEFINE_RAW_SPINLOCK(microcode_update_lock); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DEFINE_MUTEX(microcode_mutex); Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/signal_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/signal_32.c 2008-01-16 22:27:31.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/signal_32.c 2008-01-16 22:28:47.000000000 -0500 @@ -536,6 +536,13 @@ handle_signal(unsigned long sig, siginfo } } +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so * that register information in the sigcontext is correct. @@ -576,6 +583,13 @@ static void fastcall do_signal(struct pt struct k_sigaction ka; sigset_t *oldset; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux-2.6.24-rc8-rt1/arch/x86/kernel/vm86_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/kernel/vm86_32.c 2008-01-16 22:27:31.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/kernel/vm86_32.c 2008-01-16 22:28:47.000000000 -0500 @@ -135,6 +135,7 @@ struct pt_regs * fastcall save_v86_state local_irq_enable(); if (!current->thread.vm86_info) { + local_irq_disable(); printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } Index: linux-2.6.24-rc8-rt1/arch/x86/mm/pgtable_32.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/mm/pgtable_32.c 2008-01-16 22:27:31.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/mm/pgtable_32.c 2008-01-16 22:28:47.000000000 -0500 @@ -210,7 +210,7 @@ void pmd_ctor(struct kmem_cache *cache, * vmalloc faults work because attached pagetables are never freed. * -- wli */ -DEFINE_SPINLOCK(pgd_lock); +DEFINE_RAW_SPINLOCK(pgd_lock); struct page *pgd_list; static inline void pgd_list_add(pgd_t *pgd) Index: linux-2.6.24-rc8-rt1/arch/x86/pci/common.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/pci/common.c 2008-01-16 22:27:31.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/pci/common.c 2008-01-16 22:28:47.000000000 -0500 @@ -54,7 +54,7 @@ int pcibios_scanned; * This interrupt-safe spinlock protects all accesses to PCI * configuration space. */ -DEFINE_SPINLOCK(pci_config_lock); +DEFINE_RAW_SPINLOCK(pci_config_lock); /* * Several buggy motherboards address only 16 devices and mirror Index: linux-2.6.24-rc8-rt1/arch/x86/pci/direct.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/pci/direct.c 2008-01-16 22:27:31.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/pci/direct.c 2008-01-16 22:28:47.000000000 -0500 @@ -220,16 +220,23 @@ static int __init pci_check_type1(void) unsigned int tmp; int works = 0; - local_irq_save(flags); + spin_lock_irqsave(&pci_config_lock, flags); outb(0x01, 0xCFB); tmp = inl(0xCF8); outl(0x80000000, 0xCF8); - if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) { - works = 1; + + if (inl(0xCF8) == 0x80000000) { + spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf1)) + works = 1; + + spin_lock_irqsave(&pci_config_lock, flags); } outl(tmp, 0xCF8); - local_irq_restore(flags); + + spin_unlock_irqrestore(&pci_config_lock, flags); return works; } @@ -239,17 +246,19 @@ static int __init pci_check_type2(void) unsigned long flags; int works = 0; - local_irq_save(flags); + spin_lock_irqsave(&pci_config_lock, flags); outb(0x00, 0xCFB); outb(0x00, 0xCF8); outb(0x00, 0xCFA); - if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 && - pci_sanity_check(&pci_direct_conf2)) { - works = 1; - } - local_irq_restore(flags); + if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00) { + spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf2)) + works = 1; + } else + spin_unlock_irqrestore(&pci_config_lock, flags); return works; } Index: linux-2.6.24-rc8-rt1/arch/x86/pci/pci.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/arch/x86/pci/pci.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/arch/x86/pci/pci.h 2008-01-16 22:28:47.000000000 -0500 @@ -80,7 +80,7 @@ struct irq_routing_table { extern unsigned int pcibios_irq_mask; extern int pcibios_scanned; -extern spinlock_t pci_config_lock; +extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); Index: linux-2.6.24-rc8-rt1/include/asm-x86/acpi_32.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/acpi_32.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/acpi_32.h 2008-01-16 22:28:47.000000000 -0500 @@ -52,8 +52,8 @@ #define ACPI_ASM_MACROS #define BREAKPOINT3 -#define ACPI_DISABLE_IRQS() local_irq_disable() -#define ACPI_ENABLE_IRQS() local_irq_enable() +#define ACPI_DISABLE_IRQS() local_irq_disable_nort() +#define ACPI_ENABLE_IRQS() local_irq_enable_nort() #define ACPI_FLUSH_CPU_CACHE() wbinvd() int __acpi_acquire_global_lock(unsigned int *lock); Index: linux-2.6.24-rc8-rt1/include/asm-x86/dma_32.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/dma_32.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/dma_32.h 2008-01-16 22:28:47.000000000 -0500 @@ -134,7 +134,7 @@ #define DMA_AUTOINIT 0x10 -extern spinlock_t dma_spin_lock; +extern spinlock_t dma_spin_lock; static __inline__ unsigned long claim_dma_lock(void) { Index: linux-2.6.24-rc8-rt1/include/asm-x86/highmem.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/highmem.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/highmem.h 2008-01-16 22:29:04.000000000 -0500 @@ -67,6 +67,16 @@ extern void * FASTCALL(kmap_high(struct extern void FASTCALL(kunmap_high(struct page *page)); void *kmap(struct page *page); +extern void kunmap_virt(void *ptr); +extern struct page *kmap_to_page(void *ptr); +void kunmap(struct page *page); + +void *__kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); +void *__kmap_atomic(struct page *page, enum km_type type); +void __kunmap_atomic(void *kvaddr, enum km_type type); +void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type); +struct page *__kmap_atomic_to_page(void *ptr); + void kunmap(struct page *page); void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); void *kmap_atomic(struct page *page, enum km_type type); @@ -80,6 +90,23 @@ struct page *kmap_atomic_to_page(void *p #define flush_cache_kmaps() do { } while (0) +/* + * on PREEMPT_RT kmap_atomic() is a wrapper that uses kmap(): + */ +#ifdef CONFIG_PREEMPT_RT +# define kmap_atomic_prot(page, type, prot) ({ pagefault_disable(); kmap(page); }) +# define kmap_atomic(page, type) ({ pagefault_disable(); kmap(page); }) +# define kmap_atomic_pfn(pfn, type) kmap(pfn_to_page(pfn)) +# define kunmap_atomic(kvaddr, type) do { pagefault_enable(); kunmap_virt(kvaddr); } while(0) +# define kmap_atomic_to_page(kvaddr) kmap_to_page(kvaddr) +#else +# define kmap_atomic_prot(page, type, prot) __kmap_atomic_prot(page, type, prot) +# define kmap_atomic(page, type) __kmap_atomic(page, type) +# define kmap_atomic_pfn(pfn, type) __kmap_atomic_pfn(pfn, type) +# define kunmap_atomic(kvaddr, type) __kunmap_atomic(kvaddr, type) +# define kmap_atomic_to_page(kvaddr) __kmap_atomic_to_page(kvaddr) +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ Index: linux-2.6.24-rc8-rt1/include/asm-x86/i8253.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/i8253.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/i8253.h 2008-01-16 22:28:47.000000000 -0500 @@ -6,7 +6,7 @@ #define PIT_CH0 0x40 #define PIT_CH2 0x42 -extern spinlock_t i8253_lock; +extern raw_spinlock_t i8253_lock; extern struct clock_event_device *global_clock_event; Index: linux-2.6.24-rc8-rt1/include/asm-x86/i8259.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/i8259.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/i8259.h 2008-01-16 22:28:47.000000000 -0500 @@ -7,7 +7,7 @@ extern unsigned int cached_irq_mask; #define cached_master_mask (__byte(0, cached_irq_mask)) #define cached_slave_mask (__byte(1, cached_irq_mask)) -extern spinlock_t i8259A_lock; +extern raw_spinlock_t i8259A_lock; extern void init_8259A(int auto_eoi); extern void enable_8259A_irq(unsigned int irq); Index: linux-2.6.24-rc8-rt1/include/asm-x86/mach-default/irq_vectors.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/mach-default/irq_vectors.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/mach-default/irq_vectors.h 2008-01-16 22:28:47.000000000 -0500 @@ -63,7 +63,7 @@ * levels. (0x80 is the syscall vector) */ #define FIRST_DEVICE_VECTOR 0x31 -#define FIRST_SYSTEM_VECTOR 0xef +#define FIRST_SYSTEM_VECTOR 0xee #define TIMER_IRQ 0 Index: linux-2.6.24-rc8-rt1/include/asm-x86/mc146818rtc_32.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/mc146818rtc_32.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/mc146818rtc_32.h 2008-01-16 22:28:47.000000000 -0500 @@ -72,7 +72,7 @@ static inline unsigned char current_lock lock_cmos(reg) #define lock_cmos_suffix(reg) \ unlock_cmos(); \ - local_irq_restore(cmos_flags); \ + local_irq_restore(cmos_flags); \ } while (0) #else #define lock_cmos_prefix(reg) do {} while (0) Index: linux-2.6.24-rc8-rt1/include/asm-x86/pgtable_32.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/pgtable_32.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/pgtable_32.h 2008-01-16 22:28:47.000000000 -0500 @@ -33,7 +33,7 @@ struct vm_area_struct; extern unsigned long empty_zero_page[1024]; extern pgd_t swapper_pg_dir[1024]; extern struct kmem_cache *pmd_cache; -extern spinlock_t pgd_lock; +extern raw_spinlock_t pgd_lock; extern struct page *pgd_list; void check_pgt_cache(void); Index: linux-2.6.24-rc8-rt1/include/asm-x86/tlbflush_32.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/tlbflush_32.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/tlbflush_32.h 2008-01-16 22:28:47.000000000 -0500 @@ -4,6 +4,21 @@ #include #include +/* + * TLB-flush needs to be nonpreemptible on PREEMPT_RT due to the + * following complex race scenario: + * + * if the current task is lazy-TLB and does a TLB flush and + * gets preempted after the movl %%r3, %0 but before the + * movl %0, %%cr3 then its ->active_mm might change and it will + * install the wrong cr3 when it switches back. This is not a + * problem for the lazy-TLB task itself, but if the next task it + * switches to has an ->mm that is also the lazy-TLB task's + * new ->active_mm, then the scheduler will assume that cr3 is + * the new one, while we overwrote it with the old one. The result + * is the wrong cr3 in the new (non-lazy-TLB) task, which typically + * causes an infinite pagefault upon the next userspace access. + */ #ifdef CONFIG_PARAVIRT #include #else @@ -16,11 +31,13 @@ do { \ unsigned int tmpreg; \ \ + preempt_disable(); \ __asm__ __volatile__( \ "movl %%cr3, %0; \n" \ "movl %0, %%cr3; # flush TLB \n" \ : "=r" (tmpreg) \ :: "memory"); \ + preempt_enable(); \ } while (0) /* @@ -31,6 +48,7 @@ do { \ unsigned int tmpreg, cr4, cr4_orig; \ \ + preempt_disable(); \ __asm__ __volatile__( \ "movl %%cr4, %2; # turn off PGE \n" \ "movl %2, %1; \n" \ @@ -42,6 +60,7 @@ : "=&r" (tmpreg), "=&r" (cr4), "=&r" (cr4_orig) \ : "i" (~X86_CR4_PGE) \ : "memory"); \ + preempt_enable(); \ } while (0) #define __native_flush_tlb_single(addr) \ @@ -97,6 +116,13 @@ static inline void flush_tlb_mm(struct mm_struct *mm) { + /* + * This is safe on PREEMPT_RT because if we preempt + * right after the check but before the __flush_tlb(), + * and if ->active_mm changes, then we might miss a + * TLB flush, but that TLB flush happened already when + * ->active_mm was changed: + */ if (mm == current->active_mm) __flush_tlb(); } Index: linux-2.6.24-rc8-rt1/include/asm-x86/xor_32.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/asm-x86/xor_32.h 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/asm-x86/xor_32.h 2008-01-16 22:28:47.000000000 -0500 @@ -862,7 +862,21 @@ static struct xor_block_template xor_blo #include #undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ +/* + * MMX/SSE ops disable preemption for long periods of time, + * so on PREEMPT_RT use the register-based ops only: + */ +#ifdef CONFIG_PREEMPT_RT +# define XOR_TRY_TEMPLATES \ + do { \ + xor_speed(&xor_block_8regs); \ + xor_speed(&xor_block_8regs_p); \ + xor_speed(&xor_block_32regs); \ + xor_speed(&xor_block_32regs_p); \ + } while (0) +# define XOR_SELECT_TEMPLATE(FASTEST) (FASTEST) +#else +# define XOR_TRY_TEMPLATES \ do { \ xor_speed(&xor_block_8regs); \ xor_speed(&xor_block_8regs_p); \ @@ -875,9 +889,10 @@ static struct xor_block_template xor_blo xor_speed(&xor_block_p5_mmx); \ } \ } while (0) - /* We force the use of the SSE xor block because it can write around L2. We may also be able to load into the L1 only depending on how the cpu deals with a load to a line that is being prefetched. */ -#define XOR_SELECT_TEMPLATE(FASTEST) \ +# define XOR_SELECT_TEMPLATE(FASTEST) \ (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) +#endif + Index: linux-2.6.24-rc8-rt1/kernel/Kconfig.instrumentation =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/Kconfig.instrumentation 2008-01-16 22:27:32.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/Kconfig.instrumentation 2008-01-16 22:28:47.000000000 -0500 @@ -29,6 +29,11 @@ config OPROFILE If unsure, say N. +config PROFILE_NMI + bool + depends on OPROFILE + default y + config KPROBES bool "Kprobes" depends on KALLSYMS && MODULES && !UML Index: linux-2.6.24-rc8-rt1/include/linux/mm_types.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/mm_types.h 2008-01-16 22:27:31.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/mm_types.h 2008-01-16 22:28:48.000000000 -0500 @@ -199,6 +199,9 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; + /* realtime bits */ + struct list_head delayed_drop; + /* Swap token stuff */ /* * Last value of global fault stamp as seen by this process. Index: linux-2.6.24-rc8-rt1/include/linux/completion.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/completion.h 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/completion.h 2008-01-16 22:28:49.000000000 -0500 @@ -48,6 +48,7 @@ extern unsigned long wait_for_completion unsigned long timeout); extern unsigned long wait_for_completion_interruptible_timeout( struct completion *x, unsigned long timeout); +extern unsigned int completion_done(struct completion *x); extern void complete(struct completion *); extern void complete_all(struct completion *); Index: linux-2.6.24-rc8-rt1/include/linux/radix-tree.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/radix-tree.h 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/radix-tree.h 2008-01-16 22:29:03.000000000 -0500 @@ -62,23 +62,65 @@ struct radix_tree_root { unsigned int height; gfp_t gfp_mask; struct radix_tree_node *rnode; + spinlock_t lock; }; #define RADIX_TREE_INIT(mask) { \ .height = 0, \ .gfp_mask = (mask), \ .rnode = NULL, \ + .lock = __SPIN_LOCK_UNLOCKED(radix_tree_root.lock), \ } #define RADIX_TREE(name, mask) \ struct radix_tree_root name = RADIX_TREE_INIT(mask) -#define INIT_RADIX_TREE(root, mask) \ -do { \ - (root)->height = 0; \ - (root)->gfp_mask = (mask); \ - (root)->rnode = NULL; \ -} while (0) +static inline void INIT_RADIX_TREE(struct radix_tree_root *root, gfp_t gfp_mask) +{ + root->height = 0; + root->gfp_mask = gfp_mask; + root->rnode = NULL; + spin_lock_init(&root->lock); +} + +struct radix_tree_context { + struct radix_tree_root *tree; + struct radix_tree_root *root; +#ifdef CONFIG_RADIX_TREE_CONCURRENT + spinlock_t *locked; +#endif +}; + +#ifdef CONFIG_RADIX_TREE_CONCURRENT +#define RADIX_CONTEXT_ROOT(context) \ + ((struct radix_tree_root *)(((unsigned long)context) + 1)) + +#define __RADIX_TREE_CONTEXT_INIT(context, _tree) \ + .tree = RADIX_CONTEXT_ROOT(&context), \ + .locked = NULL, +#else +#define __RADIX_TREE_CONTEXT_INIT(context, _tree) \ + .tree = (_tree), +#endif + +#define DEFINE_RADIX_TREE_CONTEXT(context, _tree) \ + struct radix_tree_context context = { \ + .root = (_tree), \ + __RADIX_TREE_CONTEXT_INIT(context, _tree) \ + } + +static inline void +init_radix_tree_context(struct radix_tree_context *ctx, + struct radix_tree_root *root) +{ + ctx->root = root; +#ifdef CONFIG_RADIX_TREE_CONCURRENT + ctx->tree = RADIX_CONTEXT_ROOT(ctx); + ctx->locked = NULL; +#else + ctx->tree = root; +#endif +} /** * Radix-tree synchronization @@ -99,12 +141,15 @@ do { \ * * The notable exceptions to this rule are the following functions: * radix_tree_lookup + * radix_tree_lookup_slot * radix_tree_tag_get * radix_tree_gang_lookup + * radix_tree_gang_lookup_slot * radix_tree_gang_lookup_tag + * radix_tree_gang_lookup_tag_slot * radix_tree_tagged * - * The first 4 functions are able to be called locklessly, using RCU. The + * The first 7 functions are able to be called locklessly, using RCU. The * caller must ensure calls to these functions are made within rcu_read_lock() * regions. Other readers (lock-free or otherwise) and modifications may be * running concurrently. @@ -152,6 +197,48 @@ static inline void radix_tree_replace_sl rcu_assign_pointer(*pslot, item); } +#if defined(CONFIG_RADIX_TREE_OPTIMISTIC) +static inline void radix_tree_lock(struct radix_tree_context *context) +{ + rcu_read_lock(); + BUG_ON(context->locked); +} +#elif defined(CONFIG_RADIX_TREE_CONCURRENT) +static inline void radix_tree_lock(struct radix_tree_context *context) +{ + struct radix_tree_root *root = context->root; + + rcu_read_lock(); + spin_lock(&root->lock); + BUG_ON(context->locked); + context->locked = &root->lock; +} +#else +static inline void radix_tree_lock(struct radix_tree_context *context) +{ + struct radix_tree_root *root = context->root; + + rcu_read_lock(); + spin_lock(&root->lock); +} +#endif + +#if defined(CONFIG_RADIX_TREE_CONCURRENT) +static inline void radix_tree_unlock(struct radix_tree_context *context) +{ + BUG_ON(!context->locked); + spin_unlock(context->locked); + context->locked = NULL; + rcu_read_unlock(); +} +#else +static inline void radix_tree_unlock(struct radix_tree_context *context) +{ + spin_unlock(&context->root->lock); + rcu_read_unlock(); +} +#endif + int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); @@ -159,9 +246,23 @@ void *radix_tree_delete(struct radix_tre unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items); +unsigned int +radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, + unsigned long first_index, unsigned int max_items); unsigned long radix_tree_next_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); +/* + * On a mutex based kernel we can freely schedule within the radix code: + */ +#ifdef CONFIG_PREEMPT_RT +static inline int radix_tree_preload(gfp_t gfp_mask) +{ + return 0; +} +#else int radix_tree_preload(gfp_t gfp_mask); +#endif + void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag); @@ -173,11 +274,17 @@ unsigned int radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items, unsigned int tag); +unsigned int +radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, + unsigned long first_index, unsigned int max_items, + unsigned int tag); int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); static inline void radix_tree_preload_end(void) { +#ifndef CONFIG_PREEMPT_RT preempt_enable(); +#endif } #endif /* _LINUX_RADIX_TREE_H */ Index: linux-2.6.24-rc8-rt1/include/linux/smp_lock.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/smp_lock.h 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/smp_lock.h 2008-01-16 22:28:49.000000000 -0500 @@ -17,6 +17,8 @@ extern void __lockfunc __release_kernel_ __release_kernel_lock(); \ } while (0) + + /* * Non-SMP kernels will never block on the kernel lock, * so we are better off returning a constant zero from @@ -44,7 +46,7 @@ extern void __lockfunc unlock_kernel(voi #define lock_kernel() do { } while(0) #define unlock_kernel() do { } while(0) #define release_kernel_lock(task) do { } while(0) -#define reacquire_kernel_lock(task) 0 +#define reacquire_kernel_lock(task) do { } while(0) #define kernel_locked() 1 #endif /* CONFIG_LOCK_KERNEL */ Index: linux-2.6.24-rc8-rt1/include/linux/workqueue.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/workqueue.h 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/workqueue.h 2008-01-16 22:29:23.000000000 -0500 @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include struct workqueue_struct; @@ -27,7 +29,7 @@ struct work_struct { #define WORK_STRUCT_PENDING 0 /* T if work item pending execution */ #define WORK_STRUCT_FLAG_MASK (3UL) #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK) - struct list_head entry; + struct plist_node entry; work_func_t func; #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; @@ -39,6 +41,7 @@ struct work_struct { struct delayed_work { struct work_struct work; struct timer_list timer; + int prio; }; struct execute_work { @@ -59,7 +62,7 @@ struct execute_work { #define __WORK_INITIALIZER(n, f) { \ .data = WORK_DATA_INIT(), \ - .entry = { &(n).entry, &(n).entry }, \ + .entry = PLIST_NODE_INIT(n.entry, MAX_PRIO), \ .func = (f), \ __WORK_INIT_LOCKDEP_MAP(#n, &(n)) \ } @@ -100,14 +103,14 @@ struct execute_work { \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ lockdep_init_map(&(_work)->lockdep_map, #_work, &__key, 0);\ - INIT_LIST_HEAD(&(_work)->entry); \ + plist_node_init(&(_work)->entry, -1); \ PREPARE_WORK((_work), (_func)); \ } while (0) #else #define INIT_WORK(_work, _func) \ do { \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ - INIT_LIST_HEAD(&(_work)->entry); \ + plist_node_init(&(_work)->entry, -1); \ PREPARE_WORK((_work), (_func)); \ } while (0) #endif @@ -168,6 +171,9 @@ __create_workqueue_key(const char *name, #define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1) #define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0) +extern void set_workqueue_prio(struct workqueue_struct *wq, int policy, + int rt_priority, int nice); + extern void destroy_workqueue(struct workqueue_struct *wq); extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work)); @@ -184,7 +190,8 @@ extern int FASTCALL(schedule_delayed_wor unsigned long delay)); extern int schedule_delayed_work_on(int cpu, struct delayed_work *work, unsigned long delay); -extern int schedule_on_each_cpu(work_func_t func); +extern int schedule_on_each_cpu_wq(struct workqueue_struct *wq, work_func_t func); +extern int schedule_on_each_cpu(void (*func)(void *info), void *info, int retry, int wait); extern int current_is_keventd(void); extern int keventd_up(void); Index: linux-2.6.24-rc8-rt1/kernel/exit.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/exit.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/exit.c 2008-01-16 22:28:49.000000000 -0500 @@ -63,7 +63,9 @@ static void __unhash_process(struct task detach_pid(p, PIDTYPE_SID); list_del_rcu(&p->tasks); + preempt_disable(); __get_cpu_var(process_counts)--; + preempt_enable(); } list_del_rcu(&p->thread_group); remove_parent(p); @@ -585,9 +587,11 @@ static void exit_mm(struct task_struct * task_lock(tsk); tsk->mm = NULL; up_read(&mm->mmap_sem); + preempt_disable(); // FIXME enter_lazy_tlb(mm, current); /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); + preempt_enable(); task_unlock(tsk); mmput(mm); } @@ -1042,15 +1046,18 @@ fastcall NORET_TYPE void do_exit(long co if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); - preempt_disable(); +again: + local_irq_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; - schedule(); - BUG(); - /* Avoid "noreturn function does return". */ - for (;;) - cpu_relax(); /* For when BUG is null */ + __schedule(); + printk(KERN_ERR "BUG: dead task %s:%d back from the grave!\n", + current->comm, current->pid); + printk(KERN_ERR ".... flags: %08x, count: %d, state: %08lx\n", + current->flags, atomic_read(¤t->usage), current->state); + printk(KERN_ERR ".... trying again ...\n"); + goto again; } EXPORT_SYMBOL_GPL(do_exit); @@ -1553,6 +1560,7 @@ repeat: int ret; list_for_each_entry(p, &tsk->children, sibling) { + BUG_ON(!atomic_read(&p->usage)); ret = eligible_child(pid, options, p); if (!ret) continue; Index: linux-2.6.24-rc8-rt1/kernel/signal.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/signal.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/signal.c 2008-01-16 22:28:49.000000000 -0500 @@ -762,8 +762,10 @@ specific_send_sig_info(int sig, struct s { int ret = 0; - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); +#ifdef CONFIG_SMP assert_spin_locked(&t->sighand->siglock); +#endif /* Short-circuit ignored signals. */ if (sig_ignored(t, sig)) @@ -1624,6 +1626,7 @@ static void ptrace_stop(int exit_code, i if (may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); read_unlock(&tasklist_lock); + current->flags &= ~PF_NOSCHED; schedule(); } else { /* @@ -1684,6 +1687,7 @@ finish_stop(int stop_count) } do { + current->flags &= ~PF_NOSCHED; schedule(); } while (try_to_freeze()); /* @@ -1795,6 +1799,9 @@ int get_signal_to_deliver(siginfo_t *inf try_to_freeze(); +#ifdef CONFIG_PREEMPT_RT + might_sleep(); +#endif relock: spin_lock_irq(¤t->sighand->siglock); for (;;) { Index: linux-2.6.24-rc8-rt1/kernel/user.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/user.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/user.c 2008-01-16 22:29:25.000000000 -0500 @@ -225,14 +225,14 @@ static void remove_user_sysfs_dir(struct */ uids_mutex_lock(); - local_irq_save(flags); + local_irq_save_nort(flags); if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { uid_hash_remove(up); remove_user = 1; spin_unlock_irqrestore(&uidhash_lock, flags); } else { - local_irq_restore(flags); + local_irq_restore_nort(flags); } if (!remove_user) @@ -312,11 +312,11 @@ void free_uid(struct user_struct *up) if (!up) return; - local_irq_save(flags); + local_irq_save_nort(flags); if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) free_user(up, flags); else - local_irq_restore(flags); + local_irq_restore_nort(flags); } struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) Index: linux-2.6.24-rc8-rt1/kernel/workqueue.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/workqueue.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/workqueue.c 2008-01-16 22:29:23.000000000 -0500 @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,11 @@ #include #include #include +#include + +#include + +struct wq_barrier; /* * The per-CPU workqueue (if single thread, we always use the first @@ -42,7 +48,7 @@ struct cpu_workqueue_struct { spinlock_t lock; - struct list_head worklist; + struct plist_head worklist; wait_queue_head_t more_work; struct work_struct *current_work; @@ -50,6 +56,9 @@ struct cpu_workqueue_struct { struct task_struct *thread; int run_depth; /* Detect run_workqueue() recursion depth */ + + wait_queue_head_t work_done; + struct wq_barrier *barrier; } ____cacheline_aligned; /* @@ -126,7 +135,7 @@ struct cpu_workqueue_struct *get_wq_data } static void insert_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work, int tail) + struct work_struct *work, int prio, int boost_prio) { set_wq_data(work, cwq); /* @@ -134,21 +143,22 @@ static void insert_work(struct cpu_workq * result of list_add() below, see try_to_grab_pending(). */ smp_wmb(); - if (tail) - list_add_tail(&work->entry, &cwq->worklist); - else - list_add(&work->entry, &cwq->worklist); + plist_node_init(&work->entry, prio); + plist_add(&work->entry, &cwq->worklist); + + if (boost_prio < cwq->thread->prio) + task_setprio(cwq->thread, boost_prio); wake_up(&cwq->more_work); } /* Preempt must be disabled. */ static void __queue_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work) + struct work_struct *work, int prio) { unsigned long flags; spin_lock_irqsave(&cwq->lock, flags); - insert_work(cwq, work, 1); + insert_work(cwq, work, prio, prio); spin_unlock_irqrestore(&cwq->lock, flags); } @@ -161,15 +171,16 @@ static void __queue_work(struct cpu_work * * We queue the work to the CPU it was submitted, but there is no * guarantee that it will be processed by that CPU. + * + * Especially no such guarantee on PREEMPT_RT. */ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) { - int ret = 0; + int ret = 0, cpu = raw_smp_processor_id(); if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - BUG_ON(!list_empty(&work->entry)); - __queue_work(wq_per_cpu(wq, get_cpu()), work); - put_cpu(); + BUG_ON(!plist_node_empty(&work->entry)); + __queue_work(wq_per_cpu(wq, cpu), work, current->normal_prio); ret = 1; } return ret; @@ -182,7 +193,8 @@ void delayed_work_timer_fn(unsigned long struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); struct workqueue_struct *wq = cwq->wq; - __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); + __queue_work(wq_per_cpu(wq, raw_smp_processor_id()), + &dwork->work, dwork->prio); } /** @@ -222,9 +234,10 @@ int queue_delayed_work_on(int cpu, struc if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { BUG_ON(timer_pending(timer)); - BUG_ON(!list_empty(&work->entry)); + BUG_ON(!plist_node_empty(&work->entry)); /* This stores cwq for the moment, for the timer_fn */ + dwork->prio = current->normal_prio; set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; @@ -240,8 +253,34 @@ int queue_delayed_work_on(int cpu, struc } EXPORT_SYMBOL_GPL(queue_delayed_work_on); +static void leak_check(void *func) +{ + if (!in_atomic() && lockdep_depth(current) <= 0) + return; + printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), + current->pid); + printk(KERN_ERR " last function: "); + print_symbol("%s\n", (unsigned long)func); + debug_show_held_locks(current); + dump_stack(); +} + +struct wq_barrier { + struct work_struct work; + struct plist_head worklist; + struct wq_barrier *prev_barrier; + int prev_prio; + int waiter_prio; + struct cpu_workqueue_struct *cwq; + struct completion done; +}; + static void run_workqueue(struct cpu_workqueue_struct *cwq) { + struct plist_head *worklist = &cwq->worklist; + spin_lock_irq(&cwq->lock); cwq->run_depth++; if (cwq->run_depth > 3) { @@ -250,8 +289,11 @@ static void run_workqueue(struct cpu_wor __FUNCTION__, cwq->run_depth); dump_stack(); } - while (!list_empty(&cwq->worklist)) { - struct work_struct *work = list_entry(cwq->worklist.next, + +again: + while (!plist_head_empty(worklist)) { + int prio; + struct work_struct *work = plist_first_entry(worklist, struct work_struct, entry); work_func_t f = work->func; #ifdef CONFIG_LOCKDEP @@ -266,32 +308,56 @@ static void run_workqueue(struct cpu_wor struct lockdep_map lockdep_map = work->lockdep_map; #endif + prio = work->entry.prio; + if (unlikely(worklist != &cwq->worklist)) { + prio = min(prio, cwq->barrier->prev_prio); + prio = min(prio, cwq->barrier->waiter_prio); + prio = min(prio, plist_first(&cwq->worklist)->prio); + } + prio = max(prio, 0); + + if (likely(cwq->thread->prio != prio)) + task_setprio(cwq->thread, prio); + cwq->current_work = work; - list_del_init(cwq->worklist.next); + plist_del(&work->entry, worklist); + plist_node_init(&work->entry, MAX_PRIO); spin_unlock_irq(&cwq->lock); BUG_ON(get_wq_data(work) != cwq); work_clear_pending(work); + leak_check(NULL); lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_); f(work); lock_release(&lockdep_map, 1, _THIS_IP_); lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); - - if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), - task_pid_nr(current)); - printk(KERN_ERR " last function: "); - print_symbol("%s\n", (unsigned long)f); - debug_show_held_locks(current); - dump_stack(); - } + leak_check(f); spin_lock_irq(&cwq->lock); cwq->current_work = NULL; + wake_up_all(&cwq->work_done); + if (unlikely(cwq->barrier)) + worklist = &cwq->barrier->worklist; } + + if (unlikely(worklist != &cwq->worklist)) { + struct wq_barrier *barrier = cwq->barrier; + + BUG_ON(!barrier); + cwq->barrier = barrier->prev_barrier; + complete(&barrier->done); + + if (unlikely(cwq->barrier)) + worklist = &cwq->barrier->worklist; + else + worklist = &cwq->worklist; + + if (!plist_head_empty(worklist)) + goto again; + } + + task_setprio(cwq->thread, current->normal_prio); cwq->run_depth--; spin_unlock_irq(&cwq->lock); } @@ -310,7 +376,7 @@ static int worker_thread(void *__cwq) prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); if (!freezing(current) && !kthread_should_stop() && - list_empty(&cwq->worklist)) + plist_head_empty(&cwq->worklist)) schedule(); finish_wait(&cwq->more_work, &wait); @@ -325,26 +391,37 @@ static int worker_thread(void *__cwq) return 0; } -struct wq_barrier { - struct work_struct work; - struct completion done; -}; - static void wq_barrier_func(struct work_struct *work) { - struct wq_barrier *barr = container_of(work, struct wq_barrier, work); - complete(&barr->done); + struct wq_barrier *barrier = + container_of(work, struct wq_barrier, work); + struct cpu_workqueue_struct *cwq = barrier->cwq; + int prio = MAX_PRIO; + + spin_lock_irq(&cwq->lock); + barrier->prev_barrier = cwq->barrier; + if (cwq->barrier) { + prio = min(prio, cwq->barrier->waiter_prio); + prio = min(prio, plist_first(&cwq->barrier->worklist)->prio); + } + barrier->prev_prio = prio; + cwq->barrier = barrier; + spin_unlock_irq(&cwq->lock); } static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, - struct wq_barrier *barr, int tail) + struct wq_barrier *barr) { INIT_WORK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); + plist_head_init(&barr->worklist, NULL); + plist_head_splice(&cwq->worklist, &barr->worklist); + barr->cwq = cwq; init_completion(&barr->done); + barr->waiter_prio = current->prio; - insert_work(cwq, &barr->work, tail); + insert_work(cwq, &barr->work, 0, current->prio); } static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) @@ -363,8 +440,9 @@ static int flush_cpu_workqueue(struct cp active = 0; spin_lock_irq(&cwq->lock); - if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { - insert_wq_barrier(cwq, &barr, 1); + if (!plist_head_empty(&cwq->worklist) || + cwq->current_work != NULL) { + insert_wq_barrier(cwq, &barr); active = 1; } spin_unlock_irq(&cwq->lock); @@ -424,7 +502,7 @@ static int try_to_grab_pending(struct wo return ret; spin_lock_irq(&cwq->lock); - if (!list_empty(&work->entry)) { + if (!plist_node_empty(&work->entry)) { /* * This work is queued, but perhaps we locked the wrong cwq. * In that case we must see the new value after rmb(), see @@ -432,7 +510,8 @@ static int try_to_grab_pending(struct wo */ smp_rmb(); if (cwq == get_wq_data(work)) { - list_del_init(&work->entry); + plist_del(&work->entry, &cwq->worklist); + plist_node_init(&work->entry, MAX_PRIO); ret = 1; } } @@ -441,21 +520,24 @@ static int try_to_grab_pending(struct wo return ret; } -static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work) +static inline +int is_current_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) { - struct wq_barrier barr; - int running = 0; + int ret; spin_lock_irq(&cwq->lock); - if (unlikely(cwq->current_work == work)) { - insert_wq_barrier(cwq, &barr, 0); - running = 1; - } + ret = (cwq->current_work == work); spin_unlock_irq(&cwq->lock); - if (unlikely(running)) - wait_for_completion(&barr.done); + return ret; +} + +static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work) +{ + DEFINE_WAIT(wait); + + wait_event(cwq->work_done, !is_current_work(cwq, work)); } static void wait_on_work(struct work_struct *work) @@ -585,9 +667,28 @@ int schedule_delayed_work_on(int cpu, } EXPORT_SYMBOL(schedule_delayed_work_on); +struct schedule_on_each_cpu_work { + struct work_struct work; + void (*func)(void *info); + void *info; +}; + +static void schedule_on_each_cpu_func(struct work_struct *work) +{ + struct schedule_on_each_cpu_work *w; + + w = container_of(work, typeof(*w), work); + w->func(w->info); + + kfree(w); +} + /** * schedule_on_each_cpu - call a function on each online CPU from keventd * @func: the function to call + * @info: data to pass to function + * @retry: ignored + * @wait: wait for completion * * Returns zero on success. * Returns -ve errno on failure. @@ -596,29 +697,94 @@ EXPORT_SYMBOL(schedule_delayed_work_on); * * schedule_on_each_cpu() is very slow. */ -int schedule_on_each_cpu(work_func_t func) +int schedule_on_each_cpu(void (*func)(void *info), void *info, int retry, int wait) +{ + int cpu; + struct schedule_on_each_cpu_work **works; + int err = 0; + + works = kzalloc(sizeof(void *)*nr_cpu_ids, GFP_KERNEL); + if (!works) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + works[cpu] = kmalloc_node(sizeof(struct schedule_on_each_cpu_work), + GFP_KERNEL, cpu_to_node(cpu)); + if (!works[cpu]) { + err = -ENOMEM; + goto out; + } + } + + lock_cpu_hotplug(); + for_each_online_cpu(cpu) { + struct schedule_on_each_cpu_work *work; + + work = works[cpu]; + works[cpu] = NULL; + + work->func = func; + work->info = info; + INIT_WORK(&work->work, schedule_on_each_cpu_func); + set_bit(WORK_STRUCT_PENDING, work_data_bits(&work->work)); + __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), + &work->work, current->normal_prio); + } + unlock_cpu_hotplug(); + +out: + for_each_possible_cpu(cpu) { + if (works[cpu]) + kfree(works[cpu]); + } + kfree(works); + + if (!err && wait) + flush_workqueue(keventd_wq); + + return err; +} +EXPORT_SYMBOL(schedule_on_each_cpu); + +/** + * schedule_on_each_cpu_wq - call a function on each online CPU on a per-CPU wq + * @func: the function to call + * + * Returns zero on success. + * Returns -ve errno on failure. + * + * Appears to be racy against CPU hotplug. + * + * schedule_on_each_cpu() is very slow. + */ +int schedule_on_each_cpu_wq(struct workqueue_struct *wq, work_func_t func) { int cpu; struct work_struct *works; + if (is_single_threaded(wq)) { + WARN_ON(1); + return -EINVAL; + } works = alloc_percpu(struct work_struct); if (!works) return -ENOMEM; - preempt_disable(); /* CPU hotplug */ for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); - __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work, + current->normal_prio); } - preempt_enable(); - flush_workqueue(keventd_wq); + flush_workqueue(wq); free_percpu(works); + return 0; } + void flush_scheduled_work(void) { flush_workqueue(keventd_wq); @@ -679,8 +845,10 @@ init_cpu_workqueue(struct workqueue_stru cwq->wq = wq; spin_lock_init(&cwq->lock); - INIT_LIST_HEAD(&cwq->worklist); + plist_head_init(&cwq->worklist, NULL); init_waitqueue_head(&cwq->more_work); + cwq->barrier = NULL; + init_waitqueue_head(&cwq->work_done); return cwq; } @@ -797,6 +965,47 @@ static void cleanup_workqueue_thread(str cwq->thread = NULL; } +void set_workqueue_thread_prio(struct workqueue_struct *wq, int cpu, + int policy, int rt_priority, int nice) +{ + struct sched_param param = { .sched_priority = rt_priority }; + struct cpu_workqueue_struct *cwq; + mm_segment_t oldfs = get_fs(); + struct task_struct *p; + unsigned long flags; + int ret; + + cwq = per_cpu_ptr(wq->cpu_wq, cpu); + spin_lock_irqsave(&cwq->lock, flags); + p = cwq->thread; + spin_unlock_irqrestore(&cwq->lock, flags); + + set_user_nice(p, nice); + + set_fs(KERNEL_DS); + ret = sys_sched_setscheduler(p->pid, policy, ¶m); + set_fs(oldfs); + + WARN_ON(ret); +} + + void set_workqueue_prio(struct workqueue_struct *wq, int policy, + int rt_priority, int nice) +{ + int cpu; + + /* We don't need the distraction of CPUs appearing and vanishing. */ + mutex_lock(&workqueue_mutex); + if (is_single_threaded(wq)) + set_workqueue_thread_prio(wq, 0, policy, rt_priority, nice); + else { + for_each_online_cpu(cpu) + set_workqueue_thread_prio(wq, cpu, policy, + rt_priority, nice); + } + mutex_unlock(&workqueue_mutex); +} + /** * destroy_workqueue - safely terminate a workqueue * @wq: target workqueue @@ -879,4 +1088,5 @@ void __init init_workqueues(void) hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); + set_workqueue_prio(keventd_wq, SCHED_FIFO, 1, -20); } Index: linux-2.6.24-rc8-rt1/lib/radix-tree.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/lib/radix-tree.c 2008-01-16 22:27:31.000000000 -0500 +++ linux-2.6.24-rc8-rt1/lib/radix-tree.c 2008-01-16 22:29:03.000000000 -0500 @@ -32,6 +32,7 @@ #include #include #include +#include #ifdef __KERNEL__ @@ -52,11 +53,17 @@ struct radix_tree_node { struct rcu_head rcu_head; void *slots[RADIX_TREE_MAP_SIZE]; unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; +#ifdef CONFIG_RADIX_TREE_CONCURRENT + spinlock_t lock; +#endif }; struct radix_tree_path { struct radix_tree_node *node; int offset; +#ifdef CONFIG_RADIX_TREE_CONCURRENT + spinlock_t *locked; +#endif }; #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) @@ -69,6 +76,129 @@ struct radix_tree_path { */ static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly; +#ifdef CONFIG_RADIX_TREE_CONCURRENT +static struct lock_class_key radix_node_class[RADIX_TREE_MAX_PATH]; +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static const char *radix_node_key_string[RADIX_TREE_MAX_PATH] = { + "radix-node-00", + "radix-node-01", + "radix-node-02", + "radix-node-03", + "radix-node-04", + "radix-node-05", + "radix-node-06", + "radix-node-07", + "radix-node-08", + "radix-node-09", + "radix-node-10", + "radix-node-11", + "radix-node-12", + "radix-node-13", + "radix-node-14", + "radix-node-15", +}; +#endif + +#ifdef CONFIG_RADIX_TREE_OPTIMISTIC +static DEFINE_PER_CPU(unsigned long[RADIX_TREE_MAX_PATH+1], optimistic_histogram); + +static void optimistic_hit(unsigned long height) +{ + if (height > RADIX_TREE_MAX_PATH) + height = RADIX_TREE_MAX_PATH; + + __get_cpu_var(optimistic_histogram)[height]++; +} + +#ifdef CONFIG_PROC_FS + +#include +#include + +static void *frag_start(struct seq_file *m, loff_t *pos) +{ + if (*pos < 0 || *pos > RADIX_TREE_MAX_PATH) + return NULL; + + m->private = (void *)(unsigned long)*pos; + return pos; +} + +static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) +{ + if (*pos < RADIX_TREE_MAX_PATH) { + (*pos)++; + (*((unsigned long *)&m->private))++; + return pos; + } + return NULL; +} + +static void frag_stop(struct seq_file *m, void *arg) +{ +} + +unsigned long get_optimistic_stat(unsigned long index) +{ + unsigned long total = 0; + int cpu; + + for_each_possible_cpu(cpu) { + total += per_cpu(optimistic_histogram, cpu)[index]; + } + return total; +} + +static int frag_show(struct seq_file *m, void *arg) +{ + unsigned long index = (unsigned long)m->private; + unsigned long hits = get_optimistic_stat(index); + + if (index == 0) + seq_printf(m, "levels skipped\thits\n"); + + if (index < RADIX_TREE_MAX_PATH) + seq_printf(m, "%9lu\t%9lu\n", index, hits); + else + seq_printf(m, "failed\t%9lu\n", hits); + + return 0; +} + +struct seq_operations optimistic_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = frag_show, +}; + +static void optimistic_reset(void) +{ + int cpu; + int height; + for_each_possible_cpu(cpu) { + for (height = 0; height <= RADIX_TREE_MAX_PATH; height++) + per_cpu(optimistic_histogram, cpu)[height] = 0; + } +} + +ssize_t optimistic_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + if (count) { + char c; + if (get_user(c, buf)) + return -EFAULT; + if (c == '0') + optimistic_reset(); + } + return count; +} + +#endif // CONFIG_PROC_FS +#endif // CONFIG_RADIX_TREE_OPTIMISTIC + /* * Radix tree node cache. */ @@ -93,7 +223,7 @@ static inline gfp_t root_gfp_mask(struct * that the caller has pinned this thread of control to the current CPU. */ static struct radix_tree_node * -radix_tree_node_alloc(struct radix_tree_root *root) +radix_tree_node_alloc(struct radix_tree_root *root, int height) { struct radix_tree_node *ret; gfp_t gfp_mask = root_gfp_mask(root); @@ -103,14 +233,22 @@ radix_tree_node_alloc(struct radix_tree_ if (ret == NULL && !(gfp_mask & __GFP_WAIT)) { struct radix_tree_preload *rtp; - rtp = &__get_cpu_var(radix_tree_preloads); + rtp = &get_cpu_var(radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; rtp->nodes[rtp->nr - 1] = NULL; rtp->nr--; } + put_cpu_var(radix_tree_preloads); } BUG_ON(radix_tree_is_indirect_ptr(ret)); +#ifdef CONFIG_RADIX_TREE_CONCURRENT + spin_lock_init(&ret->lock); + lockdep_set_class_and_name(&ret->lock, + &radix_node_class[height], + radix_node_key_string[height]); +#endif + ret->height = height; return ret; } @@ -127,6 +265,8 @@ radix_tree_node_free(struct radix_tree_n call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } +#ifndef CONFIG_PREEMPT_RT + /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. On @@ -160,6 +300,8 @@ out: } EXPORT_SYMBOL(radix_tree_preload); +#endif + static inline void tag_set(struct radix_tree_node *node, unsigned int tag, int offset) { @@ -213,6 +355,22 @@ static inline int any_tag_set(struct rad return 0; } +static inline int any_tag_set_but(struct radix_tree_node *node, + unsigned int tag, int offset) +{ + int idx; + int offset_idx = offset / BITS_PER_LONG; + unsigned long offset_mask = ~(1UL << (offset % BITS_PER_LONG)); + for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { + unsigned long mask = ~0UL; + if (idx == offset_idx) + mask = offset_mask; + if (node->tags[tag][idx] & mask) + return 1; + } + return 0; +} + /* * Return the maximum key which can be store into a * radix tree with height HEIGHT. @@ -242,8 +400,8 @@ static int radix_tree_extend(struct radi } do { - unsigned int newheight; - if (!(node = radix_tree_node_alloc(root))) + unsigned int newheight = root->height + 1; + if (!(node = radix_tree_node_alloc(root, newheight))) return -ENOMEM; /* Increase the height. */ @@ -255,8 +413,6 @@ static int radix_tree_extend(struct radi tag_set(node, tag, 0); } - newheight = root->height+1; - node->height = newheight; node->count = 1; node = radix_tree_ptr_to_indirect(node); rcu_assign_pointer(root->rnode, node); @@ -266,6 +422,193 @@ out: return 0; } +#ifdef CONFIG_RADIX_TREE_CONCURRENT +static inline struct radix_tree_context * +radix_tree_get_context(struct radix_tree_root **rootp) +{ + struct radix_tree_context *context = NULL; + unsigned long addr = (unsigned long)*rootp; + + if (addr & 1) { + context = (struct radix_tree_context *)(addr - 1); + *rootp = context->root; + } + + return context; +} + +#define RADIX_TREE_CONTEXT(context, root) \ + struct radix_tree_context *context = \ + radix_tree_get_context(&root) + +static inline spinlock_t *radix_node_lock(struct radix_tree_root *root, + struct radix_tree_node *node) +{ + spinlock_t *locked = &node->lock; + spin_lock(locked); + return locked; +} + +static inline void radix_ladder_lock(struct radix_tree_context *context, + struct radix_tree_node *node) +{ + if (context) { + struct radix_tree_root *root = context->root; + spinlock_t *locked = radix_node_lock(root, node); + if (locked) { + spin_unlock(context->locked); + context->locked = locked; + } + } +} + +static inline void radix_path_init(struct radix_tree_context *context, + struct radix_tree_path *pathp) +{ + pathp->locked = context ? context->locked : NULL; +} + +static inline void radix_path_lock(struct radix_tree_context *context, + struct radix_tree_path *pathp, struct radix_tree_node *node) +{ + if (context) { + struct radix_tree_root *root = context->root; + spinlock_t *locked = radix_node_lock(root, node); + if (locked) + context->locked = locked; + pathp->locked = locked; + } else + pathp->locked = NULL; +} + +static inline void radix_path_unlock(struct radix_tree_context *context, + struct radix_tree_path *punlock) +{ + if (context && punlock->locked && + context->locked != punlock->locked) + spin_unlock(punlock->locked); +} +#else +#define RADIX_TREE_CONTEXT(context, root) do { } while (0) +#define radix_ladder_lock(context, node) do { } while (0) +#define radix_path_init(context, pathp) do { } while (0) +#define radix_path_lock(context, pathp, node) do { } while (0) +#define radix_path_unlock(context, punlock) do { } while (0) +#endif + +#ifdef CONFIG_RADIX_TREE_OPTIMISTIC +typedef int (*radix_valid_fn)(struct radix_tree_node *, int, int); + +static struct radix_tree_node * +radix_optimistic_lookup(struct radix_tree_context *context, unsigned long index, + int tag, radix_valid_fn valid) +{ + unsigned int height, shift; + struct radix_tree_node *node, *ret = NULL, **slot; + struct radix_tree_root *root = context->root; + + node = rcu_dereference(root->rnode); + if (node == NULL) + return NULL; + + if (!radix_tree_is_indirect_ptr(node)) + return NULL; + + node = radix_tree_indirect_to_ptr(node); + + height = node->height; + if (index > radix_tree_maxindex(height)) + return NULL; + + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + do { + int offset = (index >> shift) & RADIX_TREE_MAP_MASK; + if ((*valid)(node, offset, tag)) + ret = node; + slot = (struct radix_tree_node **)(node->slots + offset); + node = rcu_dereference(*slot); + if (!node) + break; + + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } while (height > 0); + + return ret; +} + +static struct radix_tree_node * +__radix_optimistic_lock(struct radix_tree_context *context, unsigned long index, + int tag, radix_valid_fn valid) +{ + struct radix_tree_node *node; + spinlock_t *locked; + unsigned int shift, offset; + + node = radix_optimistic_lookup(context, index, tag, valid); + if (!node) + goto out; + + locked = radix_node_lock(context->root, node); + if (!locked) + goto out; + +#if 0 + if (node != radix_optimistic_lookup(context, index, tag, valid)) + goto out_unlock; +#else + /* check if the node got freed */ + if (!node->count) + goto out_unlock; + + /* check if the node is still a valid termination point */ + shift = (node->height - 1) * RADIX_TREE_MAP_SHIFT; + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + if (!(*valid)(node, offset, tag)) + goto out_unlock; +#endif + + context->locked = locked; + return node; + +out_unlock: + spin_unlock(locked); +out: + return NULL; +} + +static struct radix_tree_node * +radix_optimistic_lock(struct radix_tree_context *context, unsigned long index, + int tag, radix_valid_fn valid) +{ + struct radix_tree_node *node = NULL; + + if (context) { + node = __radix_optimistic_lock(context, index, tag, valid); + if (!node) { + BUG_ON(context->locked); + spin_lock(&context->root->lock); + context->locked = &context->root->lock; + optimistic_hit(RADIX_TREE_MAX_PATH); + } else + optimistic_hit(context->root->height - node->height); + } + return node; +} + +static int radix_valid_always(struct radix_tree_node *node, int offset, int tag) +{ + return 1; +} + +static int radix_valid_tag(struct radix_tree_node *node, int offset, int tag) +{ + return tag_get(node, tag, offset); +} +#else +#define radix_optimistic_lock(context, index, tag, valid) NULL +#endif + /** * radix_tree_insert - insert into a radix tree * @root: radix tree root @@ -281,9 +624,18 @@ int radix_tree_insert(struct radix_tree_ unsigned int height, shift; int offset; int error; + int tag; + RADIX_TREE_CONTEXT(context, root); BUG_ON(radix_tree_is_indirect_ptr(item)); + node = radix_optimistic_lock(context, index, 0, radix_valid_always); + if (node) { + height = node->height; + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + goto optimistic; + } + /* Make sure the tree is high enough. */ if (index > radix_tree_maxindex(root->height)) { error = radix_tree_extend(root, index); @@ -292,7 +644,6 @@ int radix_tree_insert(struct radix_tree_ } slot = radix_tree_indirect_to_ptr(root->rnode); - height = root->height; shift = (height-1) * RADIX_TREE_MAP_SHIFT; @@ -300,9 +651,8 @@ int radix_tree_insert(struct radix_tree_ while (height > 0) { if (slot == NULL) { /* Have to add a child node. */ - if (!(slot = radix_tree_node_alloc(root))) + if (!(slot = radix_tree_node_alloc(root, height))) return -ENOMEM; - slot->height = height; if (node) { rcu_assign_pointer(node->slots[offset], slot); node->count++; @@ -312,8 +662,11 @@ int radix_tree_insert(struct radix_tree_ } /* Go a level down */ - offset = (index >> shift) & RADIX_TREE_MAP_MASK; node = slot; + radix_ladder_lock(context, node); + +optimistic: + offset = (index >> shift) & RADIX_TREE_MAP_MASK; slot = node->slots[offset]; shift -= RADIX_TREE_MAP_SHIFT; height--; @@ -325,12 +678,12 @@ int radix_tree_insert(struct radix_tree_ if (node) { node->count++; rcu_assign_pointer(node->slots[offset], item); - BUG_ON(tag_get(node, 0, offset)); - BUG_ON(tag_get(node, 1, offset)); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + BUG_ON(tag_get(node, tag, offset)); } else { rcu_assign_pointer(root->rnode, item); - BUG_ON(root_tag_get(root, 0)); - BUG_ON(root_tag_get(root, 1)); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + BUG_ON(root_tag_get(root, tag)); } return 0; @@ -345,18 +698,22 @@ EXPORT_SYMBOL(radix_tree_insert); * Returns: the slot corresponding to the position @index in the * radix tree @root. This is useful for update-if-exists operations. * - * This function cannot be called under rcu_read_lock, it must be - * excluded from writers, as must the returned slot for subsequent - * use by radix_tree_deref_slot() and radix_tree_replace slot. - * Caller must hold tree write locked across slot lookup and - * replace. + * This function can be called under rcu_read_lock iff the slot is not + * modified by radix_tree_replace_slot, otherwise it must be called + * exclusive from other writers. Any dereference of the slot must be done + * using radix_tree_deref_slot. */ void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) { unsigned int height, shift; struct radix_tree_node *node, **slot; + RADIX_TREE_CONTEXT(context, root); - node = root->rnode; + node = radix_optimistic_lock(context, index, 0, radix_valid_always); + if (node) + goto optimistic; + + node = rcu_dereference(root->rnode); if (node == NULL) return NULL; @@ -367,6 +724,7 @@ void **radix_tree_lookup_slot(struct rad } node = radix_tree_indirect_to_ptr(node); +optimistic: height = node->height; if (index > radix_tree_maxindex(height)) return NULL; @@ -376,10 +734,12 @@ void **radix_tree_lookup_slot(struct rad do { slot = (struct radix_tree_node **) (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK)); - node = *slot; + node = rcu_dereference(*slot); if (node == NULL) return NULL; + radix_ladder_lock(context, node); + shift -= RADIX_TREE_MAP_SHIFT; height--; } while (height > 0); @@ -455,6 +815,14 @@ void *radix_tree_tag_set(struct radix_tr { unsigned int height, shift; struct radix_tree_node *slot; + RADIX_TREE_CONTEXT(context, root); + + slot = radix_optimistic_lock(context, index, tag, radix_valid_tag); + if (slot) { + height = slot->height; + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + goto optimistic; + } height = root->height; BUG_ON(index > radix_tree_maxindex(height)); @@ -462,9 +830,16 @@ void *radix_tree_tag_set(struct radix_tr slot = radix_tree_indirect_to_ptr(root->rnode); shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + /* set the root's tag bit */ + if (slot && !root_tag_get(root, tag)) + root_tag_set(root, tag); + while (height > 0) { int offset; + radix_ladder_lock(context, slot); + +optimistic: offset = (index >> shift) & RADIX_TREE_MAP_MASK; if (!tag_get(slot, tag, offset)) tag_set(slot, tag, offset); @@ -474,14 +849,24 @@ void *radix_tree_tag_set(struct radix_tr height--; } - /* set the root's tag bit */ - if (slot && !root_tag_get(root, tag)) - root_tag_set(root, tag); - return slot; } EXPORT_SYMBOL(radix_tree_tag_set); +/* + * the change can never propagate upwards from here. + */ +static +int radix_valid_tag_clear(struct radix_tree_node *node, int offset, int tag) +{ + int this, other; + + this = tag_get(node, tag, offset); + other = any_tag_set_but(node, tag, offset); + + return !this || other; +} + /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root @@ -504,28 +889,51 @@ void *radix_tree_tag_clear(struct radix_ * since the "list" is null terminated. */ struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path; + struct radix_tree_path *punlock = path, *piter; struct radix_tree_node *slot = NULL; - unsigned int height, shift; + unsigned int height, shift, offset; + + RADIX_TREE_CONTEXT(context, root); + + slot = radix_optimistic_lock(context, index, tag, + radix_valid_tag_clear); + if (slot) { + height = slot->height; + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + pathp->offset = offset; + pathp->node = slot; + radix_path_init(context, pathp); + goto optimistic; + } + + pathp->node = NULL; + radix_path_init(context, pathp); height = root->height; if (index > radix_tree_maxindex(height)) goto out; shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - pathp->node = NULL; slot = radix_tree_indirect_to_ptr(root->rnode); while (height > 0) { - int offset; - if (slot == NULL) goto out; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - pathp[1].offset = offset; - pathp[1].node = slot; - slot = slot->slots[offset]; pathp++; + pathp->offset = offset; + pathp->node = slot; + radix_path_lock(context, pathp, slot); + + if (radix_valid_tag_clear(slot, offset, tag)) { + for (; punlock < pathp; punlock++) + radix_path_unlock(context, punlock); + } + +optimistic: + slot = slot->slots[offset]; shift -= RADIX_TREE_MAP_SHIFT; height--; } @@ -533,20 +941,22 @@ void *radix_tree_tag_clear(struct radix_ if (slot == NULL) goto out; - while (pathp->node) { - if (!tag_get(pathp->node, tag, pathp->offset)) - goto out; - tag_clear(pathp->node, tag, pathp->offset); - if (any_tag_set(pathp->node, tag)) - goto out; - pathp--; + for (piter = pathp; piter >= punlock; piter--) { + if (piter->node) { + if (!tag_get(piter->node, tag, piter->offset)) + break; + tag_clear(piter->node, tag, piter->offset); + if (any_tag_set(piter->node, tag)) + break; + } else { + if (root_tag_get(root, tag)) + root_tag_clear(root, tag); + } } - /* clear the root's tag bit */ - if (root_tag_get(root, tag)) - root_tag_clear(root, tag); - out: + for (; punlock < pathp; punlock++) + radix_path_unlock(context, punlock); return slot; } EXPORT_SYMBOL(radix_tree_tag_clear); @@ -653,7 +1063,7 @@ unsigned long radix_tree_next_hole(struc EXPORT_SYMBOL(radix_tree_next_hole); static unsigned int -__lookup(struct radix_tree_node *slot, void **results, unsigned long index, +__lookup(struct radix_tree_node *slot, void ***results, unsigned long index, unsigned int max_items, unsigned long *next_index) { unsigned int nr_found = 0; @@ -687,11 +1097,9 @@ __lookup(struct radix_tree_node *slot, v /* Bottom level: grab some items */ for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { - struct radix_tree_node *node; index++; - node = slot->slots[i]; - if (node) { - results[nr_found++] = rcu_dereference(node); + if (slot->slots[i]) { + results[nr_found++] = &(slot->slots[i]); if (nr_found == max_items) goto out; } @@ -745,13 +1153,22 @@ radix_tree_gang_lookup(struct radix_tree ret = 0; while (ret < max_items) { - unsigned int nr_found; + unsigned int nr_found, slots_found, i; unsigned long next_index; /* Index of next search */ if (cur_index > max_index) break; - nr_found = __lookup(node, results + ret, cur_index, + slots_found = __lookup(node, (void ***)results + ret, cur_index, max_items - ret, &next_index); + nr_found = 0; + for (i = 0; i < slots_found; i++) { + struct radix_tree_node *slot; + slot = *(((void ***)results)[ret + i]); + if (!slot) + continue; + results[ret + nr_found] = rcu_dereference(slot); + nr_found++; + } ret += nr_found; if (next_index == 0) break; @@ -762,12 +1179,71 @@ radix_tree_gang_lookup(struct radix_tree } EXPORT_SYMBOL(radix_tree_gang_lookup); +/** + * radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * + * Performs an index-ascending scan of the tree for present items. Places + * their slots at *@results and returns the number of items which were + * placed at *@results. + * + * The implementation is naive. + * + * Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must + * be dereferenced with radix_tree_deref_slot, and if using only RCU + * protection, radix_tree_deref_slot may fail requiring a retry. + */ +unsigned int +radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, + unsigned long first_index, unsigned int max_items) +{ + unsigned long max_index; + struct radix_tree_node *node; + unsigned long cur_index = first_index; + unsigned int ret; + + node = rcu_dereference(root->rnode); + if (!node) + return 0; + + if (!radix_tree_is_indirect_ptr(node)) { + if (first_index > 0) + return 0; + results[0] = (void **)&root->rnode; + return 1; + } + node = radix_tree_indirect_to_ptr(node); + + max_index = radix_tree_maxindex(node->height); + + ret = 0; + while (ret < max_items) { + unsigned int slots_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + slots_found = __lookup(node, results + ret, cur_index, + max_items - ret, &next_index); + ret += slots_found; + if (next_index == 0) + break; + cur_index = next_index; + } + + return ret; +} +EXPORT_SYMBOL(radix_tree_gang_lookup_slot); + /* * FIXME: the two tag_get()s here should use find_next_bit() instead of * open-coding the search. */ static unsigned int -__lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index, +__lookup_tag(struct radix_tree_node *slot, void ***results, unsigned long index, unsigned int max_items, unsigned long *next_index, unsigned int tag) { unsigned int nr_found = 0; @@ -812,9 +1288,8 @@ __lookup_tag(struct radix_tree_node *slo * lookup ->slots[x] without a lock (ie. can't * rely on its value remaining the same). */ - if (node) { - node = rcu_dereference(node); - results[nr_found++] = node; + if (slot->slots[j]) { + results[nr_found++] = &slot->slots[j]; if (nr_found == max_items) goto out; } @@ -873,13 +1348,22 @@ radix_tree_gang_lookup_tag(struct radix_ ret = 0; while (ret < max_items) { - unsigned int nr_found; + unsigned int slots_found, nr_found, i; unsigned long next_index; /* Index of next search */ if (cur_index > max_index) break; - nr_found = __lookup_tag(node, results + ret, cur_index, - max_items - ret, &next_index, tag); + slots_found = __lookup_tag(node, (void ***)results + ret, + cur_index, max_items - ret, &next_index, tag); + nr_found = 0; + for (i = 0; i < slots_found; i++) { + struct radix_tree_node *slot; + slot = *((void ***)results)[ret + i]; + if (!slot) + continue; + results[ret + nr_found] = rcu_dereference(slot); + nr_found++; + } ret += nr_found; if (next_index == 0) break; @@ -891,6 +1375,67 @@ radix_tree_gang_lookup_tag(struct radix_ EXPORT_SYMBOL(radix_tree_gang_lookup_tag); /** + * radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a + * radix tree based on a tag + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * @tag: the tag index (< RADIX_TREE_MAX_TAGS) + * + * Performs an index-ascending scan of the tree for present items which + * have the tag indexed by @tag set. Places the slots at *@results and + * returns the number of slots which were placed at *@results. + */ +unsigned int +radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, + unsigned long first_index, unsigned int max_items, + unsigned int tag) +{ + struct radix_tree_node *node; + unsigned long max_index; + unsigned long cur_index = first_index; + unsigned int ret; + + /* check the root's tag bit */ + if (!root_tag_get(root, tag)) + return 0; + + node = rcu_dereference(root->rnode); + if (!node) + return 0; + + if (!radix_tree_is_indirect_ptr(node)) { + if (first_index > 0) + return 0; + results[0] = (void **)&root->rnode; + return 1; + } + node = radix_tree_indirect_to_ptr(node); + + max_index = radix_tree_maxindex(node->height); + + ret = 0; + while (ret < max_items) { + unsigned int slots_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + slots_found = __lookup_tag(node, results + ret, + cur_index, max_items - ret, &next_index, tag); + ret += slots_found; + if (next_index == 0) + break; + cur_index = next_index; + } + + return ret; +} +EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); + + +/** * radix_tree_shrink - shrink height of a radix tree to minimal * @root radix tree root */ @@ -900,6 +1445,7 @@ static inline void radix_tree_shrink(str while (root->height > 0) { struct radix_tree_node *to_free = root->rnode; void *newptr; + int tag; BUG_ON(!radix_tree_is_indirect_ptr(to_free)); to_free = radix_tree_indirect_to_ptr(to_free); @@ -926,14 +1472,35 @@ static inline void radix_tree_shrink(str root->rnode = newptr; root->height--; /* must only free zeroed nodes into the slab */ - tag_clear(to_free, 0, 0); - tag_clear(to_free, 1, 0); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + tag_clear(to_free, tag, 0); to_free->slots[0] = NULL; to_free->count = 0; - radix_tree_node_free(to_free); } } +static +int radix_valid_delete(struct radix_tree_node *node, int offset, int tag) +{ + /* + * we need to check for > 2, because nodes with a single child + * can still be deleted, see radix_tree_shrink(). + */ + int unlock = (node->count > 2); + + if (!unlock) + return unlock; + + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { + if (!radix_valid_tag_clear(node, offset, tag)) { + unlock = 0; + break; + } + } + + return unlock; +} + /** * radix_tree_delete - delete an item from a radix tree * @root: radix tree root @@ -950,11 +1517,26 @@ void *radix_tree_delete(struct radix_tre * since the "list" is null terminated. */ struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path; + struct radix_tree_path *punlock = path, *piter; struct radix_tree_node *slot = NULL; - struct radix_tree_node *to_free; unsigned int height, shift; int tag; int offset; + RADIX_TREE_CONTEXT(context, root); + + slot = radix_optimistic_lock(context, index, 0, radix_valid_delete); + if (slot) { + height = slot->height; + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + pathp->offset = offset; + pathp->node = slot; + radix_path_init(context, pathp); + goto optimistic; + } + + pathp->node = NULL; + radix_path_init(context, pathp); height = root->height; if (index > radix_tree_maxindex(height)) @@ -969,7 +1551,6 @@ void *radix_tree_delete(struct radix_tre slot = radix_tree_indirect_to_ptr(slot); shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - pathp->node = NULL; do { if (slot == NULL) @@ -979,6 +1560,14 @@ void *radix_tree_delete(struct radix_tre offset = (index >> shift) & RADIX_TREE_MAP_MASK; pathp->offset = offset; pathp->node = slot; + radix_path_lock(context, pathp, slot); + + if (radix_valid_delete(slot, offset, 0)) { + for (; punlock < pathp; punlock++) + radix_path_unlock(context, punlock); + } + +optimistic: slot = slot->slots[offset]; shift -= RADIX_TREE_MAP_SHIFT; height--; @@ -991,41 +1580,45 @@ void *radix_tree_delete(struct radix_tre * Clear all tags associated with the just-deleted item */ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (tag_get(pathp->node, tag, pathp->offset)) - radix_tree_tag_clear(root, index, tag); + for (piter = pathp; piter >= punlock; piter--) { + if (piter->node) { + if (!tag_get(piter->node, tag, piter->offset)) + break; + tag_clear(piter->node, tag, piter->offset); + if (any_tag_set(piter->node, tag)) + break; + } else { + if (root_tag_get(root, tag)) + root_tag_clear(root, tag); + } + } } - to_free = NULL; - /* Now free the nodes we do not need anymore */ - while (pathp->node) { - pathp->node->slots[pathp->offset] = NULL; - pathp->node->count--; - /* - * Queue the node for deferred freeing after the - * last reference to it disappears (set NULL, above). - */ - if (to_free) - radix_tree_node_free(to_free); + /* Now unhook the nodes we do not need anymore */ + for (piter = pathp; piter >= punlock && piter->node; piter--) { + piter->node->slots[piter->offset] = NULL; + piter->node->count--; - if (pathp->node->count) { - if (pathp->node == + if (piter->node->count) { + if (piter->node == radix_tree_indirect_to_ptr(root->rnode)) radix_tree_shrink(root); goto out; } + } - /* Node with zero slots in use so free it */ - to_free = pathp->node; - pathp--; + BUG_ON(piter->node); - } root_tag_clear_all(root); root->height = 0; root->rnode = NULL; - if (to_free) - radix_tree_node_free(to_free); out: + for (; punlock <= pathp; punlock++) { + radix_path_unlock(context, punlock); + if (punlock->node && punlock->node->count == 0) + radix_tree_node_free(punlock->node); + } return slot; } EXPORT_SYMBOL(radix_tree_delete); Index: linux-2.6.24-rc8-rt1/kernel/notifier.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/kernel/notifier.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/kernel/notifier.c 2008-01-16 22:28:49.000000000 -0500 @@ -55,7 +55,7 @@ static int notifier_chain_unregister(str * @returns: notifier_call_chain returns the value returned by the * last notifier function called. */ -static int __kprobes notifier_call_chain(struct notifier_block **nl, +static int __kprobes notrace notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v, int nr_to_call, int *nr_calls) { @@ -193,7 +193,7 @@ int blocking_notifier_chain_register(str * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). */ - if (unlikely(system_state == SYSTEM_BOOTING)) + if (unlikely(system_state < SYSTEM_RUNNING)) return notifier_chain_register(&nh->head, n); down_write(&nh->rwsem); Index: linux-2.6.24-rc8-rt1/block/ll_rw_blk.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/block/ll_rw_blk.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/block/ll_rw_blk.c 2008-01-16 22:28:50.000000000 -0500 @@ -1548,7 +1548,7 @@ static int ll_merge_requests_fn(struct r */ void blk_plug_device(struct request_queue *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); /* * don't plug a stopped queue, it must be paired with blk_start_queue() @@ -1571,7 +1571,7 @@ EXPORT_SYMBOL(blk_plug_device); */ int blk_remove_plug(struct request_queue *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) return 0; @@ -1670,7 +1670,7 @@ EXPORT_SYMBOL(blk_unplug); **/ void blk_start_queue(struct request_queue *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); Index: linux-2.6.24-rc8-rt1/fs/aio.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/fs/aio.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/fs/aio.c 2008-01-16 22:28:50.000000000 -0500 @@ -582,13 +582,15 @@ static void use_mm(struct mm_struct *mm) tsk->flags |= PF_BORROWED_MM; active_mm = tsk->active_mm; atomic_inc(&mm->mm_count); - tsk->mm = mm; - tsk->active_mm = mm; + local_irq_disable(); // FIXME /* * Note that on UML this *requires* PF_BORROWED_MM to be set, otherwise * it won't work. Update it accordingly if you change it here */ switch_mm(active_mm, mm, tsk); + tsk->mm = mm; + tsk->active_mm = mm; + local_irq_enable(); task_unlock(tsk); mmdrop(active_mm); Index: linux-2.6.24-rc8-rt1/fs/block_dev.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/fs/block_dev.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/fs/block_dev.c 2008-01-16 22:29:01.000000000 -0500 @@ -59,7 +59,7 @@ static sector_t max_block(struct block_d /* Kill _all_ buffers and pagecache , dirty or not.. */ static void kill_bdev(struct block_device *bdev) { - if (bdev->bd_inode->i_mapping->nrpages == 0) + if (mapping_nrpages(bdev->bd_inode->i_mapping) == 0) return; invalidate_bh_lrus(); truncate_inode_pages(bdev->bd_inode->i_mapping, 0); @@ -604,7 +604,7 @@ long nr_blockdev_pages(void) long ret = 0; spin_lock(&bdev_lock); list_for_each_entry(bdev, &all_bdevs, bd_list) { - ret += bdev->bd_inode->i_mapping->nrpages; + ret += mapping_nrpages(bdev->bd_inode->i_mapping); } spin_unlock(&bdev_lock); return ret; @@ -1225,14 +1225,32 @@ static int __blkdev_get(struct block_dev * For now, block device ->open() routine must _not_ * examine anything in 'inode' argument except ->i_rdev. */ - struct file fake_file = {}; - struct dentry fake_dentry = {}; - fake_file.f_mode = mode; - fake_file.f_flags = flags; - fake_file.f_path.dentry = &fake_dentry; - fake_dentry.d_inode = bdev->bd_inode; + struct file *fake_file; + struct dentry *fake_dentry; + int err = -ENOMEM; - return do_open(bdev, &fake_file, for_part); + fake_file = kmalloc(sizeof(*fake_file), GFP_KERNEL); + if (!fake_file) + goto out; + memset(fake_file, 0, sizeof(*fake_file)); + + fake_dentry = kmalloc(sizeof(*fake_dentry), GFP_KERNEL); + if (!fake_dentry) + goto out_free_file; + memset(fake_dentry, 0, sizeof(*fake_dentry)); + + fake_file->f_mode = mode; + fake_file->f_flags = flags; + fake_file->f_path.dentry = fake_dentry; + fake_dentry->d_inode = bdev->bd_inode; + + err = do_open(bdev, fake_file, for_part); + + kfree(fake_dentry); +out_free_file: + kfree(fake_file); +out: + return err; } int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags) Index: linux-2.6.24-rc8-rt1/fs/dcache.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/fs/dcache.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/fs/dcache.c 2008-01-16 22:28:50.000000000 -0500 @@ -704,8 +704,9 @@ void shrink_dcache_for_umount(struct sup { struct dentry *dentry; - if (down_read_trylock(&sb->s_umount)) - BUG(); +// -rt: this might succeed there ... +// if (down_read_trylock(&sb->s_umount)) +// BUG(); dentry = sb->s_root; sb->s_root = NULL; Index: linux-2.6.24-rc8-rt1/fs/dnotify.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/fs/dnotify.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/fs/dnotify.c 2008-01-16 22:28:50.000000000 -0500 @@ -162,7 +162,7 @@ void dnotify_parent(struct dentry *dentr spin_lock(&dentry->d_lock); parent = dentry->d_parent; - if (parent->d_inode->i_dnotify_mask & event) { + if (unlikely(parent->d_inode->i_dnotify_mask & event)) { dget(parent); spin_unlock(&dentry->d_lock); __inode_dir_notify(parent->d_inode, event); Index: linux-2.6.24-rc8-rt1/fs/exec.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/fs/exec.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/fs/exec.c 2008-01-16 22:28:50.000000000 -0500 @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -721,11 +722,16 @@ static int exec_mmap(struct mm_struct *m } } task_lock(tsk); + + local_irq_disable(); active_mm = tsk->active_mm; + activate_mm(active_mm, mm); tsk->mm = mm; tsk->active_mm = mm; - activate_mm(active_mm, mm); + local_irq_enable(); + task_unlock(tsk); + arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); Index: linux-2.6.24-rc8-rt1/fs/file.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/fs/file.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/fs/file.c 2008-01-16 22:28:50.000000000 -0500 @@ -96,14 +96,15 @@ void free_fdtable_rcu(struct rcu_head *r kfree(fdt->open_fds); kfree(fdt); } else { - fddef = &get_cpu_var(fdtable_defer_list); + + fddef = &per_cpu(fdtable_defer_list, raw_smp_processor_id()); + spin_lock(&fddef->lock); fdt->next = fddef->next; fddef->next = fdt; /* vmallocs are handled from the workqueue context */ schedule_work(&fddef->wq); spin_unlock(&fddef->lock); - put_cpu_var(fdtable_defer_list); } } Index: linux-2.6.24-rc8-rt1/fs/lockd/svc.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/fs/lockd/svc.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/fs/lockd/svc.c 2008-01-16 22:28:50.000000000 -0500 @@ -349,16 +349,12 @@ lockd_down(void) * Wait for the lockd process to exit, but since we're holding * the lockd semaphore, we can't wait around forever ... */ - clear_thread_flag(TIF_SIGPENDING); - interruptible_sleep_on_timeout(&lockd_exit, HZ); - if (nlmsvc_pid) { + if (wait_event_interruptible_timeout(lockd_exit, + nlmsvc_pid == 0, HZ) <= 0) { printk(KERN_WARNING "lockd_down: lockd failed to exit, clearing pid\n"); nlmsvc_pid = 0; } - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); out: mutex_unlock(&nlmsvc_mutex); } Index: linux-2.6.24-rc8-rt1/fs/pipe.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/fs/pipe.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/fs/pipe.c 2008-01-16 22:28:59.000000000 -0500 @@ -385,8 +385,14 @@ redo: wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } + /* + * Hack: we turn off atime updates for -RT kernels. + * Who uses them on pipes anyway? + */ +#ifndef CONFIG_PREEMPT_RT if (ret > 0) file_accessed(filp); +#endif return ret; } @@ -558,8 +564,14 @@ out: wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } + /* + * Hack: we turn off atime updates for -RT kernels. + * Who uses them on pipes anyway? + */ +#ifndef CONFIG_PREEMPT_RT if (ret > 0) file_update_time(filp); +#endif return ret; } @@ -999,12 +1011,17 @@ struct file *create_write_pipe(void) return ERR_PTR(err); } -void free_write_pipe(struct file *f) +void free_write_pipe(struct file *file) { - free_pipe_info(f->f_dentry->d_inode); - dput(f->f_path.dentry); - mntput(f->f_path.mnt); - put_filp(f); + struct dentry *dentry = file->f_path.dentry; + struct vfsmount *mnt = file->f_path.mnt; + + free_pipe_info(file->f_dentry->d_inode); + file->f_path.dentry = NULL; + file->f_path.mnt = NULL; + put_filp(file); + dput(dentry); + mntput(mnt); } struct file *create_read_pipe(struct file *wrf) Index: linux-2.6.24-rc8-rt1/fs/proc/task_mmu.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/fs/proc/task_mmu.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/fs/proc/task_mmu.c 2008-01-16 22:28:50.000000000 -0500 @@ -416,8 +416,10 @@ static void *m_start(struct seq_file *m, vma = NULL; if ((unsigned long)l < mm->map_count) { vma = mm->mmap; - while (l-- && vma) + while (l-- && vma) { vma = vma->vm_next; + cond_resched(); + } goto out; } Index: linux-2.6.24-rc8-rt1/fs/xfs/linux-2.6/mrlock.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/fs/xfs/linux-2.6/mrlock.h 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/fs/xfs/linux-2.6/mrlock.h 2008-01-16 22:28:50.000000000 -0500 @@ -23,8 +23,8 @@ enum { MR_NONE, MR_ACCESS, MR_UPDATE }; typedef struct { - struct rw_semaphore mr_lock; - int mr_writer; + struct compat_rw_semaphore mr_lock; + int mr_writer; } mrlock_t; #define mrinit(mrp, name) \ Index: linux-2.6.24-rc8-rt1/fs/xfs/xfs_mount.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/fs/xfs/xfs_mount.h 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/fs/xfs/xfs_mount.h 2008-01-16 22:28:50.000000000 -0500 @@ -383,7 +383,7 @@ typedef struct xfs_mount { uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ struct xfs_perag *m_perag; /* per-ag accounting info */ - struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ + struct compat_rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ Index: linux-2.6.24-rc8-rt1/include/linux/genhd.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/genhd.h 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/genhd.h 2008-01-16 22:28:50.000000000 -0500 @@ -157,15 +157,22 @@ struct disk_attribute { * variants disable/enable preemption. */ #ifdef CONFIG_SMP -#define __disk_stat_add(gendiskp, field, addnd) \ - (per_cpu_ptr(gendiskp->dkstats, smp_processor_id())->field += addnd) +#define __disk_stat_add(gendiskp, field, addnd) \ +do { \ + preempt_disable(); \ + (per_cpu_ptr(gendiskp->dkstats, \ + smp_processor_id())->field += addnd); \ + preempt_enable(); \ +} while (0) #define disk_stat_read(gendiskp, field) \ ({ \ typeof(gendiskp->dkstats->field) res = 0; \ int i; \ + preempt_disable(); \ for_each_possible_cpu(i) \ res += per_cpu_ptr(gendiskp->dkstats, i)->field; \ + preempt_enable(); \ res; \ }) Index: linux-2.6.24-rc8-rt1/drivers/acpi/ec.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/acpi/ec.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/acpi/ec.c 2008-01-16 22:28:50.000000000 -0500 @@ -521,7 +521,19 @@ static u32 acpi_ec_gpe_handler(void *dat pr_debug(PREFIX "~~~> interrupt\n"); clear_bit(EC_FLAGS_WAIT_GPE, &ec->flags); if (test_bit(EC_FLAGS_GPE_MODE, &ec->flags)) +#if 0 wake_up(&ec->wait); +#else + // hack ... + if (waitqueue_active(&ec->wait)) { + struct task_struct *task; + + task = list_entry(ec->wait.task_list.next, + wait_queue_t, task_list)->private; + if (task) + wake_up_process(task); + } +#endif if (acpi_ec_read_status(ec) & ACPI_EC_FLAG_SCI) { if (!test_and_set_bit(EC_FLAGS_QUERY_PENDING, &ec->flags)) Index: linux-2.6.24-rc8-rt1/drivers/acpi/hardware/hwregs.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/acpi/hardware/hwregs.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/acpi/hardware/hwregs.c 2008-01-16 22:28:50.000000000 -0500 @@ -73,7 +73,7 @@ acpi_status acpi_hw_clear_acpi_status(vo ACPI_BITMASK_ALL_FIXED_STATUS, (u16) acpi_gbl_FADT.xpm1a_event_block.address)); - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, ACPI_BITMASK_ALL_FIXED_STATUS); @@ -97,7 +97,7 @@ acpi_status acpi_hw_clear_acpi_status(vo status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block); unlock_and_exit: - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); return_ACPI_STATUS(status); } @@ -300,9 +300,9 @@ acpi_status acpi_get_register(u32 regist { acpi_status status; acpi_cpu_flags flags; - flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + spin_lock_irqsave(acpi_gbl_hardware_lock, flags); status = acpi_get_register_unlocked(register_id, return_value); - acpi_os_release_lock(acpi_gbl_hardware_lock, flags); + spin_unlock_irqrestore(acpi_gbl_hardware_lock, flags); return status; } @@ -339,7 +339,7 @@ acpi_status acpi_set_register(u32 regist return_ACPI_STATUS(AE_BAD_PARAMETER); } - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); /* Always do a register read first so we can insert the new bits */ @@ -443,7 +443,7 @@ acpi_status acpi_set_register(u32 regist unlock_and_exit: - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); /* Normalize the value that was read */ Index: linux-2.6.24-rc8-rt1/drivers/acpi/utilities/utmutex.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/acpi/utilities/utmutex.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/acpi/utilities/utmutex.c 2008-01-16 22:28:50.000000000 -0500 @@ -116,7 +116,7 @@ void acpi_ut_mutex_terminate(void) /* Delete the spinlocks */ acpi_os_delete_lock(acpi_gbl_gpe_lock); - acpi_os_delete_lock(acpi_gbl_hardware_lock); +// acpi_os_delete_lock(acpi_gbl_hardware_lock); return_VOID; } Index: linux-2.6.24-rc8-rt1/include/acpi/acglobal.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/acpi/acglobal.h 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/acpi/acglobal.h 2008-01-16 22:28:50.000000000 -0500 @@ -184,7 +184,12 @@ ACPI_EXTERN acpi_semaphore acpi_gbl_glob * interrupt level */ ACPI_EXTERN spinlock_t _acpi_gbl_gpe_lock; /* For GPE data structs and registers */ -ACPI_EXTERN spinlock_t _acpi_gbl_hardware_lock; /* For ACPI H/W except GPE registers */ + +/* + * Need to be raw because it might be used in acpi_processor_idle(): + */ +ACPI_EXTERN raw_spinlock_t _acpi_gbl_hardware_lock; /* For ACPI H/W except GPE registers */ + #define acpi_gbl_gpe_lock &_acpi_gbl_gpe_lock #define acpi_gbl_hardware_lock &_acpi_gbl_hardware_lock Index: linux-2.6.24-rc8-rt1/include/acpi/acpiosxf.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/acpi/acpiosxf.h 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/acpi/acpiosxf.h 2008-01-16 22:28:50.000000000 -0500 @@ -61,7 +61,7 @@ typedef enum { OSL_EC_BURST_HANDLER } acpi_execute_type; -#define ACPI_NO_UNIT_LIMIT ((u32) -1) +#define ACPI_NO_UNIT_LIMIT (INT_MAX/2) #define ACPI_MUTEX_SEM 1 /* Functions for acpi_os_signal */ Index: linux-2.6.24-rc8-rt1/ipc/mqueue.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/ipc/mqueue.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/ipc/mqueue.c 2008-01-16 22:28:50.000000000 -0500 @@ -779,12 +779,17 @@ static inline void pipelined_send(struct struct msg_msg *message, struct ext_wait_queue *receiver) { + /* + * Keep them in one critical section for PREEMPT_RT: + */ + preempt_disable(); receiver->msg = message; list_del(&receiver->list); receiver->state = STATE_PENDING; wake_up_process(receiver->task); smp_wmb(); receiver->state = STATE_READY; + preempt_enable(); } /* pipelined_receive() - if there is task waiting in sys_mq_timedsend() Index: linux-2.6.24-rc8-rt1/ipc/msg.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/ipc/msg.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/ipc/msg.c 2008-01-16 22:28:50.000000000 -0500 @@ -261,12 +261,19 @@ static void expunge_all(struct msg_queue while (tmp != &msq->q_receivers) { struct msg_receiver *msr; + /* + * Make sure that the wakeup doesnt preempt + * this CPU prematurely. (on PREEMPT_RT) + */ + preempt_disable(); + msr = list_entry(tmp, struct msg_receiver, r_list); tmp = tmp->next; msr->r_msg = NULL; - wake_up_process(msr->r_tsk); - smp_mb(); + wake_up_process(msr->r_tsk); /* serializes */ msr->r_msg = ERR_PTR(res); + + preempt_enable(); } } @@ -637,22 +644,28 @@ static inline int pipelined_send(struct !security_msg_queue_msgrcv(msq, msg, msr->r_tsk, msr->r_msgtype, msr->r_mode)) { + /* + * Make sure that the wakeup doesnt preempt + * this CPU prematurely. (on PREEMPT_RT) + */ + preempt_disable(); + list_del(&msr->r_list); if (msr->r_maxsize < msg->m_ts) { msr->r_msg = NULL; - wake_up_process(msr->r_tsk); - smp_mb(); + wake_up_process(msr->r_tsk); /* serializes */ msr->r_msg = ERR_PTR(-E2BIG); } else { msr->r_msg = NULL; msq->q_lrpid = task_pid_vnr(msr->r_tsk); msq->q_rtime = get_seconds(); - wake_up_process(msr->r_tsk); - smp_mb(); + wake_up_process(msr->r_tsk); /* serializes */ msr->r_msg = msg; + preempt_enable(); return 1; } + preempt_enable(); } } return 0; Index: linux-2.6.24-rc8-rt1/ipc/sem.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/ipc/sem.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/ipc/sem.c 2008-01-16 22:28:50.000000000 -0500 @@ -467,6 +467,11 @@ static void update_queue (struct sem_arr if (error <= 0) { struct sem_queue *n; remove_from_queue(sma,q); + /* + * make sure that the wakeup doesnt preempt + * _this_ cpu prematurely. (on preempt_rt) + */ + preempt_disable(); q->status = IN_WAKEUP; /* * Continue scanning. The next operation @@ -489,6 +494,7 @@ static void update_queue (struct sem_arr */ smp_wmb(); q->status = error; + preempt_enable(); q = n; } else { q = q->next; Index: linux-2.6.24-rc8-rt1/sound/core/pcm_lib.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/sound/core/pcm_lib.c 2008-01-16 22:27:30.000000000 -0500 +++ linux-2.6.24-rc8-rt1/sound/core/pcm_lib.c 2008-01-16 22:28:50.000000000 -0500 @@ -130,6 +130,7 @@ static void xrun(struct snd_pcm_substrea snd_pcm_stop(substream, SNDRV_PCM_STATE_XRUN); #ifdef CONFIG_SND_PCM_XRUN_DEBUG if (substream->pstr->xrun_debug) { + user_trace_stop(); snd_printd(KERN_DEBUG "XRUN: pcmC%dD%d%c\n", substream->pcm->card->number, substream->pcm->device, Index: linux-2.6.24-rc8-rt1/include/linux/pagevec.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/pagevec.h 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/pagevec.h 2008-01-16 22:28:51.000000000 -0500 @@ -9,7 +9,7 @@ #define _LINUX_PAGEVEC_H /* 14 pointers + two long's align the pagevec structure to a power of two */ -#define PAGEVEC_SIZE 14 +#define PAGEVEC_SIZE 8 struct page; struct address_space; Index: linux-2.6.24-rc8-rt1/include/linux/vmstat.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/vmstat.h 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/vmstat.h 2008-01-16 22:28:51.000000000 -0500 @@ -59,7 +59,12 @@ DECLARE_PER_CPU(struct vm_event_state, v static inline void __count_vm_event(enum vm_event_item item) { +#ifdef CONFIG_PREEMPT_RT + get_cpu_var(vm_event_states).event[item]++; + put_cpu(); +#else __get_cpu_var(vm_event_states).event[item]++; +#endif } static inline void count_vm_event(enum vm_event_item item) @@ -70,7 +75,12 @@ static inline void count_vm_event(enum v static inline void __count_vm_events(enum vm_event_item item, long delta) { +#ifdef CONFIG_PREEMPT_RT + get_cpu_var(vm_event_states).event[item] += delta; + put_cpu(); +#else __get_cpu_var(vm_event_states).event[item] += delta; +#endif } static inline void count_vm_events(enum vm_event_item item, long delta) Index: linux-2.6.24-rc8-rt1/mm/bounce.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/mm/bounce.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/mm/bounce.c 2008-01-16 22:28:51.000000000 -0500 @@ -48,11 +48,11 @@ static void bounce_copy_vec(struct bio_v unsigned long flags; unsigned char *vto; - local_irq_save(flags); + local_irq_save_nort(flags); vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); memcpy(vto + to->bv_offset, vfrom, to->bv_len); kunmap_atomic(vto, KM_BOUNCE_READ); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #else /* CONFIG_HIGHMEM */ Index: linux-2.6.24-rc8-rt1/mm/mmap.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/mm/mmap.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/mm/mmap.c 2008-01-16 22:28:51.000000000 -0500 @@ -1910,10 +1910,16 @@ asmlinkage long sys_munmap(unsigned long static inline void verify_mm_writelocked(struct mm_struct *mm) { #ifdef CONFIG_DEBUG_VM - if (unlikely(down_read_trylock(&mm->mmap_sem))) { +# ifdef CONFIG_PREEMPT_RT + if (unlikely(!rt_rwsem_is_locked(&mm->mmap_sem))) { WARN_ON(1); - up_read(&mm->mmap_sem); } +# else + if (unlikely(down_read_trylock(&mm->mmap_sem))) { + WARN_ON(1); + up_read(&mm->mmap_sem); + } +# endif #endif } Index: linux-2.6.24-rc8-rt1/mm/vmscan.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/mm/vmscan.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/mm/vmscan.c 2008-01-16 22:29:02.000000000 -0500 @@ -23,6 +23,7 @@ #include #include #include +#include #include /* for try_to_release_page(), buffer_heads_over_limit */ #include @@ -384,7 +385,7 @@ int remove_mapping(struct address_space BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - write_lock_irq(&mapping->tree_lock); + lock_page_ref_irq(page); /* * The non racy check for a busy page. * @@ -419,19 +420,19 @@ int remove_mapping(struct address_space if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); - write_unlock_irq(&mapping->tree_lock); swap_free(swap); - __put_page(page); /* The pagecache ref */ - return 1; + goto free_it; } __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); - __put_page(page); + +free_it: + unlock_page_ref_irq(page); + __put_page(page); /* The pagecache ref */ return 1; cannot_free: - write_unlock_irq(&mapping->tree_lock); + unlock_page_ref_irq(page); return 0; } @@ -840,7 +841,7 @@ static unsigned long shrink_inactive_lis } nr_reclaimed += nr_freed; - local_irq_disable(); + local_irq_disable_nort(); if (current_is_kswapd()) { __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); __count_vm_events(KSWAPD_STEAL, nr_freed); @@ -871,9 +872,14 @@ static unsigned long shrink_inactive_lis } } } while (nr_scanned < max_scan); + /* + * Non-PREEMPT_RT relies on IRQs-off protecting the page_states + * per-CPU data. PREEMPT_RT has that data protected even in + * __mod_page_state(), so no need to keep IRQs disabled. + */ spin_unlock(&zone->lru_lock); done: - local_irq_enable(); + local_irq_enable_nort(); pagevec_release(&pvec); return nr_reclaimed; } Index: linux-2.6.24-rc8-rt1/mm/vmstat.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/mm/vmstat.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/mm/vmstat.c 2008-01-16 22:28:51.000000000 -0500 @@ -157,10 +157,14 @@ static void refresh_zone_stat_thresholds void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); - s8 *p = pcp->vm_stat_diff + item; + struct per_cpu_pageset *pcp; + int cpu; long x; + s8 *p; + cpu = get_cpu(); + pcp = zone_pcp(zone, cpu); + p = pcp->vm_stat_diff + item; x = delta + *p; if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { @@ -168,6 +172,7 @@ void __mod_zone_page_state(struct zone * x = 0; } *p = x; + put_cpu(); } EXPORT_SYMBOL(__mod_zone_page_state); @@ -210,9 +215,13 @@ EXPORT_SYMBOL(mod_zone_page_state); */ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); - s8 *p = pcp->vm_stat_diff + item; + struct per_cpu_pageset *pcp; + int cpu; + s8 *p; + cpu = get_cpu(); + pcp = zone_pcp(zone, cpu); + p = pcp->vm_stat_diff + item; (*p)++; if (unlikely(*p > pcp->stat_threshold)) { @@ -221,18 +230,34 @@ void __inc_zone_state(struct zone *zone, zone_page_state_add(*p + overstep, zone, item); *p = -overstep; } + put_cpu(); } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) { +#ifdef CONFIG_PREEMPT_RT + unsigned long flags; + struct zone *zone; + + zone = page_zone(page); + local_irq_save(flags); + __inc_zone_state(zone, item); + local_irq_restore(flags); +#else __inc_zone_state(page_zone(page), item); +#endif } EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); - s8 *p = pcp->vm_stat_diff + item; + struct per_cpu_pageset *pcp; + int cpu; + s8 *p; + + cpu = get_cpu(); + pcp = zone_pcp(zone, cpu); + p = pcp->vm_stat_diff + item; (*p)--; @@ -242,6 +267,7 @@ void __dec_zone_state(struct zone *zone, zone_page_state_add(*p - overstep, zone, item); *p = overstep; } + put_cpu(); } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) Index: linux-2.6.24-rc8-rt1/drivers/block/paride/pseudo.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/block/paride/pseudo.h 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/block/paride/pseudo.h 2008-01-16 22:28:51.000000000 -0500 @@ -43,7 +43,7 @@ static unsigned long ps_timeout; static int ps_tq_active = 0; static int ps_nice = 0; -static DEFINE_SPINLOCK(ps_spinlock __attribute__((unused))); +static __attribute__((unused)) DEFINE_SPINLOCK(ps_spinlock); static DECLARE_DELAYED_WORK(ps_tq, ps_tq_int); Index: linux-2.6.24-rc8-rt1/drivers/video/console/fbcon.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/video/console/fbcon.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/video/console/fbcon.c 2008-01-16 22:28:51.000000000 -0500 @@ -1306,7 +1306,6 @@ static void fbcon_clear(struct vc_data * { struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; struct fbcon_ops *ops = info->fbcon_par; - struct display *p = &fb_display[vc->vc_num]; u_int y_break; @@ -1335,10 +1334,11 @@ static void fbcon_putcs(struct vc_data * struct display *p = &fb_display[vc->vc_num]; struct fbcon_ops *ops = info->fbcon_par; - if (!fbcon_is_inactive(vc, info)) + if (!fbcon_is_inactive(vc, info)) { ops->putcs(vc, info, s, count, real_y(p, ypos), xpos, get_color(vc, info, scr_readw(s), 1), get_color(vc, info, scr_readw(s), 0)); + } } static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos) @@ -3322,6 +3322,7 @@ static const struct consw fb_con = { .con_screen_pos = fbcon_screen_pos, .con_getxy = fbcon_getxy, .con_resize = fbcon_resize, + .con_preemptible = 1, }; static struct notifier_block fbcon_event_notifier = { Index: linux-2.6.24-rc8-rt1/include/linux/console.h =================================================================== --- linux-2.6.24-rc8-rt1.orig/include/linux/console.h 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/include/linux/console.h 2008-01-16 22:28:51.000000000 -0500 @@ -55,6 +55,7 @@ struct consw { void (*con_invert_region)(struct vc_data *, u16 *, int); u16 *(*con_screen_pos)(struct vc_data *, int); unsigned long (*con_getxy)(struct vc_data *, unsigned long, int *, int *); + int con_preemptible; // can it reschedule from within printk? }; extern const struct consw *conswitchp; Index: linux-2.6.24-rc8-rt1/drivers/char/sysrq.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/char/sysrq.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/char/sysrq.c 2008-01-16 22:28:57.000000000 -0500 @@ -209,6 +209,22 @@ static struct sysrq_key_op sysrq_showreg .enable_mask = SYSRQ_ENABLE_DUMP, }; +#if defined(__i386__) || defined(__x86_64__) + +static void sysrq_handle_showallregs(int key, struct tty_struct *tty) +{ + nmi_show_all_regs(); +} + +static struct sysrq_key_op sysrq_showallregs_op = { + .handler = sysrq_handle_showallregs, + .help_msg = "showalLcpupc", + .action_msg = "Show Regs On All CPUs", +}; +#else +#define sysrq_showallregs_op (*(struct sysrq_key_op *)0) +#endif + static void sysrq_handle_showstate(int key, struct tty_struct *tty) { show_state(); @@ -341,7 +357,7 @@ static struct sysrq_key_op *sysrq_key_ta &sysrq_kill_op, /* i */ NULL, /* j */ &sysrq_SAK_op, /* k */ - NULL, /* l */ + &sysrq_showallregs_op, /* l */ &sysrq_showmem_op, /* m */ &sysrq_unrt_op, /* n */ /* o: This will often be registered as 'Off' at init time */ Index: linux-2.6.24-rc8-rt1/drivers/ide/ide-floppy.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/ide/ide-floppy.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/ide/ide-floppy.c 2008-01-16 22:28:52.000000000 -0500 @@ -1668,9 +1668,9 @@ static int idefloppy_get_format_progress atapi_status_t status; unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); status.all = HWIF(drive)->INB(IDE_STATUS_REG); - local_irq_restore(flags); + local_irq_restore_nort(flags); progress_indication = !status.b.dsc ? 0 : 0x10000; } Index: linux-2.6.24-rc8-rt1/drivers/ide/ide-io.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/ide/ide-io.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/ide/ide-io.c 2008-01-16 22:28:52.000000000 -0500 @@ -1194,7 +1194,7 @@ static void ide_do_request (ide_hwgroup_ ide_get_lock(ide_intr, hwgroup); /* caller must own ide_lock */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); while (!hwgroup->busy) { hwgroup->busy = 1; @@ -1462,7 +1462,7 @@ void ide_timer_expiry (unsigned long dat #endif /* DISABLE_IRQ_NOSYNC */ /* local CPU only, * as if we were handling an interrupt */ - local_irq_disable(); + local_irq_disable_nort(); if (hwgroup->polling) { startstop = handler(drive); } else if (drive_is_ready(drive)) { Index: linux-2.6.24-rc8-rt1/drivers/ide/ide-iops.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/ide/ide-iops.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/ide/ide-iops.c 2008-01-16 22:28:52.000000000 -0500 @@ -220,10 +220,10 @@ static void ata_input_data(ide_drive_t * if (io_32bit) { if (io_32bit & 2) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(drive, IDE_NSECTOR_REG); hwif->INSL(IDE_DATA_REG, buffer, wcount); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else hwif->INSL(IDE_DATA_REG, buffer, wcount); } else { @@ -242,10 +242,10 @@ static void ata_output_data(ide_drive_t if (io_32bit) { if (io_32bit & 2) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(drive, IDE_NSECTOR_REG); hwif->OUTSL(IDE_DATA_REG, buffer, wcount); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else hwif->OUTSL(IDE_DATA_REG, buffer, wcount); } else { @@ -506,12 +506,12 @@ static int __ide_wait_stat(ide_drive_t * if (!(stat & BUSY_STAT)) break; - local_irq_restore(flags); + local_irq_restore_nort(flags); *rstat = stat; return -EBUSY; } } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* * Allow status to settle, then read it again. @@ -730,17 +730,15 @@ int ide_driveid_update(ide_drive_t *driv printk("%s: CHECK for good STATUS\n", drive->name); return 0; } - local_irq_save(flags); - SELECT_MASK(drive, 0); id = kmalloc(SECTOR_WORDS*4, GFP_ATOMIC); - if (!id) { - local_irq_restore(flags); + if (!id) return 0; - } + local_irq_save_nort(flags); + SELECT_MASK(drive, 0); ata_input_data(drive, id, SECTOR_WORDS); (void) hwif->INB(IDE_STATUS_REG); /* clear drive IRQ */ - local_irq_enable(); - local_irq_restore(flags); + local_irq_enable_nort(); + local_irq_restore_nort(flags); ide_fix_driveid(id); if (id) { drive->id->dma_ultra = id->dma_ultra; Index: linux-2.6.24-rc8-rt1/drivers/ide/ide-lib.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/ide/ide-lib.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/ide/ide-lib.c 2008-01-16 22:28:52.000000000 -0500 @@ -447,15 +447,16 @@ int ide_set_xfer_rate(ide_drive_t *drive static void ide_dump_opcode(ide_drive_t *drive) { + unsigned long flags; struct request *rq; u8 opcode = 0; int found = 0; - spin_lock(&ide_lock); + spin_lock_irqsave(&ide_lock, flags); rq = NULL; if (HWGROUP(drive)) rq = HWGROUP(drive)->rq; - spin_unlock(&ide_lock); + spin_unlock_irqrestore(&ide_lock, flags); if (!rq) return; if (rq->cmd_type == REQ_TYPE_ATA_CMD || @@ -484,10 +485,8 @@ static void ide_dump_opcode(ide_drive_t static u8 ide_dump_ata_status(ide_drive_t *drive, const char *msg, u8 stat) { ide_hwif_t *hwif = HWIF(drive); - unsigned long flags; u8 err = 0; - local_irq_save(flags); printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (stat & BUSY_STAT) printk("Busy "); @@ -548,7 +547,7 @@ static u8 ide_dump_ata_status(ide_drive_ printk("\n"); } ide_dump_opcode(drive); - local_irq_restore(flags); + return err; } @@ -563,14 +562,11 @@ static u8 ide_dump_ata_status(ide_drive_ static u8 ide_dump_atapi_status(ide_drive_t *drive, const char *msg, u8 stat) { - unsigned long flags; - atapi_status_t status; atapi_error_t error; status.all = stat; error.all = 0; - local_irq_save(flags); printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (status.b.bsy) printk("Busy "); @@ -596,7 +592,7 @@ static u8 ide_dump_atapi_status(ide_driv printk("}\n"); } ide_dump_opcode(drive); - local_irq_restore(flags); + return error.all; } Index: linux-2.6.24-rc8-rt1/drivers/ide/ide-probe.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/ide/ide-probe.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/ide/ide-probe.c 2008-01-16 22:28:52.000000000 -0500 @@ -128,7 +128,7 @@ static inline void do_identify (ide_driv hwif->ata_input_data(drive, id, SECTOR_WORDS); drive->id_read = 1; - local_irq_enable(); + local_irq_enable_nort(); ide_fix_driveid(id); #if defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA) @@ -311,14 +311,14 @@ static int actual_try_to_identify (ide_d unsigned long flags; /* local CPU only; some systems need this */ - local_irq_save(flags); + local_irq_save_nort(flags); /* drive returned ID */ do_identify(drive, cmd); /* drive responded with ID */ rc = 0; /* clear drive IRQ */ (void) hwif->INB(IDE_STATUS_REG); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { /* drive refused ID */ rc = 2; @@ -801,7 +801,7 @@ static void probe_hwif(ide_hwif_t *hwif) } while ((stat & BUSY_STAT) && time_after(timeout, jiffies)); } - local_irq_restore(flags); + local_irq_restore_nort(flags); /* * Use cached IRQ number. It might be (and is...) changed by probe * code above Index: linux-2.6.24-rc8-rt1/drivers/ide/ide-taskfile.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/ide/ide-taskfile.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/ide/ide-taskfile.c 2008-01-16 22:28:52.000000000 -0500 @@ -269,7 +269,7 @@ static void ide_pio_sector(ide_drive_t * offset %= PAGE_SIZE; #ifdef CONFIG_HIGHMEM - local_irq_save(flags); + local_irq_save_nort(flags); #endif buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; @@ -289,7 +289,7 @@ static void ide_pio_sector(ide_drive_t * kunmap_atomic(buf, KM_BIO_SRC_IRQ); #ifdef CONFIG_HIGHMEM - local_irq_restore(flags); + local_irq_restore_nort(flags); #endif } @@ -457,7 +457,7 @@ ide_startstop_t pre_task_out_intr (ide_d } if (!drive->unmask) - local_irq_disable(); + local_irq_disable_nort(); ide_set_handler(drive, &task_out_intr, WAIT_WORSTCASE, NULL); ide_pio_datablock(drive, rq, 1); Index: linux-2.6.24-rc8-rt1/drivers/ide/pci/alim15x3.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/ide/pci/alim15x3.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/ide/pci/alim15x3.c 2008-01-16 22:28:52.000000000 -0500 @@ -322,7 +322,7 @@ static void ali_set_pio_mode(ide_drive_t if (r_clc >= 16) r_clc = 0; } - local_irq_save(flags); + local_irq_save_nort(flags); /* * PIO mode => ATA FIFO on, ATAPI FIFO off @@ -344,7 +344,7 @@ static void ali_set_pio_mode(ide_drive_t pci_write_config_byte(dev, port, s_clc); pci_write_config_byte(dev, port+drive->select.b.unit+2, (a_clc << 4) | r_clc); - local_irq_restore(flags); + local_irq_restore_nort(flags); /* * setup active rec @@ -479,7 +479,7 @@ static unsigned int __devinit init_chips } #endif /* defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_IDE_PROC_FS) */ - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision < 0xC2) { /* @@ -570,7 +570,7 @@ out: } pci_dev_put(north); pci_dev_put(isa_dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } @@ -632,7 +632,7 @@ static u8 __devinit ata66_ali15x3(ide_hw unsigned long flags; u8 cbl = ATA_CBL_PATA40, tmpbyte; - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision >= 0xC2) { /* @@ -653,7 +653,7 @@ static u8 __devinit ata66_ali15x3(ide_hw } } - local_irq_restore(flags); + local_irq_restore_nort(flags); return cbl; } Index: linux-2.6.24-rc8-rt1/drivers/ide/pci/hpt366.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/ide/pci/hpt366.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/ide/pci/hpt366.c 2008-01-16 22:28:52.000000000 -0500 @@ -1430,7 +1430,7 @@ static void __devinit init_dma_hpt366(id dma_old = inb(dmabase + 2); - local_irq_save(flags); + local_irq_save_nort(flags); dma_new = dma_old; pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma); @@ -1441,7 +1441,7 @@ static void __devinit init_dma_hpt366(id if (dma_new != dma_old) outb(dma_new, dmabase + 2); - local_irq_restore(flags); + local_irq_restore_nort(flags); ide_setup_dma(hwif, dmabase, 8); } Index: linux-2.6.24-rc8-rt1/drivers/input/gameport/gameport.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/input/gameport/gameport.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/input/gameport/gameport.c 2008-01-16 22:28:52.000000000 -0500 @@ -21,6 +21,7 @@ #include #include #include +#include #include /* HZ */ #include #include @@ -100,12 +101,12 @@ static int gameport_measure_speed(struct tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); GET_TIME(t1); for (t = 0; t < 50; t++) gameport_read(gameport); GET_TIME(t2); GET_TIME(t3); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t; } @@ -124,11 +125,11 @@ static int gameport_measure_speed(struct tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); rdtscl(t1); for (t = 0; t < 50; t++) gameport_read(gameport); rdtscl(t2); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if (t2 - t1 < tx) tx = t2 - t1; } Index: linux-2.6.24-rc8-rt1/drivers/net/tulip/tulip_core.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/net/tulip/tulip_core.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/net/tulip/tulip_core.c 2008-01-16 22:28:53.000000000 -0500 @@ -1797,6 +1797,7 @@ static void __devexit tulip_remove_one ( pci_iounmap(pdev, tp->base_addr); free_netdev (dev); pci_release_regions (pdev); + pci_disable_device (pdev); pci_set_drvdata (pdev, NULL); /* pci_power_off (pdev, -1); */ Index: linux-2.6.24-rc8-rt1/drivers/oprofile/oprofilefs.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/oprofile/oprofilefs.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/oprofile/oprofilefs.c 2008-01-16 22:28:53.000000000 -0500 @@ -21,7 +21,7 @@ #define OPROFILEFS_MAGIC 0x6f70726f -DEFINE_SPINLOCK(oprofilefs_lock); +DEFINE_RAW_SPINLOCK(oprofilefs_lock); static struct inode * oprofilefs_get_inode(struct super_block * sb, int mode) { Index: linux-2.6.24-rc8-rt1/drivers/pci/access.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/pci/access.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/pci/access.c 2008-01-16 22:28:53.000000000 -0500 @@ -11,7 +11,7 @@ * configuration space. */ -static DEFINE_SPINLOCK(pci_lock); +static DEFINE_RAW_SPINLOCK(pci_lock); /* * Wrappers for all PCI configuration access functions. They just check Index: linux-2.6.24-rc8-rt1/drivers/video/console/vgacon.c =================================================================== --- linux-2.6.24-rc8-rt1.orig/drivers/video/console/vgacon.c 2008-01-16 22:27:27.000000000 -0500 +++ linux-2.6.24-rc8-rt1/drivers/video/console/vgacon.c 2008-01-16 22:28:53.000000000 -0500 @@ -51,7 +51,7 @@ #include