diff --git a/Documentation/hwlat_detector.txt b/Documentation/hwlat_detector.txt new file mode 100644 index 0000000..cb61516 --- /dev/null +++ b/Documentation/hwlat_detector.txt @@ -0,0 +1,64 @@ +Introduction: +------------- + +The module hwlat_detector is a special purpose kernel module that is used to +detect large system latencies induced by the behavior of certain underlying +hardware or firmware, independent of Linux itself. The code was developed +originally to detect SMIs (System Management Interrupts) on x86 systems, +however there is nothing x86 specific about this patchset. It was +originally written for use by the "RT" patch since the Real Time +kernel is highly latency sensitive. + +SMIs are usually not serviced by the Linux kernel, which typically does not +even know that they are occurring. SMIs are instead set up by BIOS code +and are serviced by BIOS code, usually for "critical" events such as +management of thermal sensors and fans. Sometimes though, SMIs are used for +other tasks and those tasks can spend an inordinate amount of time in the +handler (sometimes measured in milliseconds). Obviously this is a problem if +you are trying to keep event service latencies down in the microsecond range. + +The hardware latency detector works by hogging all of the cpus for configurable +amounts of time (by calling stop_machine()), polling the CPU Time Stamp Counter +for some period, then looking for gaps in the TSC data. Any gap indicates a +time when the polling was interrupted and since the machine is stopped and +interrupts turned off the only thing that could do that would be an SMI. + +Note that the SMI detector should *NEVER* be used in a production environment. +It is intended to be run manually to determine if the hardware platform has a +problem with long system firmware service routines. 
+ +Usage: +------ + +Loading the module hwlat_detector passing the parameter "enabled=1" (or by +setting the "enable" entry in "hwlat_detector" debugfs toggled on) is the only +step required to start the hwlat_detector. It is possible to redefine the +threshold in microseconds (us) above which latency spikes will be taken +into account (parameter "threshold="). + +Example: + + # modprobe hwlat_detector enabled=1 threshold=100 + +After the module is loaded, it creates a directory named "hwlat_detector" under +the debugfs mountpoint, "/debug/hwlat_detector" for this text. It is necessary +to have debugfs mounted, which might be on /sys/debug on your system. + +The /debug/hwlat_detector interface contains the following files: + +count - number of latency spikes observed since last reset +enable - a global enable/disable toggle (0/1), resets count +max - maximum hardware latency actually observed (usecs) +sample - a pipe from which to read current raw sample data + in the format <timestamp> <latency observed usecs> + (can be opened O_NONBLOCK for a single sample) +threshold - minimum latency value to be considered (usecs) +width - time period to sample with CPUs held (usecs) + must be less than the total window size (enforced) +window - total period of sampling, width being inside (usecs) + +By default we will set width to 500,000 and window to 1,000,000, meaning that +we will sample every 1,000,000 usecs (1s) for 500,000 usecs (0.5s). If we +observe any latencies that exceed the threshold (initially 100 usecs), +then we write to a global sample ring buffer of 8K samples, which is +consumed by reading from the "sample" (pipe) debugfs file interface. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 736d456..e7848a0 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -199,6 +199,10 @@ and is between 256 and 4096 characters. It is defined in the file acpi_display_output=video See above. 
+ acpi_early_pdc_eval [HW,ACPI] Evaluate processor _PDC methods + early. Needed on some platforms to properly + initialize the EC. + acpi_irq_balance [HW,ACPI] ACPI will balance active IRQs default in APIC mode @@ -311,6 +315,11 @@ and is between 256 and 4096 characters. It is defined in the file aic79xx= [HW,SCSI] See Documentation/scsi/aic79xx.txt. + alignment= [KNL,ARM] + Allow the default userspace alignment fault handler + behaviour to be specified. Bit 0 enables warnings, + bit 1 enables fixups, and bit 2 sends a segfault. + amd_iommu= [HW,X86-84] Pass parameters to the AMD IOMMU driver in the system. Possible values are: diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 006b39d..e87f3cd 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1074,10 +1074,10 @@ regen_max_retry - INTEGER Default: 5 max_addresses - INTEGER - Number of maximum addresses per interface. 0 disables limitation. - It is recommended not set too large value (or 0) because it would - be too easy way to crash kernel to allow to create too much of - autoconfigured addresses. + Maximum number of autoconfigured addresses per interface. Setting + to zero disables the limitation. It is not recommended to set this + value too large (or to zero) because it would be an easy way to + crash the kernel by allowing too many addresses to be created. Default: 16 disable_ipv6 - BOOLEAN diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt new file mode 100644 index 0000000..6645057 --- /dev/null +++ b/Documentation/trace/histograms.txt @@ -0,0 +1,156 @@ + Using the Linux Kernel Latency Histograms + + +This document gives a short explanation how to enable, configure and use +latency histograms. 
Latency histograms are primarily relevant in the +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT) +and are used in the quality management of the Linux real-time +capabilities. + + +* Purpose of latency histograms + +A latency histogram continuously accumulates the frequencies of latency +data. There are two types of histograms +- potential sources of latencies +- effective latencies + + +* Potential sources of latencies + +Potential sources of latencies are code segments where interrupts, +preemption or both are disabled (aka critical sections). To create +histograms of potential sources of latency, the kernel stores the time +stamp at the start of a critical section, determines the time elapsed +when the end of the section is reached, and increments the frequency +counter of that latency value - irrespective of whether any concurrently +running process is affected by latency or not. +- Configuration items (in the Kernel hacking/Tracers submenu) + CONFIG_INTERRUPT_OFF_LATENCY + CONFIG_PREEMPT_OFF_LATENCY + + +* Effective latencies + +Effective latencies are actually occurring during wakeup of a process. To +determine effective latencies, the kernel stores the time stamp when a +process is scheduled to be woken up, and determines the duration of the +wakeup time shortly before control is passed over to this process. Note +that the apparent latency in user space may be considerably longer, +since +i) interrupts may be disabled preventing the scheduler from initiating +the wakeup mechanism, and +ii) the process may be interrupted after control is passed over to it +but before user space execution takes place. +- Configuration item (in the Kernel hacking/Tracers submenu) + CONFIG_WAKEUP_LATENCY + + +* Usage + +The interface to the administration of the latency histograms is located +in the debugfs file system. 
To mount it, either enter + +mount -t sysfs nodev /sys +mount -t debugfs nodev /sys/kernel/debug + +from shell command line level, or add + +nodev /sys sysfs defaults 0 0 +nodev /sys/kernel/debug debugfs defaults 0 0 + +to the file /etc/fstab. All latency histogram related files are +available in the directory /sys/kernel/debug/tracing/latency_hist. A +particular histogram type is enabled by writing non-zero to the related +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory. +Select "preemptirqsoff" for the histograms of potential sources of +latencies and "wakeup" for histograms of effective latencies. The +histogram data - one per CPU - are available in the files + +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx. + +The histograms are reset by writing non-zero to the file "reset" in a +particular latency directory. To reset all latency data, use + +#!/bin/sh + +HISTDIR=/sys/kernel/debug/tracing/latency_hist + +if test -d $HISTDIR +then + cd $HISTDIR + for i in */reset + do + echo 1 >$i + done +fi + + +* Data format + +Latency data are stored with a resolution of one microsecond. The +maximum latency is 10,240 microseconds. The data are only valid, if the +overflow register is empty. Every output line contains the latency in +microseconds in the first column and the number of samples in the second +column. To display only lines with a positive latency count, use, for +example, + +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0 + +#Minimum latency: 0 microseconds. +#Average latency: 0 microseconds. +#Maximum latency: 25 microseconds. 
+#Total samples: 3104770694 +#There are 0 samples greater or equal than 10240 microseconds +#usecs samples + 0 2984486876 + 1 49843506 + 2 58219047 + 3 5348126 + 4 2187960 + 5 3388262 + 6 959289 + 7 208294 + 8 40420 + 9 4485 + 10 14918 + 11 18340 + 12 25052 + 13 19455 + 14 5602 + 15 969 + 16 47 + 17 18 + 18 14 + 19 1 + 20 3 + 21 2 + 22 5 + 23 2 + 25 1 + + +* Wakeup latency of a selected process + +To only collect wakeup latency data of a particular process, write the +PID of the requested process to + +/sys/kernel/debug/tracing/latency_hist/wakeup/pid. + +PIDs are not considered, if this variable is set to 0. + + +* Details of the process with the highest wakeup latency so far + +Selected data of the process that suffered from the highest wakeup +latency that occurred in a particular CPU are available in the file + +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx. + +The format of the data is +<PID> <Priority> <Latency> <Command> + +These data are also reset when the wakeup histogram is reset. 
diff --git a/MAINTAINERS b/MAINTAINERS index 412eff6..be1b7ca 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -616,10 +616,10 @@ M: Richard Purdie <rpurdie@rpsys.net> S: Maintained ARM/CORTINA SYSTEMS GEMINI ARM ARCHITECTURE -M: Paulius Zaleckas <paulius.zaleckas@teltonika.lt> +M: Paulius Zaleckas <paulius.zaleckas@gmail.com> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) T: git git://gitorious.org/linux-gemini/mainline.git -S: Maintained +S: Odd Fixes F: arch/arm/mach-gemini/ ARM/EBSA110 MACHINE SUPPORT @@ -641,9 +641,9 @@ T: topgit git://git.openezx.org/openezx.git F: arch/arm/mach-pxa/ezx.c ARM/FARADAY FA526 PORT -M: Paulius Zaleckas <paulius.zaleckas@teltonika.lt> +M: Paulius Zaleckas <paulius.zaleckas@gmail.com> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -S: Maintained +S: Odd Fixes F: arch/arm/mm/*-fa* ARM/FOOTBRIDGE ARCHITECTURE @@ -1733,10 +1733,9 @@ F: include/linux/tfrc.h F: net/dccp/ DECnet NETWORK LAYER -M: Christine Caulfield <christine.caulfield@googlemail.com> W: http://linux-decnet.sourceforge.net L: linux-decnet-user@lists.sourceforge.net -S: Maintained +S: Orphan F: Documentation/networking/decnet.txt F: net/decnet/ @@ -2444,6 +2443,15 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-2.6.git S: Maintained F: drivers/media/video/gspca/ +HARDWARE LATENCY DETECTOR +P: Jon Masters +M: jcm@jonmasters.org +W: http://www.kernel.org/pub/linux/kernel/people/jcm/hwlat_detector/ +S: Supported +L: linux-kernel@vger.kernel.org +F: Documentation/hwlat_detector.txt +F: drivers/misc/hwlat_detector.c + HARDWARE MONITORING L: lm-sensors@lm-sensors.org W: http://www.lm-sensors.org/ @@ -3490,9 +3498,9 @@ S: Maintained F: drivers/net/wireless/libertas/ MARVELL MV643XX ETHERNET DRIVER -M: Lennert Buytenhek <buytenh@marvell.com> +M: Lennert Buytenhek <buytenh@wantstofly.org> L: netdev@vger.kernel.org -S: Supported +S: Maintained F: drivers/net/mv643xx_eth.* F: include/linux/mv643xx.h diff --git 
a/Makefile b/Makefile index 12b1aa1..95f232e 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 33 -EXTRAVERSION = -rc8 +EXTRAVERSION = -rc8-rt2 NAME = Man-Eating Seals of Antiquity # *DOCUMENTATION* diff --git a/arch/Kconfig b/arch/Kconfig index 9d055b4..4c3c06a 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -46,6 +46,11 @@ config OPROFILE_EVENT_MULTIPLEX config HAVE_OPROFILE bool +config PROFILE_NMI + bool + depends on OPROFILE + default y + config KPROBES bool "Kprobes" depends on KALLSYMS && MODULES diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h index 1570c0b..55f4f13 100644 --- a/arch/alpha/include/asm/rwsem.h +++ b/arch/alpha/include/asm/rwsem.h @@ -18,15 +18,18 @@ struct rwsem_waiter; -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_read_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_write_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore *rwsem_wake(struct rw_anon_semaphore *); +extern struct rw_anon_semaphore * +rwsem_downgrade_wake(struct rw_anon_semaphore *sem); /* * the semaphore definition */ -struct rw_semaphore { +struct rw_anon_semaphore { long count; #define RWSEM_UNLOCKED_VALUE 0x0000000000000000L #define RWSEM_ACTIVE_BIAS 0x0000000000000001L @@ -38,6 +41,31 @@ struct rw_semaphore { struct list_head wait_list; }; +#define __RWSEM_ANON_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ + LIST_HEAD_INIT((name).wait_list) } + +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_ANON_INITIALIZER(name) + +static inline void init_anon_rwsem(struct rw_anon_semaphore *sem) +{ + sem->count = 
RWSEM_UNLOCKED_VALUE; + spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} + +static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) +{ + return (sem->count != 0); +} + +struct rw_semaphore { + long count; + spinlock_t wait_lock; + struct list_head wait_list; +}; + #define __RWSEM_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ LIST_HEAD_INIT((name).wait_list) } @@ -47,12 +75,15 @@ struct rw_semaphore { static inline void init_rwsem(struct rw_semaphore *sem) { - sem->count = RWSEM_UNLOCKED_VALUE; - spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); + init_anon_rwsem((struct rw_anon_semaphore *)sem); } -static inline void __down_read(struct rw_semaphore *sem) +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return (sem->count != 0); +} + +static inline void __down_read(struct rw_anon_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP @@ -79,7 +110,7 @@ static inline void __down_read(struct rw_semaphore *sem) /* * trylock for reading -- returns 1 if successful, 0 if contention */ -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct rw_anon_semaphore *sem) { long old, new, res; @@ -94,7 +125,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) return res >= 0 ? 
1 : 0; } -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write(struct rw_anon_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP @@ -121,7 +152,7 @@ static inline void __down_write(struct rw_semaphore *sem) /* * trylock for writing -- returns 1 if successful, 0 if contention */ -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct rw_anon_semaphore *sem) { long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); @@ -130,7 +161,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) return 0; } -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct rw_anon_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP @@ -155,7 +186,7 @@ static inline void __up_read(struct rw_semaphore *sem) rwsem_wake(sem); } -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct rw_anon_semaphore *sem) { long count; #ifndef CONFIG_SMP @@ -184,7 +215,7 @@ static inline void __up_write(struct rw_semaphore *sem) /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct rw_anon_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP @@ -208,7 +239,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) rwsem_downgrade_wake(sem); } -static inline void rwsem_atomic_add(long val, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(long val, struct rw_anon_semaphore *sem) { #ifndef CONFIG_SMP sem->count += val; @@ -227,7 +258,7 @@ static inline void rwsem_atomic_add(long val, struct rw_semaphore *sem) #endif } -static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem) +static inline long rwsem_atomic_update(long val, struct rw_anon_semaphore *sem) { #ifndef CONFIG_SMP sem->count += val; @@ -250,10 +281,5 @@ static inline long rwsem_atomic_update(long val, struct 
rw_semaphore *sem) #endif } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _ALPHA_RWSEM_H */ diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index 5d08266..760dd1b 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -106,7 +106,7 @@ irqreturn_t timer_interrupt(int irq, void *dev) profile_tick(CPU_PROFILING); #endif - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); /* * Calculate how many ticks have passed since the last update, @@ -136,7 +136,7 @@ irqreturn_t timer_interrupt(int irq, void *dev) state.last_rtc_update = xtime.tv_sec - (tmp ? 600 : 0); } - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); #ifndef CONFIG_SMP while (nticks--) diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index c77d2fa..8113bb5 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -42,7 +42,8 @@ #endif #if defined(CONFIG_CPU_ARM920T) || defined(CONFIG_CPU_ARM922T) || \ - defined(CONFIG_CPU_ARM925T) || defined(CONFIG_CPU_ARM1020) + defined(CONFIG_CPU_ARM925T) || defined(CONFIG_CPU_ARM1020) || \ + defined(CONFIG_CPU_ARM1026) # define MULTI_CACHE 1 #endif diff --git a/arch/arm/include/asm/dma.h b/arch/arm/include/asm/dma.h index ca51143..8ff6602 100644 --- a/arch/arm/include/asm/dma.h +++ b/arch/arm/include/asm/dma.h @@ -31,18 +31,18 @@ #define DMA_MODE_CASCADE 0xc0 #define DMA_AUTOINIT 0x10 -extern spinlock_t dma_spin_lock; +extern raw_spinlock_t dma_spin_lock; static inline unsigned long claim_dma_lock(void) { unsigned long flags; - spin_lock_irqsave(&dma_spin_lock, flags); + raw_spin_lock_irqsave(&dma_spin_lock, flags); return flags; } static inline void release_dma_lock(unsigned long flags) { - spin_unlock_irqrestore(&dma_spin_lock, flags); + raw_spin_unlock_irqrestore(&dma_spin_lock, flags); } /* Clear the 'DMA Pointer Flip Flop'. 
diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h index 058e7e9..d8fd711 100644 --- a/arch/arm/include/asm/system.h +++ b/arch/arm/include/asm/system.h @@ -60,6 +60,8 @@ #include <linux/linkage.h> #include <linux/irqflags.h> +#include <asm/memory.h> + #define __exception __attribute__((section(".exception.text"))) struct thread_info; diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index f41a6f5..dd667f2 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -40,17 +40,12 @@ struct mmu_gather { unsigned long range_end; }; -DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); - -static inline struct mmu_gather * -tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) +static inline void +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + unsigned int full_mm_flush) { - struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); - tlb->mm = mm; tlb->fullmm = full_mm_flush; - - return tlb; } static inline void @@ -61,8 +56,6 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) /* keep the page table cache within bounds */ check_pgt_cache(); - - put_cpu_var(mmu_gathers); } /* diff --git a/arch/arm/kernel/dma.c b/arch/arm/kernel/dma.c index 7d5b9fb..0b5ad04 100644 --- a/arch/arm/kernel/dma.c +++ b/arch/arm/kernel/dma.c @@ -21,7 +21,7 @@ #include <asm/mach/dma.h> -DEFINE_SPINLOCK(dma_spin_lock); +DEFINE_RAW_SPINLOCK(dma_spin_lock); EXPORT_SYMBOL(dma_spin_lock); static dma_t *dma_chan[MAX_DMA_CHANNELS]; diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 2c1db77..1d2a386 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -52,7 +52,8 @@ work_pending: b ret_slow_syscall @ Check work again work_resched: - bl schedule + bl __schedule + /* * "slow" syscall return path. "why" tells us if this was a real syscall. 
*/ diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index ba2adef..2bec4a9 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -174,9 +174,11 @@ void cpu_idle(void) } leds_event(led_idle_end); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index c6c57b6..621acad 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -102,6 +102,7 @@ struct cpu_cache_fns cpu_cache; #endif #ifdef CONFIG_OUTER_CACHE struct outer_cache_fns outer_cache; +EXPORT_SYMBOL(outer_cache); #endif struct stack { diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index e7714f3..469f223 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -607,6 +607,14 @@ static void do_signal(struct pt_regs *regs, int syscall) siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif + /* * We want the common case to go fast, which * is why we may in certain cases get here from diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 57162af..9c67108 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -452,17 +452,17 @@ void __cpuinit percpu_timer_setup(void) local_timer_setup(evt); } -static DEFINE_SPINLOCK(stop_lock); +static DEFINE_RAW_SPINLOCK(stop_lock); /* * ipi_cpu_stop - handle IPI from smp_send_stop() */ static void ipi_cpu_stop(unsigned int cpu) { - spin_lock(&stop_lock); + raw_spin_lock(&stop_lock); printk(KERN_CRIT "CPU%u: stopping\n", cpu); dump_stack(); - spin_unlock(&stop_lock); + raw_spin_unlock(&stop_lock); set_cpu_online(cpu, false); diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index d38cdf2..3654ecf 100644 --- a/arch/arm/kernel/time.c +++ 
b/arch/arm/kernel/time.c @@ -245,11 +245,11 @@ void do_gettimeofday(struct timeval *tv) unsigned long usec, sec; do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); + seq = read_raw_seqbegin_irqsave(&xtime_lock, flags); usec = system_timer->offset(); sec = xtime.tv_sec; usec += xtime.tv_nsec / 1000; - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); + } while (read_raw_seqretry_irqrestore(&xtime_lock, seq, flags)); /* usec may have gone up a lot: be safe */ while (usec >= 1000000) { @@ -271,7 +271,7 @@ int do_settimeofday(struct timespec *tv) if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irq(&xtime_lock); + write_raw_seqlock_irq(&xtime_lock); /* * This is revolting. We need to set "xtime" correctly. However, the * value in this location is the value at the most recent update of @@ -287,7 +287,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); ntp_clear(); - write_sequnlock_irq(&xtime_lock); + write_raw_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; } @@ -337,9 +337,9 @@ void timer_tick(void) profile_tick(CPU_PROFILING); do_leds(); do_set_rtc(); - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); do_timer(1); - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); #endif diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 3f361a7..00ebadc 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -245,7 +245,7 @@ static void __die(const char *str, int err, struct thread_info *thread, struct p } } -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); /* * This function is protected against re-entrancy. 
@@ -256,13 +256,13 @@ NORET_TYPE void die(const char *str, struct pt_regs *regs, int err) oops_enter(); - spin_lock_irq(&die_lock); + raw_spin_lock_irq(&die_lock); console_verbose(); bust_spinlocks(1); __die(str, err, thread, regs); bust_spinlocks(0); add_taint(TAINT_DIE); - spin_unlock_irq(&die_lock); + raw_spin_unlock_irq(&die_lock); oops_exit(); if (in_interrupt()) @@ -288,24 +288,24 @@ void arm_notify_die(const char *str, struct pt_regs *regs, } static LIST_HEAD(undef_hook); -static DEFINE_SPINLOCK(undef_lock); +static DEFINE_RAW_SPINLOCK(undef_lock); void register_undef_hook(struct undef_hook *hook) { unsigned long flags; - spin_lock_irqsave(&undef_lock, flags); + raw_spin_lock_irqsave(&undef_lock, flags); list_add(&hook->node, &undef_hook); - spin_unlock_irqrestore(&undef_lock, flags); + raw_spin_unlock_irqrestore(&undef_lock, flags); } void unregister_undef_hook(struct undef_hook *hook) { unsigned long flags; - spin_lock_irqsave(&undef_lock, flags); + raw_spin_lock_irqsave(&undef_lock, flags); list_del(&hook->node); - spin_unlock_irqrestore(&undef_lock, flags); + raw_spin_unlock_irqrestore(&undef_lock, flags); } static int call_undef_hook(struct pt_regs *regs, unsigned int instr) @@ -314,12 +314,12 @@ static int call_undef_hook(struct pt_regs *regs, unsigned int instr) unsigned long flags; int (*fn)(struct pt_regs *regs, unsigned int instr) = NULL; - spin_lock_irqsave(&undef_lock, flags); + raw_spin_lock_irqsave(&undef_lock, flags); list_for_each_entry(hook, &undef_hook, node) if ((instr & hook->instr_mask) == hook->instr_val && (regs->ARM_cpsr & hook->cpsr_mask) == hook->cpsr_val) fn = hook->fn; - spin_unlock_irqrestore(&undef_lock, flags); + raw_spin_unlock_irqrestore(&undef_lock, flags); return fn ? 
fn(regs, instr) : 1; } diff --git a/arch/arm/mach-at91/gpio.c b/arch/arm/mach-at91/gpio.c index ae4772e..d959979 100644 --- a/arch/arm/mach-at91/gpio.c +++ b/arch/arm/mach-at91/gpio.c @@ -373,12 +373,18 @@ static int gpio_irq_type(unsigned pin, unsigned type) } } +static void gpio_irq_ack_noop(unsigned int irq) +{ + /* Dummy function. */ +} + static struct irq_chip gpio_irqchip = { .name = "GPIO", .mask = gpio_irq_mask, .unmask = gpio_irq_unmask, .set_type = gpio_irq_type, .set_wake = gpio_irq_set_wake, + .ack = gpio_irq_ack_noop, }; static void gpio_irq_handler(unsigned irq, struct irq_desc *desc) @@ -525,7 +531,7 @@ void __init at91_gpio_irq_setup(void) * shorter, and the AIC handles interrupts sanely. */ set_irq_chip(pin, &gpio_irqchip); - set_irq_handler(pin, handle_simple_irq); + set_irq_handler(pin, handle_edge_irq); set_irq_flags(pin, IRQF_VALID); } diff --git a/arch/arm/mach-bcmring/dma.c b/arch/arm/mach-bcmring/dma.c index 7b20fcc..744edb0 100644 --- a/arch/arm/mach-bcmring/dma.c +++ b/arch/arm/mach-bcmring/dma.c @@ -690,7 +690,7 @@ int dma_init(void) memset(&gDMA, 0, sizeof(gDMA)); - init_MUTEX_LOCKED(&gDMA.lock); + sema_init(&gDMA.lock, 0); init_waitqueue_head(&gDMA.freeChannelQ); /* Initialize the Hardware */ @@ -1573,7 +1573,7 @@ int dma_init_mem_map(DMA_MemMap_t *memMap) { memset(memMap, 0, sizeof(*memMap)); - init_MUTEX(&memMap->lock); + sema_init(&memMap->lock, 1); return 0; } @@ -2225,6 +2225,8 @@ int dma_unmap(DMA_MemMap_t *memMap, /* Stores state information about the map */ DMA_Region_t *region; DMA_Segment_t *segment; + down(&memMap->lock); + for (regionIdx = 0; regionIdx < memMap->numRegionsUsed; regionIdx++) { region = &memMap->region[regionIdx]; diff --git a/arch/arm/mach-footbridge/include/mach/hardware.h b/arch/arm/mach-footbridge/include/mach/hardware.h index 51dd902..eee37b6 100644 --- a/arch/arm/mach-footbridge/include/mach/hardware.h +++ b/arch/arm/mach-footbridge/include/mach/hardware.h @@ -86,7 +86,7 @@ #define CPLD_FLASH_WR_ENABLE 1 
#ifndef __ASSEMBLY__ -extern spinlock_t nw_gpio_lock; +extern raw_spinlock_t nw_gpio_lock; extern void nw_gpio_modify_op(unsigned int mask, unsigned int set); extern void nw_gpio_modify_io(unsigned int mask, unsigned int in); extern unsigned int nw_gpio_read(void); diff --git a/arch/arm/mach-footbridge/netwinder-hw.c b/arch/arm/mach-footbridge/netwinder-hw.c index ac7ffa6..c903dad 100644 --- a/arch/arm/mach-footbridge/netwinder-hw.c +++ b/arch/arm/mach-footbridge/netwinder-hw.c @@ -68,7 +68,7 @@ static inline void wb977_ww(int reg, int val) /* * This is a lock for accessing ports GP1_IO_BASE and GP2_IO_BASE */ -DEFINE_SPINLOCK(nw_gpio_lock); +DEFINE_RAW_SPINLOCK(nw_gpio_lock); EXPORT_SYMBOL(nw_gpio_lock); static unsigned int current_gpio_op; @@ -327,9 +327,9 @@ static inline void wb977_init_gpio(void) /* * Set Group1/Group2 outputs */ - spin_lock_irqsave(&nw_gpio_lock, flags); + raw_spin_lock_irqsave(&nw_gpio_lock, flags); nw_gpio_modify_op(-1, GPIO_RED_LED | GPIO_FAN); - spin_unlock_irqrestore(&nw_gpio_lock, flags); + raw_spin_unlock_irqrestore(&nw_gpio_lock, flags); } /* @@ -390,9 +390,9 @@ static void __init cpld_init(void) { unsigned long flags; - spin_lock_irqsave(&nw_gpio_lock, flags); + raw_spin_lock_irqsave(&nw_gpio_lock, flags); nw_cpld_modify(-1, CPLD_UNMUTE | CPLD_7111_DISABLE); - spin_unlock_irqrestore(&nw_gpio_lock, flags); + raw_spin_unlock_irqrestore(&nw_gpio_lock, flags); } static unsigned char rwa_unlock[] __initdata = @@ -616,9 +616,9 @@ static int __init nw_hw_init(void) cpld_init(); rwa010_init(); - spin_lock_irqsave(&nw_gpio_lock, flags); + raw_spin_lock_irqsave(&nw_gpio_lock, flags); nw_gpio_modify_op(GPIO_RED_LED|GPIO_GREEN_LED, DEFAULT_LEDS); - spin_unlock_irqrestore(&nw_gpio_lock, flags); + raw_spin_unlock_irqrestore(&nw_gpio_lock, flags); } return 0; } diff --git a/arch/arm/mach-footbridge/netwinder-leds.c b/arch/arm/mach-footbridge/netwinder-leds.c index 00269fe..e57102e 100644 --- a/arch/arm/mach-footbridge/netwinder-leds.c +++ 
b/arch/arm/mach-footbridge/netwinder-leds.c @@ -31,13 +31,13 @@ static char led_state; static char hw_led_state; -static DEFINE_SPINLOCK(leds_lock); +static DEFINE_RAW_SPINLOCK(leds_lock); static void netwinder_leds_event(led_event_t evt) { unsigned long flags; - spin_lock_irqsave(&leds_lock, flags); + raw_spin_lock_irqsave(&leds_lock, flags); switch (evt) { case led_start: @@ -117,12 +117,12 @@ static void netwinder_leds_event(led_event_t evt) break; } - spin_unlock_irqrestore(&leds_lock, flags); + raw_spin_unlock_irqrestore(&leds_lock, flags); if (led_state & LED_STATE_ENABLED) { - spin_lock_irqsave(&nw_gpio_lock, flags); + raw_spin_lock_irqsave(&nw_gpio_lock, flags); nw_gpio_modify_op(GPIO_RED_LED | GPIO_GREEN_LED, hw_led_state); - spin_unlock_irqrestore(&nw_gpio_lock, flags); + raw_spin_unlock_irqrestore(&nw_gpio_lock, flags); } } diff --git a/arch/arm/mach-gemini/gpio.c b/arch/arm/mach-gemini/gpio.c index e726385..fe3bd5a 100644 --- a/arch/arm/mach-gemini/gpio.c +++ b/arch/arm/mach-gemini/gpio.c @@ -86,7 +86,7 @@ static int gpio_set_irq_type(unsigned int irq, unsigned int type) unsigned int reg_both, reg_level, reg_type; reg_type = __raw_readl(base + GPIO_INT_TYPE); - reg_level = __raw_readl(base + GPIO_INT_BOTH_EDGE); + reg_level = __raw_readl(base + GPIO_INT_LEVEL); reg_both = __raw_readl(base + GPIO_INT_BOTH_EDGE); switch (type) { @@ -117,7 +117,7 @@ static int gpio_set_irq_type(unsigned int irq, unsigned int type) } __raw_writel(reg_type, base + GPIO_INT_TYPE); - __raw_writel(reg_level, base + GPIO_INT_BOTH_EDGE); + __raw_writel(reg_level, base + GPIO_INT_LEVEL); __raw_writel(reg_both, base + GPIO_INT_BOTH_EDGE); gpio_ack_irq(irq); diff --git a/arch/arm/mach-integrator/core.c b/arch/arm/mach-integrator/core.c index a0f60e5..15c94cd 100644 --- a/arch/arm/mach-integrator/core.c +++ b/arch/arm/mach-integrator/core.c @@ -199,7 +199,7 @@ static struct amba_pl010_data integrator_uart_data = { #define CM_CTRL IO_ADDRESS(INTEGRATOR_HDR_BASE) + 
INTEGRATOR_HDR_CTRL_OFFSET -static DEFINE_SPINLOCK(cm_lock); +static DEFINE_RAW_SPINLOCK(cm_lock); /** * cm_control - update the CM_CTRL register. @@ -211,10 +211,10 @@ void cm_control(u32 mask, u32 set) unsigned long flags; u32 val; - spin_lock_irqsave(&cm_lock, flags); + raw_spin_lock_irqsave(&cm_lock, flags); val = readl(CM_CTRL) & ~mask; writel(val | set, CM_CTRL); - spin_unlock_irqrestore(&cm_lock, flags); + raw_spin_unlock_irqrestore(&cm_lock, flags); } EXPORT_SYMBOL(cm_control); diff --git a/arch/arm/mach-integrator/pci_v3.c b/arch/arm/mach-integrator/pci_v3.c index 148d25f..f7fe214 100644 --- a/arch/arm/mach-integrator/pci_v3.c +++ b/arch/arm/mach-integrator/pci_v3.c @@ -163,7 +163,7 @@ * 7:2 register number * */ -static DEFINE_SPINLOCK(v3_lock); +static DEFINE_RAW_SPINLOCK(v3_lock); #define PCI_BUS_NONMEM_START 0x00000000 #define PCI_BUS_NONMEM_SIZE SZ_256M @@ -284,7 +284,7 @@ static int v3_read_config(struct pci_bus *bus, unsigned int devfn, int where, unsigned long flags; u32 v; - spin_lock_irqsave(&v3_lock, flags); + raw_spin_lock_irqsave(&v3_lock, flags); addr = v3_open_config_window(bus, devfn, where); switch (size) { @@ -302,7 +302,7 @@ static int v3_read_config(struct pci_bus *bus, unsigned int devfn, int where, } v3_close_config_window(); - spin_unlock_irqrestore(&v3_lock, flags); + raw_spin_unlock_irqrestore(&v3_lock, flags); *val = v; return PCIBIOS_SUCCESSFUL; @@ -314,7 +314,7 @@ static int v3_write_config(struct pci_bus *bus, unsigned int devfn, int where, unsigned long addr; unsigned long flags; - spin_lock_irqsave(&v3_lock, flags); + raw_spin_lock_irqsave(&v3_lock, flags); addr = v3_open_config_window(bus, devfn, where); switch (size) { @@ -335,7 +335,7 @@ static int v3_write_config(struct pci_bus *bus, unsigned int devfn, int where, } v3_close_config_window(); - spin_unlock_irqrestore(&v3_lock, flags); + raw_spin_unlock_irqrestore(&v3_lock, flags); return PCIBIOS_SUCCESSFUL; } @@ -510,7 +510,7 @@ void __init pci_v3_preinit(void) 
hook_fault_code(8, v3_pci_fault, SIGBUS, "external abort on non-linefetch"); hook_fault_code(10, v3_pci_fault, SIGBUS, "external abort on non-linefetch"); - spin_lock_irqsave(&v3_lock, flags); + raw_spin_lock_irqsave(&v3_lock, flags); /* * Unlock V3 registers, but only if they were previously locked. @@ -583,7 +583,7 @@ void __init pci_v3_preinit(void) printk(KERN_ERR "PCI: unable to grab PCI error " "interrupt: %d\n", ret); - spin_unlock_irqrestore(&v3_lock, flags); + raw_spin_unlock_irqrestore(&v3_lock, flags); } void __init pci_v3_postinit(void) diff --git a/arch/arm/mach-ixp2000/core.c b/arch/arm/mach-ixp2000/core.c index babb225..e24e3d0 100644 --- a/arch/arm/mach-ixp2000/core.c +++ b/arch/arm/mach-ixp2000/core.c @@ -197,7 +197,7 @@ unsigned long ixp2000_gettimeoffset (void) return offset / ticks_per_usec; } -static int ixp2000_timer_interrupt(int irq, void *dev_id) +static irqreturn_t ixp2000_timer_interrupt(int irq, void *dev_id) { /* clear timer 1 */ ixp2000_reg_wrb(IXP2000_T1_CLR, 1); diff --git a/arch/arm/mach-ixp4xx/common-pci.c b/arch/arm/mach-ixp4xx/common-pci.c index c4a0159..fefbf10 100644 --- a/arch/arm/mach-ixp4xx/common-pci.c +++ b/arch/arm/mach-ixp4xx/common-pci.c @@ -54,7 +54,7 @@ unsigned long ixp4xx_pci_reg_base = 0; * these transactions are atomic or we will end up * with corrupt data on the bus or in a driver. 
*/ -static DEFINE_SPINLOCK(ixp4xx_pci_lock); +static DEFINE_RAW_SPINLOCK(ixp4xx_pci_lock); /* * Read from PCI config space @@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(ixp4xx_pci_lock); static void crp_read(u32 ad_cbe, u32 *data) { unsigned long flags; - spin_lock_irqsave(&ixp4xx_pci_lock, flags); + raw_spin_lock_irqsave(&ixp4xx_pci_lock, flags); *PCI_CRP_AD_CBE = ad_cbe; *data = *PCI_CRP_RDATA; - spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); + raw_spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); } /* @@ -74,10 +74,10 @@ static void crp_read(u32 ad_cbe, u32 *data) static void crp_write(u32 ad_cbe, u32 data) { unsigned long flags; - spin_lock_irqsave(&ixp4xx_pci_lock, flags); + raw_spin_lock_irqsave(&ixp4xx_pci_lock, flags); *PCI_CRP_AD_CBE = CRP_AD_CBE_WRITE | ad_cbe; *PCI_CRP_WDATA = data; - spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); + raw_spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); } static inline int check_master_abort(void) @@ -101,7 +101,7 @@ int ixp4xx_pci_read_errata(u32 addr, u32 cmd, u32* data) int retval = 0; int i; - spin_lock_irqsave(&ixp4xx_pci_lock, flags); + raw_spin_lock_irqsave(&ixp4xx_pci_lock, flags); *PCI_NP_AD = addr; @@ -118,7 +118,7 @@ int ixp4xx_pci_read_errata(u32 addr, u32 cmd, u32* data) if(check_master_abort()) retval = 1; - spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); + raw_spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); return retval; } @@ -127,7 +127,7 @@ int ixp4xx_pci_read_no_errata(u32 addr, u32 cmd, u32* data) unsigned long flags; int retval = 0; - spin_lock_irqsave(&ixp4xx_pci_lock, flags); + raw_spin_lock_irqsave(&ixp4xx_pci_lock, flags); *PCI_NP_AD = addr; @@ -140,7 +140,7 @@ int ixp4xx_pci_read_no_errata(u32 addr, u32 cmd, u32* data) if(check_master_abort()) retval = 1; - spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); + raw_spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); return retval; } @@ -149,7 +149,7 @@ int ixp4xx_pci_write(u32 addr, u32 cmd, u32 data) unsigned long flags; int retval = 0; - 
spin_lock_irqsave(&ixp4xx_pci_lock, flags); + raw_spin_lock_irqsave(&ixp4xx_pci_lock, flags); *PCI_NP_AD = addr; @@ -162,7 +162,7 @@ int ixp4xx_pci_write(u32 addr, u32 cmd, u32 data) if(check_master_abort()) retval = 1; - spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); + raw_spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); return retval; } diff --git a/arch/arm/mach-msm/proc_comm.c b/arch/arm/mach-msm/proc_comm.c index 915ee70..e825c36 100644 --- a/arch/arm/mach-msm/proc_comm.c +++ b/arch/arm/mach-msm/proc_comm.c @@ -14,6 +14,7 @@ * */ +#include <linux/cache.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/io.h> diff --git a/arch/arm/mach-omap2/mux.c b/arch/arm/mach-omap2/mux.c index 5fedc50..5fef73f 100644 --- a/arch/arm/mach-omap2/mux.c +++ b/arch/arm/mach-omap2/mux.c @@ -961,16 +961,14 @@ static void __init omap_mux_init_list(struct omap_mux *superset) while (superset->reg_offset != OMAP_MUX_TERMINATOR) { struct omap_mux *entry; -#ifndef CONFIG_OMAP_MUX - /* Skip pins that are not muxed as GPIO by bootloader */ - if (!OMAP_MODE_GPIO(omap_mux_read(superset->reg_offset))) { +#ifdef CONFIG_OMAP_MUX + if (!superset->muxnames || !superset->muxnames[0]) { superset++; continue; } -#endif - -#if defined(CONFIG_OMAP_MUX) && defined(CONFIG_DEBUG_FS) - if (!superset->muxnames || !superset->muxnames[0]) { +#else + /* Skip pins that are not muxed as GPIO by bootloader */ + if (!OMAP_MODE_GPIO(omap_mux_read(superset->reg_offset))) { superset++; continue; } diff --git a/arch/arm/mach-sa1100/badge4.c b/arch/arm/mach-sa1100/badge4.c index 051ec0f..fd8ceef 100644 --- a/arch/arm/mach-sa1100/badge4.c +++ b/arch/arm/mach-sa1100/badge4.c @@ -240,15 +240,22 @@ void badge4_set_5V(unsigned subsystem, int on) /* detect on->off and off->on transitions */ if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { /* was off, now on */ - printk(KERN_INFO "%s: enabling 5V supply rail\n", __func__); GPSR = BADGE4_GPIO_PCMEN5V; } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { /* 
was on, now off */ - printk(KERN_INFO "%s: disabling 5V supply rail\n", __func__); GPCR = BADGE4_GPIO_PCMEN5V; } local_irq_restore(flags); + + /* detect on->off and off->on transitions */ + if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { + /* was off, now on */ + printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__); + } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { + /* was on, now off */ + printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__); + } } EXPORT_SYMBOL(badge4_set_5V); diff --git a/arch/arm/mach-shark/leds.c b/arch/arm/mach-shark/leds.c index c9e32de..ccd4918 100644 --- a/arch/arm/mach-shark/leds.c +++ b/arch/arm/mach-shark/leds.c @@ -36,7 +36,7 @@ static char led_state; static short hw_led_state; static short saved_state; -static DEFINE_SPINLOCK(leds_lock); +static DEFINE_RAW_SPINLOCK(leds_lock); short sequoia_read(int addr) { outw(addr,0x24); @@ -52,7 +52,7 @@ static void sequoia_leds_event(led_event_t evt) { unsigned long flags; - spin_lock_irqsave(&leds_lock, flags); + raw_spin_lock_irqsave(&leds_lock, flags); hw_led_state = sequoia_read(0x09); @@ -144,7 +144,7 @@ static void sequoia_leds_event(led_event_t evt) if (led_state & LED_STATE_ENABLED) sequoia_write(hw_led_state,0x09); - spin_unlock_irqrestore(&leds_lock, flags); + raw_spin_unlock_irqrestore(&leds_lock, flags); } static int __init leds_init(void) diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index b270d62..62820ed 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -11,6 +11,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ +#include <linux/moduleparam.h> #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -77,6 +78,8 @@ static unsigned long ai_dword; static unsigned long ai_multi; static int ai_usermode; +core_param(alignment, ai_usermode, int, 0600); + #define UM_WARN (1 << 0) #define UM_FIXUP (1 << 1) #define UM_SIGNAL (1 << 2) diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c index cb8fc65..e912ff2 100644 --- a/arch/arm/mm/cache-l2x0.c +++ b/arch/arm/mm/cache-l2x0.c @@ -26,7 +26,7 @@ #define CACHE_LINE_SIZE 32 static void __iomem *l2x0_base; -static DEFINE_SPINLOCK(l2x0_lock); +static DEFINE_RAW_SPINLOCK(l2x0_lock); static inline void cache_wait(void __iomem *reg, unsigned long mask) { @@ -47,11 +47,11 @@ static inline void l2x0_inv_all(void) unsigned long flags; /* invalidate all ways */ - spin_lock_irqsave(&l2x0_lock, flags); + raw_spin_lock_irqsave(&l2x0_lock, flags); writel(0xff, l2x0_base + L2X0_INV_WAY); cache_wait(l2x0_base + L2X0_INV_WAY, 0xff); cache_sync(); - spin_unlock_irqrestore(&l2x0_lock, flags); + raw_spin_unlock_irqrestore(&l2x0_lock, flags); } static void l2x0_inv_range(unsigned long start, unsigned long end) @@ -59,7 +59,7 @@ static void l2x0_inv_range(unsigned long start, unsigned long end) void __iomem *base = l2x0_base; unsigned long flags; - spin_lock_irqsave(&l2x0_lock, flags); + raw_spin_lock_irqsave(&l2x0_lock, flags); if (start & (CACHE_LINE_SIZE - 1)) { start &= ~(CACHE_LINE_SIZE - 1); cache_wait(base + L2X0_CLEAN_INV_LINE_PA, 1); @@ -83,13 +83,13 @@ static void l2x0_inv_range(unsigned long start, unsigned long end) } if (blk_end < end) { - spin_unlock_irqrestore(&l2x0_lock, flags); - spin_lock_irqsave(&l2x0_lock, flags); + raw_spin_unlock_irqrestore(&l2x0_lock, flags); + raw_spin_lock_irqsave(&l2x0_lock, flags); } } cache_wait(base + L2X0_INV_LINE_PA, 1); cache_sync(); - spin_unlock_irqrestore(&l2x0_lock, flags); + raw_spin_unlock_irqrestore(&l2x0_lock, flags); } static void l2x0_clean_range(unsigned long 
start, unsigned long end) @@ -97,7 +97,7 @@ static void l2x0_clean_range(unsigned long start, unsigned long end) void __iomem *base = l2x0_base; unsigned long flags; - spin_lock_irqsave(&l2x0_lock, flags); + raw_spin_lock_irqsave(&l2x0_lock, flags); start &= ~(CACHE_LINE_SIZE - 1); while (start < end) { unsigned long blk_end = start + min(end - start, 4096UL); @@ -109,13 +109,13 @@ static void l2x0_clean_range(unsigned long start, unsigned long end) } if (blk_end < end) { - spin_unlock_irqrestore(&l2x0_lock, flags); - spin_lock_irqsave(&l2x0_lock, flags); + raw_spin_unlock_irqrestore(&l2x0_lock, flags); + raw_spin_lock_irqsave(&l2x0_lock, flags); } } cache_wait(base + L2X0_CLEAN_LINE_PA, 1); cache_sync(); - spin_unlock_irqrestore(&l2x0_lock, flags); + raw_spin_unlock_irqrestore(&l2x0_lock, flags); } static void l2x0_flush_range(unsigned long start, unsigned long end) @@ -123,7 +123,7 @@ static void l2x0_flush_range(unsigned long start, unsigned long end) void __iomem *base = l2x0_base; unsigned long flags; - spin_lock_irqsave(&l2x0_lock, flags); + raw_spin_lock_irqsave(&l2x0_lock, flags); start &= ~(CACHE_LINE_SIZE - 1); while (start < end) { unsigned long blk_end = start + min(end - start, 4096UL); @@ -135,13 +135,13 @@ static void l2x0_flush_range(unsigned long start, unsigned long end) } if (blk_end < end) { - spin_unlock_irqrestore(&l2x0_lock, flags); - spin_lock_irqsave(&l2x0_lock, flags); + raw_spin_unlock_irqrestore(&l2x0_lock, flags); + raw_spin_lock_irqsave(&l2x0_lock, flags); } } cache_wait(base + L2X0_CLEAN_INV_LINE_PA, 1); cache_sync(); - spin_unlock_irqrestore(&l2x0_lock, flags); + raw_spin_unlock_irqrestore(&l2x0_lock, flags); } void __init l2x0_init(void __iomem *base, __u32 aux_val, __u32 aux_mask) diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c index a9e22e3..ec7b776 100644 --- a/arch/arm/mm/context.c +++ b/arch/arm/mm/context.c @@ -14,7 +14,7 @@ #include <asm/mmu_context.h> #include <asm/tlbflush.h> -static 
DEFINE_SPINLOCK(cpu_asid_lock); +static DEFINE_RAW_SPINLOCK(cpu_asid_lock); unsigned int cpu_last_asid = ASID_FIRST_VERSION; /* @@ -32,7 +32,7 @@ void __new_context(struct mm_struct *mm) { unsigned int asid; - spin_lock(&cpu_asid_lock); + raw_spin_lock(&cpu_asid_lock); asid = ++cpu_last_asid; if (asid == 0) asid = cpu_last_asid = ASID_FIRST_VERSION; @@ -54,7 +54,7 @@ void __new_context(struct mm_struct *mm) dsb(); } } - spin_unlock(&cpu_asid_lock); + raw_spin_unlock(&cpu_asid_lock); cpumask_copy(mm_cpumask(mm), cpumask_of(smp_processor_id())); mm->context.id = asid; diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c index 7370a71..9691c02 100644 --- a/arch/arm/mm/copypage-v4mc.c +++ b/arch/arm/mm/copypage-v4mc.c @@ -30,7 +30,7 @@ #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \ L_PTE_MT_MINICACHE) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * ARMv4 mini-dcache optimised copy_user_highpage @@ -76,14 +76,14 @@ void v4_mc_copy_user_highpage(struct page *to, struct page *from, if (test_and_clear_bit(PG_dcache_dirty, &from->flags)) __flush_dcache_page(page_mapping(from), from); - spin_lock(&minicache_lock); + raw_spin_lock(&minicache_lock); set_pte_ext(TOP_PTE(0xffff8000), pfn_pte(page_to_pfn(from), minicache_pgprot), 0); flush_tlb_kernel_page(0xffff8000); mc_copy_user_page((void *)0xffff8000, kto); - spin_unlock(&minicache_lock); + raw_spin_unlock(&minicache_lock); kunmap_atomic(kto, KM_USER1); } diff --git a/arch/arm/mm/copypage-v6.c b/arch/arm/mm/copypage-v6.c index 0fa1319..8000e55 100644 --- a/arch/arm/mm/copypage-v6.c +++ b/arch/arm/mm/copypage-v6.c @@ -27,7 +27,7 @@ #define from_address (0xffff8000) #define to_address (0xffffc000) -static DEFINE_SPINLOCK(v6_lock); +static DEFINE_RAW_SPINLOCK(v6_lock); /* * Copy the user page. 
No aliasing to deal with so we can just @@ -96,7 +96,7 @@ static void v6_copy_user_highpage_aliasing(struct page *to, * Now copy the page using the same cache colour as the * pages ultimate destination. */ - spin_lock(&v6_lock); + raw_spin_lock(&v6_lock); set_pte_ext(TOP_PTE(from_address) + offset, pfn_pte(page_to_pfn(from), PAGE_KERNEL), 0); set_pte_ext(TOP_PTE(to_address) + offset, pfn_pte(page_to_pfn(to), PAGE_KERNEL), 0); @@ -109,7 +109,7 @@ static void v6_copy_user_highpage_aliasing(struct page *to, copy_page((void *)kto, (void *)kfrom); - spin_unlock(&v6_lock); + raw_spin_unlock(&v6_lock); } /* @@ -129,13 +129,13 @@ static void v6_clear_user_highpage_aliasing(struct page *page, unsigned long vad * Now clear the page using the same cache colour as * the pages ultimate destination. */ - spin_lock(&v6_lock); + raw_spin_lock(&v6_lock); set_pte_ext(TOP_PTE(to_address) + offset, pfn_pte(page_to_pfn(page), PAGE_KERNEL), 0); flush_tlb_kernel_page(to); clear_page((void *)to); - spin_unlock(&v6_lock); + raw_spin_unlock(&v6_lock); } struct cpu_user_fns v6_user_fns __initdata = { diff --git a/arch/arm/mm/copypage-xscale.c b/arch/arm/mm/copypage-xscale.c index 76824d3..eddffc3 100644 --- a/arch/arm/mm/copypage-xscale.c +++ b/arch/arm/mm/copypage-xscale.c @@ -32,7 +32,7 @@ #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \ L_PTE_MT_MINICACHE) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * XScale mini-dcache optimised copy_user_highpage @@ -98,14 +98,14 @@ void xscale_mc_copy_user_highpage(struct page *to, struct page *from, if (test_and_clear_bit(PG_dcache_dirty, &from->flags)) __flush_dcache_page(page_mapping(from), from); - spin_lock(&minicache_lock); + raw_spin_lock(&minicache_lock); set_pte_ext(TOP_PTE(COPYPAGE_MINICACHE), pfn_pte(page_to_pfn(from), minicache_pgprot), 0); flush_tlb_kernel_page(COPYPAGE_MINICACHE); mc_copy_user_page((void *)COPYPAGE_MINICACHE, kto); - spin_unlock(&minicache_lock); + 
raw_spin_unlock(&minicache_lock); kunmap_atomic(kto, KM_USER1); } diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 10e0680..7aa9b88 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -273,7 +273,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; /* diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 761ffed..627426d 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -30,8 +30,6 @@ #include "mm.h" -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - /* * empty_zero_page is a special page that is used for * zero-initialized data and COW. diff --git a/arch/arm/oprofile/common.c b/arch/arm/oprofile/common.c index 3fcd752..54a182b 100644 --- a/arch/arm/oprofile/common.c +++ b/arch/arm/oprofile/common.c @@ -48,9 +48,9 @@ static int op_arm_setup(void) { int ret; - spin_lock(&oprofilefs_lock); + raw_spin_lock(&oprofilefs_lock); ret = op_arm_model->setup_ctrs(); - spin_unlock(&oprofilefs_lock); + raw_spin_unlock(&oprofilefs_lock); return ret; } diff --git a/arch/arm/oprofile/op_model_xscale.c b/arch/arm/oprofile/op_model_xscale.c index 724ab9c..cbe91ee 100644 --- a/arch/arm/oprofile/op_model_xscale.c +++ b/arch/arm/oprofile/op_model_xscale.c @@ -381,8 +381,9 @@ static int xscale_pmu_start(void) { int ret; u32 pmnc = read_pmnc(); + unsigned long irq_flags = IRQF_DISABLED | IRQF_NODELAY; - ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, IRQF_DISABLED, + ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, irq_flags, "XScale PMU", (void *)results); if (ret < 0) { diff --git a/arch/arm/plat-omap/clock.c b/arch/arm/plat-omap/clock.c index 4becbdd..50f299b 100644 --- a/arch/arm/plat-omap/clock.c +++ b/arch/arm/plat-omap/clock.c @@ -78,15 +78,12 @@ EXPORT_SYMBOL(clk_disable); unsigned long clk_get_rate(struct clk *clk) { - 
unsigned long flags; unsigned long ret = 0; if (clk == NULL || IS_ERR(clk)) return 0; - spin_lock_irqsave(&clockfw_lock, flags); ret = clk->rate; - spin_unlock_irqrestore(&clockfw_lock, flags); return ret; } diff --git a/arch/arm/tools/mach-types b/arch/arm/tools/mach-types index 5a79fc6..31c2f4c 100644 --- a/arch/arm/tools/mach-types +++ b/arch/arm/tools/mach-types @@ -12,7 +12,7 @@ # # http://www.arm.linux.org.uk/developer/machines/?action=new # -# Last update: Thu Jan 28 22:15:54 2010 +# Last update: Sat Feb 20 14:16:15 2010 # # machine_is_xxx CONFIG_xxxx MACH_TYPE_xxx number # @@ -2257,7 +2257,7 @@ oratisalog MACH_ORATISALOG ORATISALOG 2268 oratismadi MACH_ORATISMADI ORATISMADI 2269 oratisot16 MACH_ORATISOT16 ORATISOT16 2270 oratisdesk MACH_ORATISDESK ORATISDESK 2271 -v2_ca9 MACH_V2P_CA9 V2P_CA9 2272 +vexpress MACH_VEXPRESS VEXPRESS 2272 sintexo MACH_SINTEXO SINTEXO 2273 cm3389 MACH_CM3389 CM3389 2274 omap3_cio MACH_OMAP3_CIO OMAP3_CIO 2275 @@ -2636,3 +2636,45 @@ hw90240 MACH_HW90240 HW90240 2648 dm365_leopard MACH_DM365_LEOPARD DM365_LEOPARD 2649 mityomapl138 MACH_MITYOMAPL138 MITYOMAPL138 2650 scat110 MACH_SCAT110 SCAT110 2651 +acer_a1 MACH_ACER_A1 ACER_A1 2652 +cmcontrol MACH_CMCONTROL CMCONTROL 2653 +pelco_lamar MACH_PELCO_LAMAR PELCO_LAMAR 2654 +rfp43 MACH_RFP43 RFP43 2655 +sk86r0301 MACH_SK86R0301 SK86R0301 2656 +ctpxa MACH_CTPXA CTPXA 2657 +epb_arm9_a MACH_EPB_ARM9_A EPB_ARM9_A 2658 +guruplug MACH_GURUPLUG GURUPLUG 2659 +spear310 MACH_SPEAR310 SPEAR310 2660 +spear320 MACH_SPEAR320 SPEAR320 2661 +robotx MACH_ROBOTX ROBOTX 2662 +lsxhl MACH_LSXHL LSXHL 2663 +smartlite MACH_SMARTLITE SMARTLITE 2664 +cws2 MACH_CWS2 CWS2 2665 +m619 MACH_M619 M619 2666 +smartview MACH_SMARTVIEW SMARTVIEW 2667 +lsa_salsa MACH_LSA_SALSA LSA_SALSA 2668 +kizbox MACH_KIZBOX KIZBOX 2669 +htccharmer MACH_HTCCHARMER HTCCHARMER 2670 +guf_neso_lt MACH_GUF_NESO_LT GUF_NESO_LT 2671 +pm9g45 MACH_PM9G45 PM9G45 2672 +htcpanther MACH_HTCPANTHER HTCPANTHER 2673 +htcpanther_cdma 
MACH_HTCPANTHER_CDMA HTCPANTHER_CDMA 2674 +reb01 MACH_REB01 REB01 2675 +aquila MACH_AQUILA AQUILA 2676 +spark_sls_hw2 MACH_SPARK_SLS_HW2 SPARK_SLS_HW2 2677 +sheeva_esata MACH_ESATA_SHEEVAPLUG ESATA_SHEEVAPLUG 2678 +surf7x30 MACH_SURF7X30 SURF7X30 2679 +micro2440 MACH_MICRO2440 MICRO2440 2680 +am2440 MACH_AM2440 AM2440 2681 +tq2440 MACH_TQ2440 TQ2440 2682 +lpc2478oem MACH_LPC2478OEM LPC2478OEM 2683 +ak880x MACH_AK880X AK880X 2684 +cobra3530 MACH_COBRA3530 COBRA3530 2685 +pmppb MACH_PMPPB PMPPB 2686 +u6715 MACH_U6715 U6715 2687 +axar1500_sender MACH_AXAR1500_SENDER AXAR1500_SENDER 2688 +g30_dvb MACH_G30_DVB G30_DVB 2689 +vc088x MACH_VC088X VC088X 2690 +mioa702 MACH_MIOA702 MIOA702 2691 +hpmin MACH_HPMIN HPMIN 2692 +ak880xak MACH_AK880XAK AK880XAK 2693 diff --git a/arch/blackfin/kernel/time.c b/arch/blackfin/kernel/time.c index 13c1ee3..8ded01f 100644 --- a/arch/blackfin/kernel/time.c +++ b/arch/blackfin/kernel/time.c @@ -129,7 +129,7 @@ irqreturn_t timer_interrupt(int irq, void *dummy) /* last time the cmos clock got updated */ static long last_rtc_update; - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); do_timer(1); /* @@ -149,7 +149,7 @@ irqreturn_t timer_interrupt(int irq, void *dummy) /* Do it again in 60s. */ last_rtc_update = xtime.tv_sec - 600; } - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); #ifdef CONFIG_IPIPE update_root_process_times(get_irq_regs()); diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c index 074fe7d..58d2a1a 100644 --- a/arch/cris/kernel/time.c +++ b/arch/cris/kernel/time.c @@ -87,7 +87,7 @@ int do_settimeofday(struct timespec *tv) if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irq(&xtime_lock); + write_raw_seqlock_irq(&xtime_lock); /* * This is revolting. We need to set "xtime" correctly. 
However, the * value in this location is the value at the most recent update of @@ -103,7 +103,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); ntp_clear(); - write_sequnlock_irq(&xtime_lock); + write_raw_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; } diff --git a/arch/frv/include/asm/highmem.h b/arch/frv/include/asm/highmem.h index 68e4677..5cb8dff 100644 --- a/arch/frv/include/asm/highmem.h +++ b/arch/frv/include/asm/highmem.h @@ -116,6 +116,7 @@ static inline void *kmap_atomic(struct page *page, enum km_type type) { unsigned long paddr; + preempt_disable(); pagefault_disable(); debug_kmap_atomic(type); paddr = page_to_phys(page); @@ -173,6 +174,7 @@ static inline void kunmap_atomic(void *kvaddr, enum km_type type) BUG(); } pagefault_enable(); + preempt_enable(); } #endif /* !__ASSEMBLY__ */ diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c index fb0ce75..82943ba 100644 --- a/arch/frv/kernel/time.c +++ b/arch/frv/kernel/time.c @@ -70,7 +70,7 @@ static irqreturn_t timer_interrupt(int irq, void *dummy) * the irq version of write_lock because as just said we have irq * locally disabled. 
-arca */ - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); do_timer(1); @@ -96,7 +96,7 @@ static irqreturn_t timer_interrupt(int irq, void *dummy) __set_LEDS(n); #endif /* CONFIG_HEARTBEAT */ - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); update_process_times(user_mode(get_irq_regs())); diff --git a/arch/h8300/kernel/time.c b/arch/h8300/kernel/time.c index 7f2d6cf..d08012c 100644 --- a/arch/h8300/kernel/time.c +++ b/arch/h8300/kernel/time.c @@ -35,9 +35,9 @@ void h8300_timer_tick(void) { if (current->pid) profile_tick(CPU_PROFILING); - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); do_timer(1); - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); update_process_times(user_mode(get_irq_regs())); } diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h index 7ae5889..e97b255 100644 --- a/arch/ia64/include/asm/acpi.h +++ b/arch/ia64/include/asm/acpi.h @@ -94,6 +94,7 @@ ia64_acpi_release_global_lock (unsigned int *lock) #define acpi_noirq 0 /* ACPI always enabled on IA64 */ #define acpi_pci_disabled 0 /* ACPI PCI always enabled on IA64 */ #define acpi_strict 1 /* no ACPI spec workarounds on IA64 */ +#define acpi_ht 0 /* no HT-only mode on IA64 */ #endif #define acpi_processor_cstate_check(x) (x) /* no idle limits on IA64 :) */ static inline void disable_acpi(void) { } diff --git a/arch/ia64/include/asm/elf.h b/arch/ia64/include/asm/elf.h index e14108b..4c41656 100644 --- a/arch/ia64/include/asm/elf.h +++ b/arch/ia64/include/asm/elf.h @@ -201,7 +201,9 @@ extern void ia64_elf_core_copy_regs (struct pt_regs *src, elf_gregset_t dst); relevant until we have real hardware to play with... 
*/ #define ELF_PLATFORM NULL -#define SET_PERSONALITY(ex) set_personality(PER_LINUX) +#define SET_PERSONALITY(ex) \ + set_personality((current->personality & ~PER_MASK) | PER_LINUX) + #define elf_read_implies_exec(ex, executable_stack) \ ((executable_stack!=EXSTACK_DISABLE_X) && ((ex).e_flags & EF_IA_64_LINUX_EXECUTABLE_STACK) != 0) diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index e876268..a48df05 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -33,7 +33,7 @@ /* * the semaphore definition */ -struct rw_semaphore { +struct rw_anon_semaphore { signed long count; spinlock_t wait_lock; struct list_head wait_list; @@ -51,26 +51,47 @@ struct rw_semaphore { LIST_HEAD_INIT((name).wait_list) } #define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - -static inline void -init_rwsem (struct rw_semaphore *sem) + struct rw_anon_semaphore name = __RWSEM_INITIALIZER(name) + +extern struct rw_anon_semaphore * +rwsem_down_read_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_write_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_wake(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_downgrade_wake(struct rw_anon_semaphore *sem); + +static inline void init_anon_rwsem (struct rw_anon_semaphore *sem) { sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } +struct rw_anon_semaphore { + signed long count; + spinlock_t wait_lock; + struct list_head wait_list; +}; + +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ + 
LIST_HEAD_INIT((name).wait_list) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +static inline void init_rwsem(struct rw_semaphore *sem) +{ + init_anon_rwsem((struct rw_anon_semaphore *)sem); +} + /* * lock for reading */ static inline void -__down_read (struct rw_semaphore *sem) +__down_read (struct rw_anon_semaphore *sem) { long result = ia64_fetchadd8_acq((unsigned long *)&sem->count, 1); @@ -82,7 +103,7 @@ __down_read (struct rw_semaphore *sem) * lock for writing */ static inline void -__down_write (struct rw_semaphore *sem) +__down_write (struct rw_anon_semaphore *sem) { long old, new; @@ -99,7 +120,7 @@ __down_write (struct rw_semaphore *sem) * unlock after reading */ static inline void -__up_read (struct rw_semaphore *sem) +__up_read (struct rw_anon_semaphore *sem) { long result = ia64_fetchadd8_rel((unsigned long *)&sem->count, -1); @@ -111,7 +132,7 @@ __up_read (struct rw_semaphore *sem) * unlock after writing */ static inline void -__up_write (struct rw_semaphore *sem) +__up_write (struct rw_anon_semaphore *sem) { long old, new; @@ -128,7 +149,7 @@ __up_write (struct rw_semaphore *sem) * trylock for reading -- returns 1 if successful, 0 if contention */ static inline int -__down_read_trylock (struct rw_semaphore *sem) +__down_read_trylock (struct rw_anon_semaphore *sem) { long tmp; while ((tmp = sem->count) >= 0) { @@ -143,7 +164,7 @@ __down_read_trylock (struct rw_semaphore *sem) * trylock for writing -- returns 1 if successful, 0 if contention */ static inline int -__down_write_trylock (struct rw_semaphore *sem) +__down_write_trylock (struct rw_anon_semaphore *sem) { long tmp = cmpxchg_acq(&sem->count, RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); @@ -154,7 +175,7 @@ __down_write_trylock (struct rw_semaphore *sem) * downgrade write lock to read lock */ static inline void -__downgrade_write (struct rw_semaphore *sem) +__downgrade_write (struct rw_anon_semaphore *sem) { long old, new; @@ -174,6 +195,11 @@ 
__downgrade_write (struct rw_semaphore *sem) #define rwsem_atomic_add(delta, sem) atomic64_add(delta, (atomic64_t *)(&(sem)->count)) #define rwsem_atomic_update(delta, sem) atomic64_add_return(delta, (atomic64_t *)(&(sem)->count)) +static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) +{ + return (sem->count != 0); +} + static inline int rwsem_is_locked(struct rw_semaphore *sem) { return (sem->count != 0); diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c index e6676fc..c6273cc 100644 --- a/arch/ia64/kernel/salinfo.c +++ b/arch/ia64/kernel/salinfo.c @@ -643,7 +643,7 @@ salinfo_init(void) for (i = 0; i < ARRAY_SIZE(salinfo_log_name); i++) { data = salinfo_data + i; data->type = i; - init_MUTEX(&data->mutex); + sema_init(&data->mutex, 1); dir = proc_mkdir(salinfo_log_name[i], salinfo_dir); if (!dir) continue; diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index a35c661..3b8e4f1 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -197,10 +197,10 @@ timer_interrupt (int irq, void *dev_id) * another CPU. We need to avoid to SMP race by acquiring the * xtime_lock. 
*/ - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); do_timer(1); local_cpu_data->itm_next = new_itm; - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); } else local_cpu_data->itm_next = new_itm; @@ -477,7 +477,7 @@ void update_vsyscall(struct timespec *wall, struct clocksource *c, u32 mult) { unsigned long flags; - write_seqlock_irqsave(&fsyscall_gtod_data.lock, flags); + write_raw_seqlock_irqsave(&fsyscall_gtod_data.lock, flags); /* copy fsyscall clock data */ fsyscall_gtod_data.clk_mask = c->mask; @@ -500,6 +500,6 @@ void update_vsyscall(struct timespec *wall, struct clocksource *c, u32 mult) fsyscall_gtod_data.monotonic_time.tv_sec++; } - write_sequnlock_irqrestore(&fsyscall_gtod_data.lock, flags); + write_raw_sequnlock_irqrestore(&fsyscall_gtod_data.lock, flags); } diff --git a/arch/ia64/xen/time.c b/arch/ia64/xen/time.c index c1c5445..f681845 100644 --- a/arch/ia64/xen/time.c +++ b/arch/ia64/xen/time.c @@ -140,10 +140,10 @@ consider_steal_time(unsigned long new_itm) delta_itm += local_cpu_data->itm_delta * (stolen + blocked); if (cpu == time_keeper_id) { - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); do_timer(stolen + blocked); local_cpu_data->itm_next = delta_itm + new_itm; - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); } else { local_cpu_data->itm_next = delta_itm + new_itm; } diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c index 9cedcef..47632ca 100644 --- a/arch/m32r/kernel/time.c +++ b/arch/m32r/kernel/time.c @@ -143,7 +143,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be * called as close as possible to 500 ms before the new second starts. 
*/ - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); if (ntp_synced() && xtime.tv_sec > last_rtc_update + 660 && (xtime.tv_nsec / 1000) >= 500000 - ((unsigned)TICK_SIZE) / 2 @@ -154,7 +154,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) else /* do it again in 60 s */ last_rtc_update = xtime.tv_sec - 600; } - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); /* As we return to user mode fire off the other CPU schedulers.. this is basically because we don't yet share IRQ's around. This message is rigged to be safe on the 386 - basically it's diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c index a90acf5..f8eb60f 100644 --- a/arch/m68knommu/kernel/time.c +++ b/arch/m68knommu/kernel/time.c @@ -44,11 +44,11 @@ irqreturn_t arch_timer_interrupt(int irq, void *dummy) if (current->pid) profile_tick(CPU_PROFILING); - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); do_timer(1); - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h index ef3ec1d..72be219 100644 --- a/arch/microblaze/include/asm/prom.h +++ b/arch/microblaze/include/asm/prom.h @@ -38,7 +38,7 @@ extern struct device_node *of_chosen; #define HAVE_ARCH_DEVTREE_FIXUPS extern struct device_node *allnodes; /* temporary while merging */ -extern rwlock_t devtree_lock; /* temporary while merging */ +extern raw_spinlock_t devtree_lock; /* temporary while merging */ /* For updating the device tree at runtime */ extern void of_attach_node(struct device_node *); diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index b817df1..7305a32 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -849,12 +849,12 @@ struct device_node *of_find_node_by_phandle(phandle handle) { struct device_node *np; - read_lock(&devtree_lock); + 
raw_spin_lock(&devtree_lock); for (np = allnodes; np != NULL; np = np->allnext) if (np->linux_phandle == handle) break; of_node_get(np); - read_unlock(&devtree_lock); + raw_spin_unlock(&devtree_lock); return np; } EXPORT_SYMBOL(of_find_node_by_phandle); @@ -940,12 +940,12 @@ void of_attach_node(struct device_node *np) { unsigned long flags; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); np->sibling = np->parent->child; np->allnext = allnodes; np->parent->child = np; allnodes = np; - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); } /* @@ -958,7 +958,7 @@ void of_detach_node(struct device_node *np) struct device_node *parent; unsigned long flags; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); parent = np->parent; if (!parent) @@ -989,7 +989,7 @@ void of_detach_node(struct device_node *np) of_node_set_flag(np, OF_DETACHED); out_unlock: - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); } /* @@ -1001,18 +1001,18 @@ int prom_add_property(struct device_node *np, struct property *prop) unsigned long flags; prop->next = NULL; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); next = &np->properties; while (*next) { if (strcmp(prop->name, (*next)->name) == 0) { /* duplicate ! 
don't insert it */ - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return -1; } next = &(*next)->next; } *next = prop; - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); #ifdef CONFIG_PROC_DEVICETREE /* try to add to proc as well if it was initialized */ @@ -1035,7 +1035,7 @@ int prom_remove_property(struct device_node *np, struct property *prop) unsigned long flags; int found = 0; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); next = &np->properties; while (*next) { if (*next == prop) { @@ -1048,7 +1048,7 @@ int prom_remove_property(struct device_node *np, struct property *prop) } next = &(*next)->next; } - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); if (!found) return -ENODEV; @@ -1077,7 +1077,7 @@ int prom_update_property(struct device_node *np, unsigned long flags; int found = 0; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); next = &np->properties; while (*next) { if (*next == oldprop) { @@ -1091,7 +1091,7 @@ int prom_update_property(struct device_node *np, } next = &(*next)->next; } - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); if (!found) return -ENODEV; diff --git a/arch/mips/bcm47xx/prom.c b/arch/mips/bcm47xx/prom.c index c51405e..29d3cbf 100644 --- a/arch/mips/bcm47xx/prom.c +++ b/arch/mips/bcm47xx/prom.c @@ -141,6 +141,14 @@ static __init void prom_init_mem(void) break; } + /* Ignoring the last page when ddr size is 128M. Cached + * accesses to last page is causing the processor to prefetch + * using address above 128M stepping out of the ddr address + * space. 
+ */ + if (mem == 0x8000000) + mem -= 0x1000; + add_memory_region(0, mem, BOOT_MEM_RAM); } diff --git a/arch/mips/include/asm/i8253.h b/arch/mips/include/asm/i8253.h index 032ca73..48bb823 100644 --- a/arch/mips/include/asm/i8253.h +++ b/arch/mips/include/asm/i8253.h @@ -12,7 +12,7 @@ #define PIT_CH0 0x40 #define PIT_CH2 0x42 -extern spinlock_t i8253_lock; +extern raw_spinlock_t i8253_lock; extern void setup_pit_timer(void); diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c index ed5c441..9479406 100644 --- a/arch/mips/kernel/i8253.c +++ b/arch/mips/kernel/i8253.c @@ -15,7 +15,7 @@ #include <asm/io.h> #include <asm/time.h> -DEFINE_SPINLOCK(i8253_lock); +DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); /* @@ -26,7 +26,7 @@ EXPORT_SYMBOL(i8253_lock); static void init_pit_timer(enum clock_event_mode mode, struct clock_event_device *evt) { - spin_lock(&i8253_lock); + raw_spin_lock(&i8253_lock); switch(mode) { case CLOCK_EVT_MODE_PERIODIC: @@ -55,7 +55,7 @@ static void init_pit_timer(enum clock_event_mode mode, /* Nothing to do here */ break; } - spin_unlock(&i8253_lock); + raw_spin_unlock(&i8253_lock); } /* @@ -65,10 +65,10 @@ static void init_pit_timer(enum clock_event_mode mode, */ static int pit_next_event(unsigned long delta, struct clock_event_device *evt) { - spin_lock(&i8253_lock); + raw_spin_lock(&i8253_lock); outb_p(delta & 0xff , PIT_CH0); /* LSB */ outb(delta >> 8 , PIT_CH0); /* MSB */ - spin_unlock(&i8253_lock); + raw_spin_unlock(&i8253_lock); return 0; } @@ -137,7 +137,7 @@ static cycle_t pit_read(struct clocksource *cs) static int old_count; static u32 old_jifs; - spin_lock_irqsave(&i8253_lock, flags); + raw_spin_lock_irqsave(&i8253_lock, flags); /* * Although our caller may have the read side of xtime_lock, * this is now a seqlock, and we are cheating in this routine @@ -183,7 +183,7 @@ static cycle_t pit_read(struct clocksource *cs) old_count = count; old_jifs = jifs; - spin_unlock_irqrestore(&i8253_lock, flags); + 
raw_spin_unlock_irqrestore(&i8253_lock, flags); count = (LATCH - 1) - count; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index e97a7a2..589f1b9 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -75,7 +75,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto bad_area_nosemaphore; down_read(&mm->mmap_sem); diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index e274fda..384159b 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -1,5 +1,6 @@ #include <linux/module.h> #include <linux/highmem.h> +#include <linux/sched.h> #include <linux/smp.h> #include <asm/fixmap.h> #include <asm/tlbflush.h> @@ -45,7 +46,7 @@ void *__kmap_atomic(struct page *page, enum km_type type) enum fixed_addresses idx; unsigned long vaddr; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -71,6 +72,7 @@ void __kunmap_atomic(void *kvaddr, enum km_type type) if (vaddr < FIXADDR_START) { // FIXME pagefault_enable(); + preempt_enable(); return; } @@ -85,6 +87,7 @@ void __kunmap_atomic(void *kvaddr, enum km_type type) #endif pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(__kunmap_atomic); @@ -97,6 +100,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) enum fixed_addresses idx; unsigned long vaddr; + preempt_disable(); pagefault_disable(); debug_kmap_atomic(type); diff --git a/arch/mn10300/kernel/time.c b/arch/mn10300/kernel/time.c index 395caf0..82e6bb8 100644 --- a/arch/mn10300/kernel/time.c +++ b/arch/mn10300/kernel/time.c @@ -99,7 +99,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) { unsigned tsc, elapse; - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); while (tsc = get_cycles(), 
elapse = mn10300_last_tsc - tsc, /* time elapsed since last @@ -114,7 +114,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) check_rtc_time(); } - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); update_process_times(user_mode(get_irq_regs())); diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 524d935..f388dc6 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -18,7 +18,6 @@ config PARISC select BUG select HAVE_PERF_EVENTS select GENERIC_ATOMIC64 if !64BIT - select HAVE_ARCH_TRACEHOOK help The PA-RISC microprocessor is designed by Hewlett-Packard and used in many of their workstations & servers (HP9000 700 and 800 series, diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index a79c6f9..52c8cf7 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -163,9 +163,9 @@ irqreturn_t __irq_entry timer_interrupt(int irq, void *dev_id) } if (cpu == 0) { - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); do_timer(ticks_elapsed); - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); } return IRQ_HANDLED; @@ -268,12 +268,12 @@ void __init time_init(void) if (pdc_tod_read(&tod_data) == 0) { unsigned long flags; - write_seqlock_irqsave(&xtime_lock, flags); + write_raw_seqlock_irqsave(&xtime_lock, flags); xtime.tv_sec = tod_data.tod_sec; xtime.tv_nsec = tod_data.tod_usec * 1000; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - write_sequnlock_irqrestore(&xtime_lock, flags); + write_raw_sequnlock_irqrestore(&xtime_lock, flags); } else { printk(KERN_ERR "Error reading tod clock\n"); xtime.tv_sec = 0; diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index ba3948c..209b1b0 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -81,13 +81,6 @@ config LOCKDEP_SUPPORT bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config GENERIC_LOCKBREAK bool default y @@ -256,6 
+249,14 @@ config HIGHMEM source kernel/time/Kconfig source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "fs/Kconfig.binfmt" config HUGETLB_PAGE_SIZE_VARIABLE diff --git a/arch/powerpc/include/asm/mpic.h b/arch/powerpc/include/asm/mpic.h index a002682..61913d9 100644 --- a/arch/powerpc/include/asm/mpic.h +++ b/arch/powerpc/include/asm/mpic.h @@ -289,7 +289,7 @@ struct mpic #ifdef CONFIG_MPIC_U3_HT_IRQS /* The fixup table */ struct mpic_irq_fixup *fixups; - spinlock_t fixup_lock; + raw_spinlock_t fixup_lock; #endif /* Register access method */ diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index abe8532..df1b4cb 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -32,13 +32,13 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) #ifdef CONFIG_SMP extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift); -extern void pte_free_finish(void); +extern void pte_free_finish(struct mmu_gather *tlb); #else /* CONFIG_SMP */ static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift) { pgtable_free(table, shift); } -static inline void pte_free_finish(void) { } +static inline void pte_free_finish(struct mmu_gather *tlb) { } #endif /* !CONFIG_SMP */ static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage, diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 4986504..3b29c95 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -230,8 +230,15 @@ static inline unsigned long pte_update(struct mm_struct *mm, assert_pte_locked(mm, addr); #ifdef CONFIG_PPC_STD_MMU_64 - if (old & _PAGE_HASHPTE) + if (old & _PAGE_HASHPTE) { +#ifdef CONFIG_PREEMPT_RT + preempt_disable(); +#endif hpte_need_flush(mm, addr, ptep, old, 
huge); +#ifdef CONFIG_PREEMPT_RT + preempt_enable(); +#endif + } #endif return old; diff --git a/arch/powerpc/include/asm/pmac_feature.h b/arch/powerpc/include/asm/pmac_feature.h index 877c35a..00eedc5 100644 --- a/arch/powerpc/include/asm/pmac_feature.h +++ b/arch/powerpc/include/asm/pmac_feature.h @@ -378,7 +378,7 @@ extern struct macio_chip* macio_find(struct device_node* child, int type); * Those are exported by pmac feature for internal use by arch code * only like the platform function callbacks, do not use directly in drivers */ -extern spinlock_t feature_lock; +extern raw_spinlock_t feature_lock; extern struct device_node *uninorth_node; extern u32 __iomem *uninorth_base; diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h index 24cd928..accf580 100644 --- a/arch/powerpc/include/asm/rwsem.h +++ b/arch/powerpc/include/asm/rwsem.h @@ -21,7 +21,7 @@ /* * the semaphore definition */ -struct rw_semaphore { +struct rw_anon_semaphore { /* XXX this should be able to be an atomic_t -- paulus */ signed int count; #define RWSEM_UNLOCKED_VALUE 0x00000000 @@ -30,7 +30,7 @@ struct rw_semaphore { #define RWSEM_WAITING_BIAS (-0x00010000) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - spinlock_t wait_lock; + raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -38,43 +38,47 @@ struct rw_semaphore { }; #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } #else -# define __RWSEM_DEP_MAP_INIT(lockname) +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) #endif -#define __RWSEM_INITIALIZER(name) \ - { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } +#define __RWSEM_ANON_INITIALIZER(name) \ + { 
RWSEM_UNLOCKED_VALUE, __RAW_SPIN_LOCK_UNLOCKED((name).wait_lock), \ + LIST_HEAD_INIT((name).wait_list) __RWSEM_ANON_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_ANON_INITIALIZER(name) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_read_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_write_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_wake(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_downgrade_wake(struct rw_anon_semaphore *sem); -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, +extern void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, struct lock_class_key *key); -#define init_rwsem(sem) \ +#define init_anon_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __init_anon_rwsem((sem), #sem, &__key); \ } while (0) /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct rw_anon_semaphore *sem) { if (unlikely(atomic_inc_return((atomic_t *)(&sem->count)) <= 0)) rwsem_down_read_failed(sem); } -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct rw_anon_semaphore *sem) { int tmp; @@ -90,7 +94,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void __down_write_nested(struct rw_anon_semaphore 
*sem, int subclass) { int tmp; @@ -100,12 +104,12 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_down_write_failed(sem); } -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write(struct rw_anon_semaphore *sem) { __down_write_nested(sem, 0); } -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct rw_anon_semaphore *sem) { int tmp; @@ -117,7 +121,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct rw_anon_semaphore *sem) { int tmp; @@ -129,7 +133,7 @@ static inline void __up_read(struct rw_semaphore *sem) /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct rw_anon_semaphore *sem) { if (unlikely(atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, (atomic_t *)(&sem->count)) < 0)) @@ -139,7 +143,7 @@ static inline void __up_write(struct rw_semaphore *sem) /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct rw_anon_semaphore *sem) { atomic_add(delta, (atomic_t *)(&sem->count)); } @@ -147,7 +151,7 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct rw_anon_semaphore *sem) { int tmp; @@ -159,15 +163,59 @@ static inline void __downgrade_write(struct rw_semaphore *sem) /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct rw_anon_semaphore *sem) { return atomic_add_return(delta, (atomic_t *)(&sem->count)); } +static inline int 
anon_rwsem_is_locked(struct rw_anon_semaphore *sem) +{ + return (sem->count != 0); +} + +#ifndef CONFIG_PREEMPT_RT + +struct rw_semaphore { + /* XXX this should be able to be an atomic_t -- paulus */ + signed int count; + raw_spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +#else +# define __RWSEM_DEP_MAP_INIT(lockname) +#endif + +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, __RAW_SPIN_LOCK_UNLOCKED((name).wait_lock), \ + LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +static inline void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + __init_anon_rwsem((struct rw_anon_semaphore *)sem, name, key); +} + +#define init_rwsem(sem) \ + do { \ + static struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ + } while (0) + static inline int rwsem_is_locked(struct rw_semaphore *sem) { return (sem->count != 0); } +#endif #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_RWSEM_H */ diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index e2b428b..8f0ed7a 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -28,6 +28,16 @@ #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) +#define HAVE_ARCH_MMU_GATHER 1 + +struct pte_freelist_batch; + +struct arch_mmu_gather { + struct pte_freelist_batch *batch; +}; + +#define ARCH_MMU_GATHER_INIT (struct arch_mmu_gather){ .batch = NULL, } + extern void tlb_flush(struct mmu_gather *tlb); /* Get the generic bits... 
*/ diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h index d50a380..b594942 100644 --- a/arch/powerpc/include/asm/tlbflush.h +++ b/arch/powerpc/include/asm/tlbflush.h @@ -108,18 +108,25 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, static inline void arch_enter_lazy_mmu_mode(void) { - struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); + struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch); batch->active = 1; + + put_cpu_var(ppc64_tlb_batch); } static inline void arch_leave_lazy_mmu_mode(void) { - struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); + struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch); + + if (batch->active) { + if (batch->index) { + __flush_tlb_pending(batch); + } + batch->active = 0; + } - if (batch->index) - __flush_tlb_pending(batch); - batch->active = 0; + put_cpu_var(ppc64_tlb_batch); } #define arch_flush_lazy_mmu_mode() do {} while (0) diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 049dda6..1925982 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -96,9 +96,11 @@ void cpu_idle(void) tick_nohz_restart_sched_tick(); if (cpu_should_die()) cpu_die(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 9040330..01a1216 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -474,7 +474,7 @@ void do_softirq(void) */ static LIST_HEAD(irq_hosts); -static DEFINE_SPINLOCK(irq_big_lock); +static DEFINE_RAW_SPINLOCK(irq_big_lock); static unsigned int revmap_trees_allocated; static DEFINE_MUTEX(revmap_trees_mutex); struct irq_map_entry irq_map[NR_IRQS]; @@ -520,14 +520,14 @@ struct irq_host *irq_alloc_host(struct device_node *of_node, if (host->ops->match == NULL) host->ops->match = 
default_irq_host_match; - spin_lock_irqsave(&irq_big_lock, flags); + raw_spin_lock_irqsave(&irq_big_lock, flags); /* If it's a legacy controller, check for duplicates and * mark it as allocated (we use irq 0 host pointer for that */ if (revmap_type == IRQ_HOST_MAP_LEGACY) { if (irq_map[0].host != NULL) { - spin_unlock_irqrestore(&irq_big_lock, flags); + raw_spin_unlock_irqrestore(&irq_big_lock, flags); /* If we are early boot, we can't free the structure, * too bad... * this will be fixed once slab is made available early @@ -541,7 +541,7 @@ struct irq_host *irq_alloc_host(struct device_node *of_node, } list_add(&host->link, &irq_hosts); - spin_unlock_irqrestore(&irq_big_lock, flags); + raw_spin_unlock_irqrestore(&irq_big_lock, flags); /* Additional setups per revmap type */ switch(revmap_type) { @@ -592,13 +592,13 @@ struct irq_host *irq_find_host(struct device_node *node) * the absence of a device node. This isn't a problem so far * yet though... */ - spin_lock_irqsave(&irq_big_lock, flags); + raw_spin_lock_irqsave(&irq_big_lock, flags); list_for_each_entry(h, &irq_hosts, link) if (h->ops->match(h, node)) { found = h; break; } - spin_unlock_irqrestore(&irq_big_lock, flags); + raw_spin_unlock_irqrestore(&irq_big_lock, flags); return found; } EXPORT_SYMBOL_GPL(irq_find_host); @@ -967,7 +967,7 @@ unsigned int irq_alloc_virt(struct irq_host *host, if (count == 0 || count > (irq_virq_count - NUM_ISA_INTERRUPTS)) return NO_IRQ; - spin_lock_irqsave(&irq_big_lock, flags); + raw_spin_lock_irqsave(&irq_big_lock, flags); /* Use hint for 1 interrupt if any */ if (count == 1 && hint >= NUM_ISA_INTERRUPTS && @@ -991,7 +991,7 @@ unsigned int irq_alloc_virt(struct irq_host *host, } } if (found == NO_IRQ) { - spin_unlock_irqrestore(&irq_big_lock, flags); + raw_spin_unlock_irqrestore(&irq_big_lock, flags); return NO_IRQ; } hint_found: @@ -1000,7 +1000,7 @@ unsigned int irq_alloc_virt(struct irq_host *host, smp_wmb(); irq_map[i].host = host; } - 
spin_unlock_irqrestore(&irq_big_lock, flags); + raw_spin_unlock_irqrestore(&irq_big_lock, flags); return found; } @@ -1012,7 +1012,7 @@ void irq_free_virt(unsigned int virq, unsigned int count) WARN_ON (virq < NUM_ISA_INTERRUPTS); WARN_ON (count == 0 || (virq + count) > irq_virq_count); - spin_lock_irqsave(&irq_big_lock, flags); + raw_spin_lock_irqsave(&irq_big_lock, flags); for (i = virq; i < (virq + count); i++) { struct irq_host *host; @@ -1025,7 +1025,7 @@ void irq_free_virt(unsigned int virq, unsigned int count) smp_wmb(); irq_map[i].host = NULL; } - spin_unlock_irqrestore(&irq_big_lock, flags); + raw_spin_unlock_irqrestore(&irq_big_lock, flags); } int arch_early_irq_init(void) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index c932978..dcbf960 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -263,7 +263,7 @@ ss_probe: kcb->kprobe_status = KPROBE_HIT_SSDONE; reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); return 1; } else if (ret < 0) { /* @@ -282,7 +282,7 @@ ss_probe: return 1; no_kprobe: - preempt_enable_no_resched(); + preempt_enable(); return ret; } @@ -412,7 +412,7 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs) } reset_current_kprobe(); out: - preempt_enable_no_resched(); + preempt_enable(); /* * if somebody else is singlestepping across a probe point, msr diff --git a/arch/powerpc/kernel/pmc.c b/arch/powerpc/kernel/pmc.c index 0516e2d..461499b 100644 --- a/arch/powerpc/kernel/pmc.c +++ b/arch/powerpc/kernel/pmc.c @@ -37,7 +37,7 @@ static void dummy_perf(struct pt_regs *regs) } -static DEFINE_SPINLOCK(pmc_owner_lock); +static DEFINE_RAW_SPINLOCK(pmc_owner_lock); static void *pmc_owner_caller; /* mostly for debugging */ perf_irq_t perf_irq = dummy_perf; @@ -45,7 +45,7 @@ int reserve_pmc_hardware(perf_irq_t new_perf_irq) { int err = 0; - spin_lock(&pmc_owner_lock); + raw_spin_lock(&pmc_owner_lock); if (pmc_owner_caller) { printk(KERN_WARNING 
"reserve_pmc_hardware: " @@ -59,21 +59,21 @@ int reserve_pmc_hardware(perf_irq_t new_perf_irq) perf_irq = new_perf_irq ? new_perf_irq : dummy_perf; out: - spin_unlock(&pmc_owner_lock); + raw_spin_unlock(&pmc_owner_lock); return err; } EXPORT_SYMBOL_GPL(reserve_pmc_hardware); void release_pmc_hardware(void) { - spin_lock(&pmc_owner_lock); + raw_spin_lock(&pmc_owner_lock); WARN_ON(! pmc_owner_caller); pmc_owner_caller = NULL; perf_irq = dummy_perf; - spin_unlock(&pmc_owner_lock); + raw_spin_unlock(&pmc_owner_lock); } EXPORT_SYMBOL_GPL(release_pmc_hardware); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 7b816da..3ec4ca4 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -304,6 +304,10 @@ struct task_struct *__switch_to(struct task_struct *prev, struct thread_struct *new_thread, *old_thread; unsigned long flags; struct task_struct *last; +#if defined(CONFIG_PPC64) && defined (CONFIG_PREEMPT_RT) + struct ppc64_tlb_batch *batch; + int hadbatch = 0; +#endif #ifdef CONFIG_SMP /* avoid complexity of lazy save/restore of fpu @@ -396,6 +400,17 @@ struct task_struct *__switch_to(struct task_struct *prev, old_thread->accum_tb += (current_tb - start_tb); new_thread->start_tb = current_tb; } + +#ifdef CONFIG_PREEMPT_RT + batch = &__get_cpu_var(ppc64_tlb_batch); + if (batch->active) { + hadbatch = 1; + if (batch->index) { + __flush_tlb_pending(batch); + } + batch->active = 0; + } +#endif /* #ifdef CONFIG_PREEMPT_RT */ #endif local_irq_save(flags); @@ -414,6 +429,13 @@ struct task_struct *__switch_to(struct task_struct *prev, local_irq_restore(flags); +#if defined(CONFIG_PPC64) && defined(CONFIG_PREEMPT_RT) + if (hadbatch) { + batch = &__get_cpu_var(ppc64_tlb_batch); + batch->active = 1; + } +#endif + return last; } diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 4ec3008..a8b952e 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -81,7 +81,7 @@ struct boot_param_header 
*initial_boot_params; extern struct device_node *allnodes; /* temporary while merging */ -extern rwlock_t devtree_lock; /* temporary while merging */ +extern raw_spinlock_t devtree_lock; /* temporary while merging */ /* export that to outside world */ struct device_node *of_chosen; @@ -1275,12 +1275,12 @@ struct device_node *of_find_node_by_phandle(phandle handle) { struct device_node *np; - read_lock(&devtree_lock); + raw_spin_lock(&devtree_lock); for (np = allnodes; np != 0; np = np->allnext) if (np->linux_phandle == handle) break; of_node_get(np); - read_unlock(&devtree_lock); + raw_spin_unlock(&devtree_lock); return np; } EXPORT_SYMBOL(of_find_node_by_phandle); @@ -1396,12 +1396,12 @@ void of_attach_node(struct device_node *np) { unsigned long flags; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); np->sibling = np->parent->child; np->allnext = allnodes; np->parent->child = np; allnodes = np; - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); } /* @@ -1414,7 +1414,7 @@ void of_detach_node(struct device_node *np) struct device_node *parent; unsigned long flags; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); parent = np->parent; if (!parent) @@ -1445,7 +1445,7 @@ void of_detach_node(struct device_node *np) of_node_set_flag(np, OF_DETACHED); out_unlock: - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); } #ifdef CONFIG_PPC_PSERIES @@ -1529,18 +1529,18 @@ int prom_add_property(struct device_node* np, struct property* prop) unsigned long flags; prop->next = NULL; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); next = &np->properties; while (*next) { if (strcmp(prop->name, (*next)->name) == 0) { /* duplicate ! 
don't insert it */ - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return -1; } next = &(*next)->next; } *next = prop; - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); #ifdef CONFIG_PROC_DEVICETREE /* try to add to proc as well if it was initialized */ @@ -1563,7 +1563,7 @@ int prom_remove_property(struct device_node *np, struct property *prop) unsigned long flags; int found = 0; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); next = &np->properties; while (*next) { if (*next == prop) { @@ -1576,7 +1576,7 @@ int prom_remove_property(struct device_node *np, struct property *prop) } next = &(*next)->next; } - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); if (!found) return -ENODEV; @@ -1605,7 +1605,7 @@ int prom_update_property(struct device_node *np, unsigned long flags; int found = 0; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); next = &np->properties; while (*next) { if (*next == oldprop) { @@ -1619,7 +1619,7 @@ int prom_update_property(struct device_node *np, } next = &(*next)->next; } - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); if (!found) return -ENODEV; diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 2e4832a..0059a8f 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -209,7 +209,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal) break; case ERR_TYPE_KERNEL_PANIC: default: - WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ + WARN_ON_ONCE_NONRT(!irqs_disabled()); /* @@@ DEBUG @@@ */ spin_unlock_irqrestore(&rtasd_log_lock, s); return; } @@ -231,7 +231,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal) /* Check to see if we need to or have stopped logging */ if 
(fatal || !logging_enabled) { logging_enabled = 0; - WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ + WARN_ON_ONCE_NONRT(!irqs_disabled()); /* @@@ DEBUG @@@ */ spin_unlock_irqrestore(&rtasd_log_lock, s); return; } @@ -254,13 +254,13 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal) else rtas_log_start += 1; - WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ + WARN_ON_ONCE_NONRT(!irqs_disabled()); /* @@@ DEBUG @@@ */ spin_unlock_irqrestore(&rtasd_log_lock, s); wake_up_interruptible(&rtas_log_wait); break; case ERR_TYPE_KERNEL_PANIC: default: - WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ + WARN_ON_ONCE_NONRT(!irqs_disabled()); /* @@@ DEBUG @@@ */ spin_unlock_irqrestore(&rtasd_log_lock, s); return; } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 6c6093d..d5f6048 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -1049,7 +1049,7 @@ void __init time_init(void) /* Save the current timebase to pretty up CONFIG_PRINTK_TIME */ boot_tb = get_tb_or_rtc(); - write_seqlock_irqsave(&xtime_lock, flags); + write_raw_seqlock_irqsave(&xtime_lock, flags); /* If platform provided a timezone (pmac), we correct the time */ if (timezone_offset) { @@ -1063,7 +1063,7 @@ void __init time_init(void) vdso_data->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; vdso_data->tb_to_xs = tb_to_xs; - write_sequnlock_irqrestore(&xtime_lock, flags); + write_raw_sequnlock_irqrestore(&xtime_lock, flags); /* Start the decrementer on CPUs that have manual control * such as BookE diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index d069ff8..4f99a3a 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -102,11 +102,11 @@ static inline void pmac_backlight_unblank(void) { } int die(const char *str, struct pt_regs *regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = 
__SPIN_LOCK_UNLOCKED(die.lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; @@ -120,7 +120,7 @@ int die(const char *str, struct pt_regs *regs, long err) if (die.lock_owner != raw_smp_processor_id()) { console_verbose(); - spin_lock_irqsave(&die.lock, flags); + raw_spin_lock_irqsave(&die.lock, flags); die.lock_owner = smp_processor_id(); die.lock_owner_depth = 0; bust_spinlocks(1); @@ -155,7 +155,7 @@ int die(const char *str, struct pt_regs *regs, long err) bust_spinlocks(0); die.lock_owner = -1; add_taint(TAINT_DIE); - spin_unlock_irqrestore(&die.lock, flags); + raw_spin_unlock_irqrestore(&die.lock, flags); if (kexec_should_crash(current) || kexec_sr_activated(smp_processor_id())) @@ -202,6 +202,11 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) addr, regs->nip, regs->link, code); } +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + memset(&info, 0, sizeof(info)); info.si_signo = signr; info.si_code = code; diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c index 58e14fb..55c7908 100644 --- a/arch/powerpc/lib/locks.c +++ b/arch/powerpc/lib/locks.c @@ -86,8 +86,10 @@ void arch_spin_unlock_wait(arch_spinlock_t *lock) { while (lock->slock) { HMT_low(); + preempt_disable(); if (SHARED_PROCESSOR) __spin_yield(lock); + preempt_enable(); } HMT_medium(); } diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 26fb6b9..d89a78a 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -159,7 +159,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, } #endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/ - if (in_atomic() || mm == NULL) { + if (in_atomic() || mm == NULL || current->pagefault_disabled) { if (!user_mode(regs)) return SIGSEGV; /* in_atomic() in user mode is really bad, diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 056d23a..2cecbf5 100644 --- 
a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -37,7 +37,7 @@ #define HPTE_LOCK_BIT 3 -static DEFINE_SPINLOCK(native_tlbie_lock); +static DEFINE_RAW_SPINLOCK(native_tlbie_lock); static inline void __tlbie(unsigned long va, int psize, int ssize) { @@ -104,7 +104,7 @@ static inline void tlbie(unsigned long va, int psize, int ssize, int local) if (use_local) use_local = mmu_psize_defs[psize].tlbiel; if (lock_tlbie && !use_local) - spin_lock(&native_tlbie_lock); + raw_spin_lock(&native_tlbie_lock); asm volatile("ptesync": : :"memory"); if (use_local) { __tlbiel(va, psize, ssize); @@ -114,7 +114,7 @@ static inline void tlbie(unsigned long va, int psize, int ssize, int local) asm volatile("eieio; tlbsync; ptesync": : :"memory"); } if (lock_tlbie && !use_local) - spin_unlock(&native_tlbie_lock); + raw_spin_unlock(&native_tlbie_lock); } static inline void native_lock_hpte(struct hash_pte *hptep) @@ -434,7 +434,7 @@ static void native_hpte_clear(void) /* we take the tlbie lock and hold it. Some hardware will * deadlock if we try to tlbie from two processors at once. 
*/ - spin_lock(&native_tlbie_lock); + raw_spin_lock(&native_tlbie_lock); slots = pteg_count * HPTES_PER_GROUP; @@ -458,7 +458,7 @@ static void native_hpte_clear(void) } asm volatile("eieio; tlbsync; ptesync":::"memory"); - spin_unlock(&native_tlbie_lock); + raw_spin_unlock(&native_tlbie_lock); local_irq_restore(flags); } @@ -521,7 +521,7 @@ static void native_flush_hash_range(unsigned long number, int local) int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); if (lock_tlbie) - spin_lock(&native_tlbie_lock); + raw_spin_lock(&native_tlbie_lock); asm volatile("ptesync":::"memory"); for (i = 0; i < number; i++) { @@ -536,7 +536,7 @@ static void native_flush_hash_range(unsigned long number, int local) asm volatile("eieio; tlbsync; ptesync":::"memory"); if (lock_tlbie) - spin_unlock(&native_tlbie_lock); + raw_spin_unlock(&native_tlbie_lock); } local_irq_restore(flags); diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index c2186c7..81310e2 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -35,6 +35,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) unsigned long vaddr; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -73,5 +74,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type) local_flush_tlb_page(NULL, vaddr); #endif pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(kunmap_atomic); diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 1044a63..dbc6921 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -56,7 +56,7 @@ static unsigned int next_context, nr_free_contexts; static unsigned long *context_map; static unsigned long *stale_map[NR_CPUS]; static struct mm_struct **context_mm; -static DEFINE_SPINLOCK(context_lock); +static DEFINE_RAW_SPINLOCK(context_lock); #define CTX_MAP_SIZE 
\ (sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1)) @@ -121,9 +121,9 @@ static unsigned int steal_context_smp(unsigned int id) /* This will happen if you have more CPUs than available contexts, * all we can do here is wait a bit and try again */ - spin_unlock(&context_lock); + raw_spin_unlock(&context_lock); cpu_relax(); - spin_lock(&context_lock); + raw_spin_lock(&context_lock); /* This will cause the caller to try again */ return MMU_NO_CONTEXT; @@ -194,7 +194,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) unsigned long *map; /* No lockless fast path .. yet */ - spin_lock(&context_lock); + raw_spin_lock(&context_lock); pr_hard("[%d] activating context for mm @%p, active=%d, id=%d", cpu, next, next->context.active, next->context.id); @@ -278,7 +278,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) /* Flick the MMU and release lock */ pr_hardcont(" -> %d\n", id); set_context(id, next->pgd); - spin_unlock(&context_lock); + raw_spin_unlock(&context_lock); } /* @@ -307,7 +307,7 @@ void destroy_context(struct mm_struct *mm) WARN_ON(mm->context.active != 0); - spin_lock_irqsave(&context_lock, flags); + raw_spin_lock_irqsave(&context_lock, flags); id = mm->context.id; if (id != MMU_NO_CONTEXT) { __clear_bit(id, context_map); @@ -318,7 +318,7 @@ void destroy_context(struct mm_struct *mm) context_mm[id] = NULL; nr_free_contexts++; } - spin_unlock_irqrestore(&context_lock, flags); + raw_spin_unlock_irqrestore(&context_lock, flags); } #ifdef CONFIG_SMP diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 99df697..4243a84 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -32,8 +32,6 @@ #include "mmu_decl.h" -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - #ifdef CONFIG_SMP /* @@ -42,7 +40,6 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); * freeing a page table page that is being walked without locks */ -static DEFINE_PER_CPU(struct pte_freelist_batch *, 
pte_freelist_cur); static unsigned long pte_freelist_forced_free; struct pte_freelist_batch @@ -97,12 +94,12 @@ static void pte_free_submit(struct pte_freelist_batch *batch) void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift) { - /* This is safe since tlb_gather_mmu has disabled preemption */ - struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); + struct pte_freelist_batch **batchp = &tlb->arch.batch; unsigned long pgf; - if (atomic_read(&tlb->mm->mm_users) < 2 || - cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){ + /* CHECKME */ + + if (atomic_read(&tlb->mm->mm_users) < 2) { pgtable_free(table, shift); return; } @@ -124,15 +121,14 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift) } } -void pte_free_finish(void) +void pte_free_finish(struct mmu_gather *tlb) { - /* This is safe since tlb_gather_mmu has disabled preemption */ - struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); + struct pte_freelist_batch **batchp = &tlb->arch.batch; - if (*batchp == NULL) - return; - pte_free_submit(*batchp); - *batchp = NULL; + if (*batchp) { + pte_free_submit(*batchp); + *batchp = NULL; + } } #endif /* CONFIG_SMP */ diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c index 8aaa8b7..3b0b3d8 100644 --- a/arch/powerpc/mm/tlb_hash32.c +++ b/arch/powerpc/mm/tlb_hash32.c @@ -73,7 +73,7 @@ void tlb_flush(struct mmu_gather *tlb) } /* Push out batch of freed page tables */ - pte_free_finish(); + pte_free_finish(tlb); } /* diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 1ec0657..f290f6c 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -30,6 +30,7 @@ #include <asm/tlbflush.h> #include <asm/tlb.h> #include <asm/bug.h> +#include <asm/machdep.h> DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); @@ -44,7 +45,7 @@ DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); void hpte_need_flush(struct 
mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pte, int huge) { - struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); + struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch); unsigned long vsid, vaddr; unsigned int psize; int ssize; @@ -99,6 +100,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, */ if (!batch->active) { flush_hash_page(vaddr, rpte, psize, ssize, 0); + put_cpu_var(ppc64_tlb_batch); return; } @@ -125,8 +127,22 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, batch->pte[i] = rpte; batch->vaddr[i] = vaddr; batch->index = ++i; + +#ifdef CONFIG_PREEMPT_RT + /* + * Since flushing tlb needs expensive hypervisor call(s) on celleb, + * always flush it on RT to reduce scheduling latency. + */ + if (machine_is(celleb)) { + __flush_tlb_pending(batch); + put_cpu_var(ppc64_tlb_batch); + return; + } +#endif /* CONFIG_PREEMPT_RT */ + if (i >= PPC64_TLB_BATCH_NR) __flush_tlb_pending(batch); + put_cpu_var(ppc64_tlb_batch); } /* @@ -155,7 +171,7 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) void tlb_flush(struct mmu_gather *tlb) { - struct ppc64_tlb_batch *tlbbatch = &__get_cpu_var(ppc64_tlb_batch); + struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch); /* If there's a TLB batch pending, then we must flush it because the * pages are going to be freed and we really don't want to have a CPU @@ -165,7 +181,8 @@ void tlb_flush(struct mmu_gather *tlb) __flush_tlb_pending(tlbbatch); /* Push out batch of freed page tables */ - pte_free_finish(); + put_cpu_var(ppc64_tlb_batch); + pte_free_finish(tlb); } /** diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index 2fbc680..c242127 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -150,7 +150,7 @@ EXPORT_SYMBOL(local_flush_tlb_page); */ #ifdef CONFIG_SMP -static DEFINE_SPINLOCK(tlbivax_lock); +static DEFINE_RAW_SPINLOCK(tlbivax_lock); static int mm_is_core_local(struct mm_struct 
*mm) { @@ -232,10 +232,10 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); if (lock) - spin_lock(&tlbivax_lock); + raw_spin_lock(&tlbivax_lock); _tlbivax_bcast(vmaddr, pid, tsize, ind); if (lock) - spin_unlock(&tlbivax_lock); + raw_spin_unlock(&tlbivax_lock); goto bail; } else { struct tlb_flush_param p = { @@ -274,7 +274,9 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) _tlbil_pid(0); preempt_enable(); #else + preempt_disable(); _tlbil_pid(0); + preempt_enable(); #endif } EXPORT_SYMBOL(flush_tlb_kernel_range); @@ -298,7 +300,7 @@ void tlb_flush(struct mmu_gather *tlb) flush_tlb_mm(tlb->mm); /* Push out batch of freed page tables */ - pte_free_finish(); + pte_free_finish(tlb); } /* diff --git a/arch/powerpc/platforms/85xx/mpc85xx_mds.c b/arch/powerpc/platforms/85xx/mpc85xx_mds.c index 21f61b8..cc29c0f 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_mds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_mds.c @@ -338,7 +338,8 @@ static void __init mpc85xx_mds_pic_init(void) } mpic = mpic_alloc(np, r.start, - MPIC_PRIMARY | MPIC_WANTS_RESET | MPIC_BIG_ENDIAN, + MPIC_PRIMARY | MPIC_WANTS_RESET | MPIC_BIG_ENDIAN | + MPIC_BROKEN_FRR_NIRQS, 0, 256, " OpenPIC "); BUG_ON(mpic == NULL); of_node_put(np); diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index 04160a4..a15f582 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -46,6 +46,7 @@ smp_85xx_kick_cpu(int nr) __iomem u32 *bptr_vaddr; struct device_node *np; int n = 0; + int ioremappable; WARN_ON (nr < 0 || nr >= NR_CPUS); @@ -59,21 +60,37 @@ smp_85xx_kick_cpu(int nr) return; } + /* + * A secondary core could be in a spinloop in the bootpage + * (0xfffff000), somewhere in highmem, or somewhere in lowmem. 
+ * The bootpage and highmem can be accessed via ioremap(), but + * we need to directly access the spinloop if it's in lowmem. + */ + ioremappable = *cpu_rel_addr > virt_to_phys(high_memory); + /* Map the spin table */ - bptr_vaddr = ioremap(*cpu_rel_addr, SIZE_BOOT_ENTRY); + if (ioremappable) + bptr_vaddr = ioremap(*cpu_rel_addr, SIZE_BOOT_ENTRY); + else + bptr_vaddr = phys_to_virt(*cpu_rel_addr); local_irq_save(flags); out_be32(bptr_vaddr + BOOT_ENTRY_PIR, nr); out_be32(bptr_vaddr + BOOT_ENTRY_ADDR_LOWER, __pa(__early_start)); + if (!ioremappable) + flush_dcache_range((ulong)bptr_vaddr, + (ulong)(bptr_vaddr + SIZE_BOOT_ENTRY)); + /* Wait a bit for the CPU to ack. */ while ((__secondary_hold_acknowledge != nr) && (++n < 1000)) mdelay(1); local_irq_restore(flags); - iounmap(bptr_vaddr); + if (ioremappable) + iounmap(bptr_vaddr); pr_debug("waited %d msecs for CPU #%d.\n", n, nr); } diff --git a/arch/powerpc/platforms/cell/beat_htab.c b/arch/powerpc/platforms/cell/beat_htab.c index 35b1ec4..2516c1c 100644 --- a/arch/powerpc/platforms/cell/beat_htab.c +++ b/arch/powerpc/platforms/cell/beat_htab.c @@ -40,7 +40,7 @@ #define DBG_LOW(fmt...) 
do { } while (0) #endif -static DEFINE_SPINLOCK(beat_htab_lock); +static DEFINE_RAW_SPINLOCK(beat_htab_lock); static inline unsigned int beat_read_mask(unsigned hpte_group) { @@ -114,18 +114,18 @@ static long beat_lpar_hpte_insert(unsigned long hpte_group, if (rflags & _PAGE_NO_CACHE) hpte_r &= ~_PAGE_COHERENT; - spin_lock(&beat_htab_lock); + raw_spin_lock(&beat_htab_lock); lpar_rc = beat_read_mask(hpte_group); if (lpar_rc == 0) { if (!(vflags & HPTE_V_BOLTED)) DBG_LOW(" full\n"); - spin_unlock(&beat_htab_lock); + raw_spin_unlock(&beat_htab_lock); return -1; } lpar_rc = beat_insert_htab_entry(0, hpte_group, lpar_rc << 48, hpte_v, hpte_r, &slot); - spin_unlock(&beat_htab_lock); + raw_spin_unlock(&beat_htab_lock); /* * Since we try and ioremap PHBs we don't own, the pte insert @@ -198,17 +198,17 @@ static long beat_lpar_hpte_updatepp(unsigned long slot, "avpnv=%016lx, slot=%016lx, psize: %d, newpp %016lx ... ", want_v & HPTE_V_AVPN, slot, psize, newpp); - spin_lock(&beat_htab_lock); + raw_spin_lock(&beat_htab_lock); dummy0 = beat_lpar_hpte_getword0(slot); if ((dummy0 & ~0x7FUL) != (want_v & ~0x7FUL)) { DBG_LOW("not found !\n"); - spin_unlock(&beat_htab_lock); + raw_spin_unlock(&beat_htab_lock); return -1; } lpar_rc = beat_write_htab_entry(0, slot, 0, newpp, 0, 7, &dummy0, &dummy1); - spin_unlock(&beat_htab_lock); + raw_spin_unlock(&beat_htab_lock); if (lpar_rc != 0 || dummy0 == 0) { DBG_LOW("not found !\n"); return -1; @@ -262,13 +262,13 @@ static void beat_lpar_hpte_updateboltedpp(unsigned long newpp, vsid = get_kernel_vsid(ea, MMU_SEGSIZE_256M); va = (vsid << 28) | (ea & 0x0fffffff); - spin_lock(&beat_htab_lock); + raw_spin_lock(&beat_htab_lock); slot = beat_lpar_hpte_find(va, psize); BUG_ON(slot == -1); lpar_rc = beat_write_htab_entry(0, slot, 0, newpp, 0, 7, &dummy0, &dummy1); - spin_unlock(&beat_htab_lock); + raw_spin_unlock(&beat_htab_lock); BUG_ON(lpar_rc != 0); } @@ -285,18 +285,18 @@ static void beat_lpar_hpte_invalidate(unsigned long slot, unsigned long va, 
slot, va, psize, local); want_v = hpte_encode_v(va, psize, MMU_SEGSIZE_256M); - spin_lock_irqsave(&beat_htab_lock, flags); + raw_spin_lock_irqsave(&beat_htab_lock, flags); dummy1 = beat_lpar_hpte_getword0(slot); if ((dummy1 & ~0x7FUL) != (want_v & ~0x7FUL)) { DBG_LOW("not found !\n"); - spin_unlock_irqrestore(&beat_htab_lock, flags); + raw_spin_unlock_irqrestore(&beat_htab_lock, flags); return; } lpar_rc = beat_write_htab_entry(0, slot, 0, 0, HPTE_V_VALID, 0, &dummy1, &dummy2); - spin_unlock_irqrestore(&beat_htab_lock, flags); + raw_spin_unlock_irqrestore(&beat_htab_lock, flags); BUG_ON(lpar_rc != 0); } diff --git a/arch/powerpc/platforms/cell/beat_interrupt.c b/arch/powerpc/platforms/cell/beat_interrupt.c index 36052a9..cfe3b96 100644 --- a/arch/powerpc/platforms/cell/beat_interrupt.c +++ b/arch/powerpc/platforms/cell/beat_interrupt.c @@ -30,7 +30,7 @@ #include "beat_wrapper.h" #define MAX_IRQS NR_IRQS -static DEFINE_SPINLOCK(beatic_irq_mask_lock); +static DEFINE_RAW_SPINLOCK(beatic_irq_mask_lock); static uint64_t beatic_irq_mask_enable[(MAX_IRQS+255)/64]; static uint64_t beatic_irq_mask_ack[(MAX_IRQS+255)/64]; @@ -65,30 +65,30 @@ static void beatic_mask_irq(unsigned int irq_plug) { unsigned long flags; - spin_lock_irqsave(&beatic_irq_mask_lock, flags); + raw_spin_lock_irqsave(&beatic_irq_mask_lock, flags); beatic_irq_mask_enable[irq_plug/64] &= ~(1UL << (63 - (irq_plug%64))); beatic_update_irq_mask(irq_plug); - spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); + raw_spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); } static void beatic_unmask_irq(unsigned int irq_plug) { unsigned long flags; - spin_lock_irqsave(&beatic_irq_mask_lock, flags); + raw_spin_lock_irqsave(&beatic_irq_mask_lock, flags); beatic_irq_mask_enable[irq_plug/64] |= 1UL << (63 - (irq_plug%64)); beatic_update_irq_mask(irq_plug); - spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); + raw_spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); } static void beatic_ack_irq(unsigned int 
irq_plug) { unsigned long flags; - spin_lock_irqsave(&beatic_irq_mask_lock, flags); + raw_spin_lock_irqsave(&beatic_irq_mask_lock, flags); beatic_irq_mask_ack[irq_plug/64] &= ~(1UL << (63 - (irq_plug%64))); beatic_update_irq_mask(irq_plug); - spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); + raw_spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); } static void beatic_end_irq(unsigned int irq_plug) @@ -103,10 +103,10 @@ static void beatic_end_irq(unsigned int irq_plug) printk(KERN_ERR "IRQ over-downcounted, plug %d\n", irq_plug); } - spin_lock_irqsave(&beatic_irq_mask_lock, flags); + raw_spin_lock_irqsave(&beatic_irq_mask_lock, flags); beatic_irq_mask_ack[irq_plug/64] |= 1UL << (63 - (irq_plug%64)); beatic_update_irq_mask(irq_plug); - spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); + raw_spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); } static struct irq_chip beatic_pic = { diff --git a/arch/powerpc/platforms/chrp/time.c b/arch/powerpc/platforms/chrp/time.c index 054dfe5..8f1d8cd 100644 --- a/arch/powerpc/platforms/chrp/time.c +++ b/arch/powerpc/platforms/chrp/time.c @@ -83,7 +83,12 @@ int chrp_set_rtc_time(struct rtc_time *tmarg) unsigned char save_control, save_freq_select; struct rtc_time tm = *tmarg; +#ifdef CONFIG_PREEMPT_RT /* on RT, give up with -1 instead of blocking on rtc_lock */ + if (!spin_trylock(&rtc_lock)) + return -1; +#else spin_lock(&rtc_lock); +#endif save_control = chrp_cmos_clock_read(RTC_CONTROL); /* tell the clock it's being set */ diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c index fbc9bbd..424b633 100644 --- a/arch/powerpc/platforms/powermac/feature.c +++ b/arch/powerpc/platforms/powermac/feature.c @@ -59,10 +59,10 @@ extern struct device_node *k2_skiplist[2]; * We use a single global lock to protect accesses. 
Each driver has * to take care of its own locking */ -DEFINE_SPINLOCK(feature_lock); +DEFINE_RAW_SPINLOCK(feature_lock); -#define LOCK(flags) spin_lock_irqsave(&feature_lock, flags); -#define UNLOCK(flags) spin_unlock_irqrestore(&feature_lock, flags); +#define LOCK(flags) raw_spin_lock_irqsave(&feature_lock, flags); +#define UNLOCK(flags) raw_spin_unlock_irqrestore(&feature_lock, flags); /* diff --git a/arch/powerpc/platforms/powermac/nvram.c b/arch/powerpc/platforms/powermac/nvram.c index c6f0f9e..80a5258 100644 --- a/arch/powerpc/platforms/powermac/nvram.c +++ b/arch/powerpc/platforms/powermac/nvram.c @@ -80,7 +80,7 @@ static int is_core_99; static int core99_bank = 0; static int nvram_partitions[3]; // XXX Turn that into a sem -static DEFINE_SPINLOCK(nv_lock); +static DEFINE_RAW_SPINLOCK(nv_lock); static int (*core99_write_bank)(int bank, u8* datas); static int (*core99_erase_bank)(int bank); @@ -165,10 +165,10 @@ static unsigned char indirect_nvram_read_byte(int addr) unsigned char val; unsigned long flags; - spin_lock_irqsave(&nv_lock, flags); + raw_spin_lock_irqsave(&nv_lock, flags); out_8(nvram_addr, addr >> 5); val = in_8(&nvram_data[(addr & 0x1f) << 4]); - spin_unlock_irqrestore(&nv_lock, flags); + raw_spin_unlock_irqrestore(&nv_lock, flags); return val; } @@ -177,10 +177,10 @@ static void indirect_nvram_write_byte(int addr, unsigned char val) { unsigned long flags; - spin_lock_irqsave(&nv_lock, flags); + raw_spin_lock_irqsave(&nv_lock, flags); out_8(nvram_addr, addr >> 5); out_8(&nvram_data[(addr & 0x1f) << 4], val); - spin_unlock_irqrestore(&nv_lock, flags); + raw_spin_unlock_irqrestore(&nv_lock, flags); } @@ -481,7 +481,7 @@ static void core99_nvram_sync(void) if (!is_core_99 || !nvram_data || !nvram_image) return; - spin_lock_irqsave(&nv_lock, flags); + raw_spin_lock_irqsave(&nv_lock, flags); if (!memcmp(nvram_image, (u8*)nvram_data + core99_bank*NVRAM_SIZE, NVRAM_SIZE)) goto bail; @@ -503,7 +503,7 @@ static void core99_nvram_sync(void) if 
(core99_write_bank(core99_bank, nvram_image)) printk("nvram: Error writing bank %d\n", core99_bank); bail: - spin_unlock_irqrestore(&nv_lock, flags); + raw_spin_unlock_irqrestore(&nv_lock, flags); #ifdef DEBUG mdelay(2000); diff --git a/arch/powerpc/platforms/powermac/pfunc_base.c b/arch/powerpc/platforms/powermac/pfunc_base.c index db20de5..f5e3cda 100644 --- a/arch/powerpc/platforms/powermac/pfunc_base.c +++ b/arch/powerpc/platforms/powermac/pfunc_base.c @@ -50,13 +50,13 @@ static int macio_do_gpio_write(PMF_STD_ARGS, u8 value, u8 mask) value = ~value; /* Toggle the GPIO */ - spin_lock_irqsave(&feature_lock, flags); + raw_spin_lock_irqsave(&feature_lock, flags); tmp = readb(addr); tmp = (tmp & ~mask) | (value & mask); DBG("Do write 0x%02x to GPIO %s (%p)\n", tmp, func->node->full_name, addr); writeb(tmp, addr); - spin_unlock_irqrestore(&feature_lock, flags); + raw_spin_unlock_irqrestore(&feature_lock, flags); return 0; } @@ -145,9 +145,9 @@ static int macio_do_write_reg32(PMF_STD_ARGS, u32 offset, u32 value, u32 mask) struct macio_chip *macio = func->driver_data; unsigned long flags; - spin_lock_irqsave(&feature_lock, flags); + raw_spin_lock_irqsave(&feature_lock, flags); MACIO_OUT32(offset, (MACIO_IN32(offset) & ~mask) | (value & mask)); - spin_unlock_irqrestore(&feature_lock, flags); + raw_spin_unlock_irqrestore(&feature_lock, flags); return 0; } @@ -168,9 +168,9 @@ static int macio_do_write_reg8(PMF_STD_ARGS, u32 offset, u8 value, u8 mask) struct macio_chip *macio = func->driver_data; unsigned long flags; - spin_lock_irqsave(&feature_lock, flags); + raw_spin_lock_irqsave(&feature_lock, flags); MACIO_OUT8(offset, (MACIO_IN8(offset) & ~mask) | (value & mask)); - spin_unlock_irqrestore(&feature_lock, flags); + raw_spin_unlock_irqrestore(&feature_lock, flags); return 0; } @@ -223,12 +223,12 @@ static int macio_do_write_reg32_slm(PMF_STD_ARGS, u32 offset, u32 shift, if (args == NULL || args->count == 0) return -EINVAL; - spin_lock_irqsave(&feature_lock, flags); + 
raw_spin_lock_irqsave(&feature_lock, flags); tmp = MACIO_IN32(offset); val = args->u[0].v << shift; tmp = (tmp & ~mask) | (val & mask); MACIO_OUT32(offset, tmp); - spin_unlock_irqrestore(&feature_lock, flags); + raw_spin_unlock_irqrestore(&feature_lock, flags); return 0; } @@ -243,12 +243,12 @@ static int macio_do_write_reg8_slm(PMF_STD_ARGS, u32 offset, u32 shift, if (args == NULL || args->count == 0) return -EINVAL; - spin_lock_irqsave(&feature_lock, flags); + raw_spin_lock_irqsave(&feature_lock, flags); tmp = MACIO_IN8(offset); val = args->u[0].v << shift; tmp = (tmp & ~mask) | (val & mask); MACIO_OUT8(offset, tmp); - spin_unlock_irqrestore(&feature_lock, flags); + raw_spin_unlock_irqrestore(&feature_lock, flags); return 0; } @@ -278,12 +278,12 @@ static int unin_do_write_reg32(PMF_STD_ARGS, u32 offset, u32 value, u32 mask) { unsigned long flags; - spin_lock_irqsave(&feature_lock, flags); + raw_spin_lock_irqsave(&feature_lock, flags); /* This is fairly bogus in darwin, but it should work for our needs * implemeted that way: */ UN_OUT(offset, (UN_IN(offset) & ~mask) | (value & mask)); - spin_unlock_irqrestore(&feature_lock, flags); + raw_spin_unlock_irqrestore(&feature_lock, flags); return 0; } diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index 09e8272..e2364cf 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -57,7 +57,7 @@ static int max_irqs; static int max_real_irqs; static u32 level_mask[4]; -static DEFINE_SPINLOCK(pmac_pic_lock); +static DEFINE_RAW_SPINLOCK(pmac_pic_lock); #define NR_MASK_WORDS ((NR_IRQS + 31) / 32) static unsigned long ppc_lost_interrupts[NR_MASK_WORDS]; @@ -85,7 +85,7 @@ static void pmac_mask_and_ack_irq(unsigned int virq) int i = src >> 5; unsigned long flags; - spin_lock_irqsave(&pmac_pic_lock, flags); + raw_spin_lock_irqsave(&pmac_pic_lock, flags); __clear_bit(src, ppc_cached_irq_mask); if (__test_and_clear_bit(src, ppc_lost_interrupts)) 
atomic_dec(&ppc_n_lost_interrupts); @@ -97,7 +97,7 @@ static void pmac_mask_and_ack_irq(unsigned int virq) mb(); } while((in_le32(&pmac_irq_hw[i]->enable) & bit) != (ppc_cached_irq_mask[i] & bit)); - spin_unlock_irqrestore(&pmac_pic_lock, flags); + raw_spin_unlock_irqrestore(&pmac_pic_lock, flags); } static void pmac_ack_irq(unsigned int virq) @@ -107,12 +107,12 @@ static void pmac_ack_irq(unsigned int virq) int i = src >> 5; unsigned long flags; - spin_lock_irqsave(&pmac_pic_lock, flags); + raw_spin_lock_irqsave(&pmac_pic_lock, flags); if (__test_and_clear_bit(src, ppc_lost_interrupts)) atomic_dec(&ppc_n_lost_interrupts); out_le32(&pmac_irq_hw[i]->ack, bit); (void)in_le32(&pmac_irq_hw[i]->ack); - spin_unlock_irqrestore(&pmac_pic_lock, flags); + raw_spin_unlock_irqrestore(&pmac_pic_lock, flags); } static void __pmac_set_irq_mask(unsigned int irq_nr, int nokicklost) @@ -152,12 +152,12 @@ static unsigned int pmac_startup_irq(unsigned int virq) unsigned long bit = 1UL << (src & 0x1f); int i = src >> 5; - spin_lock_irqsave(&pmac_pic_lock, flags); + raw_spin_lock_irqsave(&pmac_pic_lock, flags); if ((irq_to_desc(virq)->status & IRQ_LEVEL) == 0) out_le32(&pmac_irq_hw[i]->ack, bit); __set_bit(src, ppc_cached_irq_mask); __pmac_set_irq_mask(src, 0); - spin_unlock_irqrestore(&pmac_pic_lock, flags); + raw_spin_unlock_irqrestore(&pmac_pic_lock, flags); return 0; } @@ -167,10 +167,10 @@ static void pmac_mask_irq(unsigned int virq) unsigned long flags; unsigned int src = irq_map[virq].hwirq; - spin_lock_irqsave(&pmac_pic_lock, flags); + raw_spin_lock_irqsave(&pmac_pic_lock, flags); __clear_bit(src, ppc_cached_irq_mask); __pmac_set_irq_mask(src, 1); - spin_unlock_irqrestore(&pmac_pic_lock, flags); + raw_spin_unlock_irqrestore(&pmac_pic_lock, flags); } static void pmac_unmask_irq(unsigned int virq) @@ -178,19 +178,19 @@ static void pmac_unmask_irq(unsigned int virq) unsigned long flags; unsigned int src = irq_map[virq].hwirq; - spin_lock_irqsave(&pmac_pic_lock, flags); + 
raw_spin_lock_irqsave(&pmac_pic_lock, flags); __set_bit(src, ppc_cached_irq_mask); __pmac_set_irq_mask(src, 0); - spin_unlock_irqrestore(&pmac_pic_lock, flags); + raw_spin_unlock_irqrestore(&pmac_pic_lock, flags); } static int pmac_retrigger(unsigned int virq) { unsigned long flags; - spin_lock_irqsave(&pmac_pic_lock, flags); + raw_spin_lock_irqsave(&pmac_pic_lock, flags); __pmac_retrigger(irq_map[virq].hwirq); - spin_unlock_irqrestore(&pmac_pic_lock, flags); + raw_spin_unlock_irqrestore(&pmac_pic_lock, flags); return 1; } @@ -210,7 +210,7 @@ static irqreturn_t gatwick_action(int cpl, void *dev_id) int irq, bits; int rc = IRQ_NONE; - spin_lock_irqsave(&pmac_pic_lock, flags); + raw_spin_lock_irqsave(&pmac_pic_lock, flags); for (irq = max_irqs; (irq -= 32) >= max_real_irqs; ) { int i = irq >> 5; bits = in_le32(&pmac_irq_hw[i]->event) | ppc_lost_interrupts[i]; @@ -220,12 +220,12 @@ static irqreturn_t gatwick_action(int cpl, void *dev_id) if (bits == 0) continue; irq += __ilog2(bits); - spin_unlock_irqrestore(&pmac_pic_lock, flags); + raw_spin_unlock_irqrestore(&pmac_pic_lock, flags); generic_handle_irq(irq); - spin_lock_irqsave(&pmac_pic_lock, flags); + raw_spin_lock_irqsave(&pmac_pic_lock, flags); rc = IRQ_HANDLED; } - spin_unlock_irqrestore(&pmac_pic_lock, flags); + raw_spin_unlock_irqrestore(&pmac_pic_lock, flags); return rc; } @@ -244,7 +244,7 @@ static unsigned int pmac_pic_get_irq(void) return NO_IRQ_IGNORE; /* ignore, already handled */ } #endif /* CONFIG_SMP */ - spin_lock_irqsave(&pmac_pic_lock, flags); + raw_spin_lock_irqsave(&pmac_pic_lock, flags); for (irq = max_real_irqs; (irq -= 32) >= 0; ) { int i = irq >> 5; bits = in_le32(&pmac_irq_hw[i]->event) | ppc_lost_interrupts[i]; @@ -256,7 +256,7 @@ static unsigned int pmac_pic_get_irq(void) irq += __ilog2(bits); break; } - spin_unlock_irqrestore(&pmac_pic_lock, flags); + raw_spin_unlock_irqrestore(&pmac_pic_lock, flags); if (unlikely(irq < 0)) return NO_IRQ; return irq_linear_revmap(pmac_pic_host, irq); diff 
--git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index ccd8dd0..c35b484 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c @@ -100,7 +100,7 @@ int eeh_subsystem_enabled; EXPORT_SYMBOL(eeh_subsystem_enabled); /* Lock to avoid races due to multiple reports of an error */ -static DEFINE_SPINLOCK(confirm_error_lock); +static DEFINE_RAW_SPINLOCK(confirm_error_lock); /* Buffer for reporting slot-error-detail rtas calls. Its here * in BSS, and not dynamically alloced, so that it ends up in @@ -436,7 +436,7 @@ static void __eeh_clear_slot(struct device_node *parent, int mode_flag) void eeh_clear_slot (struct device_node *dn, int mode_flag) { unsigned long flags; - spin_lock_irqsave(&confirm_error_lock, flags); + raw_spin_lock_irqsave(&confirm_error_lock, flags); dn = find_device_pe (dn); @@ -447,7 +447,7 @@ void eeh_clear_slot (struct device_node *dn, int mode_flag) PCI_DN(dn)->eeh_mode &= ~mode_flag; PCI_DN(dn)->eeh_check_count = 0; __eeh_clear_slot(dn, mode_flag); - spin_unlock_irqrestore(&confirm_error_lock, flags); + raw_spin_unlock_irqrestore(&confirm_error_lock, flags); } /** @@ -506,7 +506,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) * in one slot might report errors simultaneously, and we * only want one error recovery routine running. */ - spin_lock_irqsave(&confirm_error_lock, flags); + raw_spin_lock_irqsave(&confirm_error_lock, flags); rc = 1; if (pdn->eeh_mode & EEH_MODE_ISOLATED) { pdn->eeh_check_count ++; @@ -575,7 +575,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) * with other functions on this device, and functions under * bridges. 
*/ eeh_mark_slot (dn, EEH_MODE_ISOLATED); - spin_unlock_irqrestore(&confirm_error_lock, flags); + raw_spin_unlock_irqrestore(&confirm_error_lock, flags); eeh_send_failure_event (dn, dev); @@ -586,7 +586,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) return 1; dn_unlock: - spin_unlock_irqrestore(&confirm_error_lock, flags); + raw_spin_unlock_irqrestore(&confirm_error_lock, flags); return rc; } @@ -1064,7 +1064,7 @@ void __init eeh_init(void) struct device_node *phb, *np; struct eeh_early_enable_info info; - spin_lock_init(&confirm_error_lock); + raw_spin_lock_init(&confirm_error_lock); spin_lock_init(&slot_errbuf_lock); np = of_find_node_by_path("/rtas"); diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 1a0000a..902987d 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -140,7 +140,7 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, return ret; } -static DEFINE_PER_CPU(u64 *, tce_page) = NULL; +static DEFINE_PER_CPU_LOCKED(u64 *, tce_page) = NULL; static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, @@ -154,13 +154,14 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long l, limit; long tcenum_start = tcenum, npages_start = npages; int ret = 0; + int cpu; if (npages == 1) { return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction, attrs); } - tcep = __get_cpu_var(tce_page); + tcep = get_cpu_var_locked(tce_page, &cpu); /* This is safe to do since interrupts are off when we're called * from iommu_alloc{,_sg}() @@ -169,10 +170,11 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, tcep = (u64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { + put_cpu_var_locked(tce_page, cpu); return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction, attrs); } 
- __get_cpu_var(tce_page) = tcep; + per_cpu_var_locked(tce_page, cpu) = tcep; } rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; @@ -216,6 +218,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, printk("\ttce[0] val = 0x%llx\n", tcep[0]); show_stack(current, (unsigned long *)__get_SP()); } + put_cpu_var_locked(tce_page, cpu); return ret; } diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c index 0a55db8..69ddff0 100644 --- a/arch/powerpc/sysdev/i8259.c +++ b/arch/powerpc/sysdev/i8259.c @@ -23,7 +23,7 @@ static unsigned char cached_8259[2] = { 0xff, 0xff }; #define cached_A1 (cached_8259[0]) #define cached_21 (cached_8259[1]) -static DEFINE_SPINLOCK(i8259_lock); +static DEFINE_RAW_SPINLOCK(i8259_lock); static struct irq_host *i8259_host; @@ -42,7 +42,7 @@ unsigned int i8259_irq(void) if (pci_intack) irq = readb(pci_intack); else { - spin_lock(&i8259_lock); + raw_spin_lock(&i8259_lock); lock = 1; /* Perform an interrupt acknowledge cycle on controller 1. 
*/ @@ -74,7 +74,7 @@ unsigned int i8259_irq(void) irq = NO_IRQ; if (lock) - spin_unlock(&i8259_lock); + raw_spin_unlock(&i8259_lock); return irq; } @@ -82,7 +82,7 @@ static void i8259_mask_and_ack_irq(unsigned int irq_nr) { unsigned long flags; - spin_lock_irqsave(&i8259_lock, flags); + raw_spin_lock_irqsave(&i8259_lock, flags); if (irq_nr > 7) { cached_A1 |= 1 << (irq_nr-8); inb(0xA1); /* DUMMY */ @@ -95,7 +95,7 @@ static void i8259_mask_and_ack_irq(unsigned int irq_nr) outb(cached_21, 0x21); outb(0x20, 0x20); /* Non-specific EOI */ } - spin_unlock_irqrestore(&i8259_lock, flags); + raw_spin_unlock_irqrestore(&i8259_lock, flags); } static void i8259_set_irq_mask(int irq_nr) @@ -110,13 +110,13 @@ static void i8259_mask_irq(unsigned int irq_nr) pr_debug("i8259_mask_irq(%d)\n", irq_nr); - spin_lock_irqsave(&i8259_lock, flags); + raw_spin_lock_irqsave(&i8259_lock, flags); if (irq_nr < 8) cached_21 |= 1 << irq_nr; else cached_A1 |= 1 << (irq_nr-8); i8259_set_irq_mask(irq_nr); - spin_unlock_irqrestore(&i8259_lock, flags); + raw_spin_unlock_irqrestore(&i8259_lock, flags); } static void i8259_unmask_irq(unsigned int irq_nr) @@ -125,13 +125,13 @@ static void i8259_unmask_irq(unsigned int irq_nr) pr_debug("i8259_unmask_irq(%d)\n", irq_nr); - spin_lock_irqsave(&i8259_lock, flags); + raw_spin_lock_irqsave(&i8259_lock, flags); if (irq_nr < 8) cached_21 &= ~(1 << irq_nr); else cached_A1 &= ~(1 << (irq_nr-8)); i8259_set_irq_mask(irq_nr); - spin_unlock_irqrestore(&i8259_lock, flags); + raw_spin_unlock_irqrestore(&i8259_lock, flags); } static struct irq_chip i8259_pic = { @@ -241,7 +241,7 @@ void i8259_init(struct device_node *node, unsigned long intack_addr) unsigned long flags; /* initialize the controller */ - spin_lock_irqsave(&i8259_lock, flags); + raw_spin_lock_irqsave(&i8259_lock, flags); /* Mask all first */ outb(0xff, 0xA1); @@ -273,7 +273,7 @@ void i8259_init(struct device_node *node, unsigned long intack_addr) outb(cached_A1, 0xA1); outb(cached_21, 0x21); - 
spin_unlock_irqrestore(&i8259_lock, flags); + raw_spin_unlock_irqrestore(&i8259_lock, flags); /* create a legacy host */ i8259_host = irq_alloc_host(node, IRQ_HOST_MAP_LEGACY, diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 28cdddd..970f3ba 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -32,7 +32,7 @@ static struct ipic * primary_ipic; static struct irq_chip ipic_level_irq_chip, ipic_edge_irq_chip; -static DEFINE_SPINLOCK(ipic_lock); +static DEFINE_RAW_SPINLOCK(ipic_lock); static struct ipic_info ipic_info[] = { [1] = { @@ -530,13 +530,13 @@ static void ipic_unmask_irq(unsigned int virq) unsigned long flags; u32 temp; - spin_lock_irqsave(&ipic_lock, flags); + raw_spin_lock_irqsave(&ipic_lock, flags); temp = ipic_read(ipic->regs, ipic_info[src].mask); temp |= (1 << (31 - ipic_info[src].bit)); ipic_write(ipic->regs, ipic_info[src].mask, temp); - spin_unlock_irqrestore(&ipic_lock, flags); + raw_spin_unlock_irqrestore(&ipic_lock, flags); } static void ipic_mask_irq(unsigned int virq) @@ -546,7 +546,7 @@ static void ipic_mask_irq(unsigned int virq) unsigned long flags; u32 temp; - spin_lock_irqsave(&ipic_lock, flags); + raw_spin_lock_irqsave(&ipic_lock, flags); temp = ipic_read(ipic->regs, ipic_info[src].mask); temp &= ~(1 << (31 - ipic_info[src].bit)); @@ -556,7 +556,7 @@ static void ipic_mask_irq(unsigned int virq) * for nearly all cases. */ mb(); - spin_unlock_irqrestore(&ipic_lock, flags); + raw_spin_unlock_irqrestore(&ipic_lock, flags); } static void ipic_ack_irq(unsigned int virq) @@ -566,7 +566,7 @@ static void ipic_ack_irq(unsigned int virq) unsigned long flags; u32 temp; - spin_lock_irqsave(&ipic_lock, flags); + raw_spin_lock_irqsave(&ipic_lock, flags); temp = 1 << (31 - ipic_info[src].bit); ipic_write(ipic->regs, ipic_info[src].ack, temp); @@ -575,7 +575,7 @@ static void ipic_ack_irq(unsigned int virq) * for nearly all cases. 
*/ mb(); - spin_unlock_irqrestore(&ipic_lock, flags); + raw_spin_unlock_irqrestore(&ipic_lock, flags); } static void ipic_mask_irq_and_ack(unsigned int virq) @@ -585,7 +585,7 @@ static void ipic_mask_irq_and_ack(unsigned int virq) unsigned long flags; u32 temp; - spin_lock_irqsave(&ipic_lock, flags); + raw_spin_lock_irqsave(&ipic_lock, flags); temp = ipic_read(ipic->regs, ipic_info[src].mask); temp &= ~(1 << (31 - ipic_info[src].bit)); @@ -598,7 +598,7 @@ static void ipic_mask_irq_and_ack(unsigned int virq) * for nearly all cases. */ mb(); - spin_unlock_irqrestore(&ipic_lock, flags); + raw_spin_unlock_irqrestore(&ipic_lock, flags); } static int ipic_set_irq_type(unsigned int virq, unsigned int flow_type) diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 470dc6c..cb5fe3f 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -46,7 +46,7 @@ static struct mpic *mpics; static struct mpic *mpic_primary; -static DEFINE_SPINLOCK(mpic_lock); +static DEFINE_RAW_SPINLOCK(mpic_lock); #ifdef CONFIG_PPC32 /* XXX for now */ #ifdef CONFIG_IRQ_ALL_CPUS @@ -347,10 +347,10 @@ static inline void mpic_ht_end_irq(struct mpic *mpic, unsigned int source) unsigned int mask = 1U << (fixup->index & 0x1f); writel(mask, fixup->applebase + soff); } else { - spin_lock(&mpic->fixup_lock); + raw_spin_lock(&mpic->fixup_lock); writeb(0x11 + 2 * fixup->index, fixup->base + 2); writel(fixup->data, fixup->base + 4); - spin_unlock(&mpic->fixup_lock); + raw_spin_unlock(&mpic->fixup_lock); } } @@ -366,7 +366,7 @@ static void mpic_startup_ht_interrupt(struct mpic *mpic, unsigned int source, DBG("startup_ht_interrupt(0x%x, 0x%x) index: %d\n", source, irqflags, fixup->index); - spin_lock_irqsave(&mpic->fixup_lock, flags); + raw_spin_lock_irqsave(&mpic->fixup_lock, flags); /* Enable and configure */ writeb(0x10 + 2 * fixup->index, fixup->base + 2); tmp = readl(fixup->base + 4); @@ -374,7 +374,7 @@ static void mpic_startup_ht_interrupt(struct mpic *mpic, unsigned 
int source, if (irqflags & IRQ_LEVEL) tmp |= 0x22; writel(tmp, fixup->base + 4); - spin_unlock_irqrestore(&mpic->fixup_lock, flags); + raw_spin_unlock_irqrestore(&mpic->fixup_lock, flags); #ifdef CONFIG_PM /* use the lowest bit inverted to the actual HW, @@ -396,12 +396,12 @@ static void mpic_shutdown_ht_interrupt(struct mpic *mpic, unsigned int source, DBG("shutdown_ht_interrupt(0x%x, 0x%x)\n", source, irqflags); /* Disable */ - spin_lock_irqsave(&mpic->fixup_lock, flags); + raw_spin_lock_irqsave(&mpic->fixup_lock, flags); writeb(0x10 + 2 * fixup->index, fixup->base + 2); tmp = readl(fixup->base + 4); tmp |= 1; writel(tmp, fixup->base + 4); - spin_unlock_irqrestore(&mpic->fixup_lock, flags); + raw_spin_unlock_irqrestore(&mpic->fixup_lock, flags); #ifdef CONFIG_PM /* use the lowest bit inverted to the actual HW, @@ -515,7 +515,7 @@ static void __init mpic_scan_ht_pics(struct mpic *mpic) BUG_ON(mpic->fixups == NULL); /* Init spinlock */ - spin_lock_init(&mpic->fixup_lock); + raw_spin_lock_init(&mpic->fixup_lock); /* Map U3 config space. We assume all IO-APICs are on the primary bus * so we only need to map 64kB. @@ -573,12 +573,12 @@ static int irq_choose_cpu(const cpumask_t *mask) if (cpumask_equal(mask, cpu_all_mask)) { static int irq_rover; - static DEFINE_SPINLOCK(irq_rover_lock); + static DEFINE_RAW_SPINLOCK(irq_rover_lock); unsigned long flags; /* Round-robin distribution... 
*/ do_round_robin: - spin_lock_irqsave(&irq_rover_lock, flags); + raw_spin_lock_irqsave(&irq_rover_lock, flags); while (!cpu_online(irq_rover)) { if (++irq_rover >= NR_CPUS) @@ -590,7 +590,7 @@ static int irq_choose_cpu(const cpumask_t *mask) irq_rover = 0; } while (!cpu_online(irq_rover)); - spin_unlock_irqrestore(&irq_rover_lock, flags); + raw_spin_unlock_irqrestore(&irq_rover_lock, flags); } else { cpuid = cpumask_first_and(mask, cpu_online_mask); if (cpuid >= nr_cpu_ids) @@ -1368,14 +1368,14 @@ void __init mpic_set_serial_int(struct mpic *mpic, int enable) unsigned long flags; u32 v; - spin_lock_irqsave(&mpic_lock, flags); + raw_spin_lock_irqsave(&mpic_lock, flags); v = mpic_read(mpic->gregs, MPIC_GREG_GLOBAL_CONF_1); if (enable) v |= MPIC_GREG_GLOBAL_CONF_1_SIE; else v &= ~MPIC_GREG_GLOBAL_CONF_1_SIE; mpic_write(mpic->gregs, MPIC_GREG_GLOBAL_CONF_1, v); - spin_unlock_irqrestore(&mpic_lock, flags); + raw_spin_unlock_irqrestore(&mpic_lock, flags); } void mpic_irq_set_priority(unsigned int irq, unsigned int pri) @@ -1388,7 +1388,7 @@ void mpic_irq_set_priority(unsigned int irq, unsigned int pri) if (!mpic) return; - spin_lock_irqsave(&mpic_lock, flags); + raw_spin_lock_irqsave(&mpic_lock, flags); if (mpic_is_ipi(mpic, irq)) { reg = mpic_ipi_read(src - mpic->ipi_vecs[0]) & ~MPIC_VECPRI_PRIORITY_MASK; @@ -1400,7 +1400,7 @@ void mpic_irq_set_priority(unsigned int irq, unsigned int pri) mpic_irq_write(src, MPIC_INFO(IRQ_VECTOR_PRI), reg | (pri << MPIC_VECPRI_PRIORITY_SHIFT)); } - spin_unlock_irqrestore(&mpic_lock, flags); + raw_spin_unlock_irqrestore(&mpic_lock, flags); } void mpic_setup_this_cpu(void) @@ -1415,7 +1415,7 @@ void mpic_setup_this_cpu(void) DBG("%s: setup_this_cpu(%d)\n", mpic->name, hard_smp_processor_id()); - spin_lock_irqsave(&mpic_lock, flags); + raw_spin_lock_irqsave(&mpic_lock, flags); /* let the mpic know we want intrs. default affinity is 0xffffffff * until changed via /proc. That's how it's done on x86. 
If we want @@ -1431,7 +1431,7 @@ void mpic_setup_this_cpu(void) /* Set current processor priority to 0 */ mpic_cpu_write(MPIC_INFO(CPU_CURRENT_TASK_PRI), 0); - spin_unlock_irqrestore(&mpic_lock, flags); + raw_spin_unlock_irqrestore(&mpic_lock, flags); #endif /* CONFIG_SMP */ } @@ -1460,7 +1460,7 @@ void mpic_teardown_this_cpu(int secondary) BUG_ON(mpic == NULL); DBG("%s: teardown_this_cpu(%d)\n", mpic->name, hard_smp_processor_id()); - spin_lock_irqsave(&mpic_lock, flags); + raw_spin_lock_irqsave(&mpic_lock, flags); /* let the mpic know we don't want intrs. */ for (i = 0; i < mpic->num_sources ; i++) @@ -1474,7 +1474,7 @@ void mpic_teardown_this_cpu(int secondary) */ mpic_eoi(mpic); - spin_unlock_irqrestore(&mpic_lock, flags); + raw_spin_unlock_irqrestore(&mpic_lock, flags); } diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 4e6152c..dcf8888 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -358,6 +358,7 @@ static int xmon_core(struct pt_regs *regs, int fromipi) unsigned long timeout; #endif + preempt_disable(); local_irq_save(flags); bp = in_breakpoint_table(regs->nip, &offset); @@ -543,6 +544,7 @@ static int xmon_core(struct pt_regs *regs, int fromipi) insert_cpu_bpts(); local_irq_restore(flags); + preempt_enable(); return cmd != 'X' && cmd != EOF; } diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h index 9d2a179..e70d6dd 100644 --- a/arch/s390/include/asm/rwsem.h +++ b/arch/s390/include/asm/rwsem.h @@ -48,16 +48,21 @@ struct rwsem_waiter; -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_downgrade_write(struct rw_semaphore *); +extern struct rw_anon_semaphore * +rwsem_down_read_failed(struct rw_anon_semaphore *); +extern 
struct rw_anon_semaphore * +rwsem_down_write_failed(struct rw_anon_semaphore *); +extern struct rw_anon_semaphore * +rwsem_wake(struct rw_anon_semaphore *); +extern struct rw_anon_semaphore * +rwsem_downgrade_wake(struct rw_anon_semaphore *); +extern struct rw_anon_semaphore * +rwsem_downgrade_write(struct rw_anon_semaphore *); /* * the semaphore definition */ -struct rw_semaphore { +struct rw_anon_semaphore { signed long count; spinlock_t wait_lock; struct list_head wait_list; @@ -85,40 +90,40 @@ struct rw_semaphore { */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } #else -# define __RWSEM_DEP_MAP_INIT(lockname) +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) #endif -#define __RWSEM_INITIALIZER(name) \ +#define __RWSEM_ANON_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait.lock), \ - LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } + LIST_HEAD_INIT((name).wait_list) __RWSEM_ANON_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_ANON_INITIALIZER(name) -static inline void init_rwsem(struct rw_semaphore *sem) +static inline void init_anon_rwsem(struct rw_anon_semaphore *sem) { sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); +extern void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key); -#define init_rwsem(sem) \ +#define init_anon_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __init_anon_rwsem((sem), #sem, &__key); \ } while (0) /* * lock for reading */ -static inline void __down_read(struct rw_semaphore 
*sem) +static inline void __down_read(struct rw_anon_semaphore *sem) { signed long old, new; @@ -146,7 +151,7 @@ static inline void __down_read(struct rw_semaphore *sem) /* * trylock for reading -- returns 1 if successful, 0 if contention */ -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct rw_anon_semaphore *sem) { signed long old, new; @@ -177,7 +182,8 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void +__down_write_nested(struct rw_anon_semaphore *sem, int subclass) { signed long old, new, tmp; @@ -203,7 +209,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_down_write_failed(sem); } -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write(struct rw_anon_semaphore *sem) { __down_write_nested(sem, 0); } @@ -211,7 +217,7 @@ static inline void __down_write(struct rw_semaphore *sem) /* * trylock for writing -- returns 1 if successful, 0 if contention */ -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct rw_anon_semaphore *sem) { signed long old; @@ -239,7 +245,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct rw_anon_semaphore *sem) { signed long old, new; @@ -269,7 +275,7 @@ static inline void __up_read(struct rw_semaphore *sem) /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct rw_anon_semaphore *sem) { signed long old, new, tmp; @@ -299,7 +305,7 @@ static inline void __up_write(struct rw_semaphore *sem) /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static 
inline void __downgrade_write(struct rw_anon_semaphore *sem) { signed long old, new, tmp; @@ -328,7 +334,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(long delta, struct rw_anon_semaphore *sem) { signed long old, new; @@ -354,7 +360,8 @@ static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem) /* * implement exchange and add functionality */ -static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) +static inline long +rwsem_atomic_update(long delta, struct rw_anon_semaphore *sem) { signed long old, new; @@ -378,10 +385,52 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) return new; } -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int rwsem_is_locked(struct rw_anon_semaphore *sem) { return (sem->count != 0); } +struct rw_semaphore { + signed long count; + spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +#else +# define __RWSEM_DEP_MAP_INIT(lockname) +#endif + +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ + LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +static inline void init_rwsem(struct rw_semaphore *sem) +{ + sem->count = RWSEM_UNLOCKED_VALUE; + spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} + +static inline void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + __init_anon_rwsem((struct rw_anon_semaphore *)sem, name, key); +} + +#define init_rwsem(sem) \ +do { \ + static
struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ +} while (0) + #endif /* __KERNEL__ */ #endif /* _S390_RWSEM_H */ diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h index 06e2251..d721bd8 100644 --- a/arch/sh/include/asm/rwsem.h +++ b/arch/sh/include/asm/rwsem.h @@ -19,7 +19,7 @@ /* * the semaphore definition */ -struct rw_semaphore { +struct rw_anon_semaphore { long count; #define RWSEM_UNLOCKED_VALUE 0x00000000 #define RWSEM_ACTIVE_BIAS 0x00000001 @@ -35,35 +35,38 @@ struct rw_semaphore { }; #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } #else -# define __RWSEM_DEP_MAP_INIT(lockname) +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) #endif -#define __RWSEM_INITIALIZER(name) \ - { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ +#define __RWSEM_ANON_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEP_MAP_INIT(name) } + __RWSEM_ANON_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_ANON_INITIALIZER(name) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_read_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_write_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore *rwsem_wake(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_downgrade_wake(struct rw_anon_semaphore *sem); -extern void 
__init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); +extern void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key); -#define init_rwsem(sem) \ +#define init_anon_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __init_anon_rwsem((sem), #sem, &__key); \ } while (0) -static inline void init_rwsem(struct rw_semaphore *sem) +static inline void init_anon_rwsem(struct rw_anon_semaphore *sem) { sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); @@ -73,7 +76,7 @@ static inline void init_rwsem(struct rw_semaphore *sem) /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct rw_anon_semaphore *sem) { if (atomic_inc_return((atomic_t *)(&sem->count)) > 0) smp_wmb(); @@ -81,7 +84,7 @@ static inline void __down_read(struct rw_semaphore *sem) rwsem_down_read_failed(sem); } -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct rw_anon_semaphore *sem) { int tmp; @@ -98,7 +101,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write(struct rw_anon_semaphore *sem) { int tmp; @@ -110,7 +113,7 @@ static inline void __down_write(struct rw_semaphore *sem) rwsem_down_write_failed(sem); } -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct rw_anon_semaphore *sem) { int tmp; @@ -123,7 +126,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct rw_anon_semaphore *sem) { int tmp; @@ -136,7 +139,7 @@ static inline void __up_read(struct rw_semaphore *sem) /* * unlock after writing */ -static inline void 
__up_write(struct rw_semaphore *sem) +static inline void __up_write(struct rw_anon_semaphore *sem) { smp_wmb(); if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, @@ -147,7 +150,7 @@ static inline void __up_write(struct rw_semaphore *sem) /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct rw_anon_semaphore *sem) { atomic_add(delta, (atomic_t *)(&sem->count)); } @@ -155,7 +158,7 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct rw_anon_semaphore *sem) { int tmp; @@ -165,7 +168,8 @@ static inline void __downgrade_write(struct rw_semaphore *sem) rwsem_downgrade_wake(sem); } -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void + __down_write_nested(struct rw_anon_semaphore *sem, int subclass) { __down_write(sem); } @@ -173,12 +177,60 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct rw_anon_semaphore *sem) { smp_mb(); return atomic_add_return(delta, (atomic_t *)(&sem->count)); } +static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) +{ + return (sem->count != 0); +} + +struct rw_semaphore { + long count; + spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +#else +# define __RWSEM_DEP_MAP_INIT(lockname) +#endif + +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ + 
LIST_HEAD_INIT((name).wait_list) \ + __RWSEM_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +static inline void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + __init_anon_rwsem((struct rw_anon_semaphore *)sem, name, key); +} + +#define init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ +} while (0) + +static inline void init_rwsem(struct rw_semaphore *sem) +{ + sem->count = RWSEM_UNLOCKED_VALUE; + spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} + static inline int rwsem_is_locked(struct rw_semaphore *sem) { return (sem->count != 0); diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c index 873ebdc..b063eb8 100644 --- a/arch/sh/kernel/ptrace_64.c +++ b/arch/sh/kernel/ptrace_64.c @@ -133,6 +133,8 @@ void user_enable_single_step(struct task_struct *child) struct pt_regs *regs = child->thread.uregs; regs->sr |= SR_SSTEP; /* auto-resetting upon exception */ + + set_tsk_thread_flag(child, TIF_SINGLESTEP); } void user_disable_single_step(struct task_struct *child) @@ -140,6 +142,8 @@ void user_disable_single_step(struct task_struct *child) struct pt_regs *regs = child->thread.uregs; regs->sr &= ~SR_SSTEP; + + clear_tsk_thread_flag(child, TIF_SINGLESTEP); } static int genregs_get(struct task_struct *target, @@ -454,6 +458,8 @@ asmlinkage long long do_syscall_trace_enter(struct pt_regs *regs) asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { + int step; + if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->regs[9]), regs->regs[9]); @@ -461,8 +467,9 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->regs[9]); - if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, 0); + step = test_thread_flag(TIF_SINGLESTEP); + if 
(step || test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(regs, step); } /* Called with interrupts disabled */ diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index ce76dbd..580e97d 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -118,7 +118,9 @@ static int do_signal(struct pt_regs *regs, sigset_t *oldset) * clear the TS_RESTORE_SIGMASK flag. */ current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - tracehook_signal_handler(signr, &info, &ka, regs, 0); + + tracehook_signal_handler(signr, &info, &ka, regs, + test_thread_flag(TIF_SINGLESTEP)); return 1; } } diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h index 6e56210..b573e97 100644 --- a/arch/sparc/include/asm/rwsem.h +++ b/arch/sparc/include/asm/rwsem.h @@ -19,7 +19,7 @@ struct rwsem_waiter; -struct rw_semaphore { +struct rw_anon_semaphore { signed int count; spinlock_t wait_lock; struct list_head wait_list; @@ -29,51 +29,92 @@ struct rw_semaphore { }; #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } #else -# define __RWSEM_DEP_MAP_INIT(lockname) +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) #endif -#define __RWSEM_INITIALIZER(name) \ -{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } +#define __RWSEM_ANON_INITIALIZER(name) \ +{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) \ + __RWSEM_ANON_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_ANON_INITIALIZER(name) -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); +extern void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, +
struct lock_class_key *key); -#define init_rwsem(sem) \ +#define init_anon_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __init_anon_rwsem((sem), #sem, &__key); \ } while (0) -extern void __down_read(struct rw_semaphore *sem); -extern int __down_read_trylock(struct rw_semaphore *sem); -extern void __down_write(struct rw_semaphore *sem); -extern int __down_write_trylock(struct rw_semaphore *sem); -extern void __up_read(struct rw_semaphore *sem); -extern void __up_write(struct rw_semaphore *sem); -extern void __downgrade_write(struct rw_semaphore *sem); +extern void __down_read(struct rw_anon_semaphore *sem); +extern int __down_read_trylock(struct rw_anon_semaphore *sem); +extern void __down_write(struct rw_anon_semaphore *sem); +extern int __down_write_trylock(struct rw_anon_semaphore *sem); +extern void __up_read(struct rw_anon_semaphore *sem); +extern void __up_write(struct rw_anon_semaphore *sem); +extern void __downgrade_write(struct rw_anon_semaphore *sem); -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void +__down_write_nested(struct rw_anon_semaphore *sem, int subclass) { __down_write(sem); } -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct rw_anon_semaphore *sem) { return atomic_add_return(delta, (atomic_t *)(&sem->count)); } -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct rw_anon_semaphore *sem) { atomic_add(delta, (atomic_t *)(&sem->count)); } +static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) +{ + return (sem->count != 0); +} + +struct rw_semaphore { + signed int count; + spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_DEP_MAP_INIT(lockname) , 
.dep_map = { .name = #lockname } +#else +# define __RWSEM_DEP_MAP_INIT(lockname) +#endif + +#define __RWSEM_INITIALIZER(name) \ +{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ + LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +static inline void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + __init_anon_rwsem((struct rw_anon_semaphore *)sem, name, key); +} + +#define init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ +} while (0) + static inline int rwsem_is_locked(struct rw_semaphore *sem) { return (sem->count != 0); diff --git a/arch/sparc/include/asm/stat.h b/arch/sparc/include/asm/stat.h index 55db5ec..39327d6 100644 --- a/arch/sparc/include/asm/stat.h +++ b/arch/sparc/include/asm/stat.h @@ -53,8 +53,8 @@ struct stat { ino_t st_ino; mode_t st_mode; short st_nlink; - uid_t st_uid; - gid_t st_gid; + uid16_t st_uid; + gid16_t st_gid; unsigned short st_rdev; off_t st_size; time_t st_atime; diff --git a/arch/sparc/kernel/kstack.h b/arch/sparc/kernel/kstack.h index 4248d96..5247283 100644 --- a/arch/sparc/kernel/kstack.h +++ b/arch/sparc/kernel/kstack.h @@ -11,6 +11,10 @@ static inline bool kstack_valid(struct thread_info *tp, unsigned long sp) { unsigned long base = (unsigned long) tp; + /* Stack pointer must be 16-byte aligned. 
*/ + if (sp & (16UL - 1)) + return false; + if (sp >= (base + sizeof(struct thread_info)) && sp <= (base + THREAD_SIZE - sizeof(struct sparc_stackf))) return true; diff --git a/arch/sparc/kernel/of_device_32.c b/arch/sparc/kernel/of_device_32.c index 4c26eb5..53a58b3 100644 --- a/arch/sparc/kernel/of_device_32.c +++ b/arch/sparc/kernel/of_device_32.c @@ -105,7 +105,7 @@ static unsigned long of_bus_sbus_get_flags(const u32 *addr, unsigned long flags) static int of_bus_ambapp_match(struct device_node *np) { - return !strcmp(np->name, "ambapp"); + return !strcmp(np->type, "ambapp"); } static void of_bus_ambapp_count_cells(struct device_node *child, diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index 539e83f..592b03d 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -247,6 +247,7 @@ static struct pci_dev *of_create_pci_dev(struct pci_pbm_info *pbm, struct pci_bus *bus, int devfn) { struct dev_archdata *sd; + struct pci_slot *slot; struct of_device *op; struct pci_dev *dev; const char *type; @@ -286,6 +287,11 @@ static struct pci_dev *of_create_pci_dev(struct pci_pbm_info *pbm, dev->dev.bus = &pci_bus_type; dev->devfn = devfn; dev->multifunction = 0; /* maybe a lie? 
*/ + set_pcie_port_type(dev); + + list_for_each_entry(slot, &dev->bus->slots, list) + if (PCI_SLOT(dev->devfn) == slot->number) + dev->slot = slot; dev->vendor = of_getintprop_default(node, "vendor-id", 0xffff); dev->device = of_getintprop_default(node, "device-id", 0xffff); @@ -322,6 +328,7 @@ static struct pci_dev *of_create_pci_dev(struct pci_pbm_info *pbm, dev->current_state = 4; /* unknown power state */ dev->error_state = pci_channel_io_normal; + dev->dma_mask = 0xffffffff; if (!strcmp(node->name, "pci")) { /* a PCI-PCI bridge */ diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c index 4e2724e..2e321d7 100644 --- a/arch/sparc/kernel/pcic.c +++ b/arch/sparc/kernel/pcic.c @@ -702,10 +702,10 @@ static void pcic_clear_clock_irq(void) static irqreturn_t pcic_timer_handler (int irq, void *h) { - write_seqlock(&xtime_lock); /* Dummy, to show that we remember */ + write_raw_seqlock(&xtime_lock); /* Dummy, to show that we remember */ pcic_clear_clock_irq(); do_timer(1); - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); #endif diff --git a/arch/sparc/kernel/prom.h b/arch/sparc/kernel/prom.h index 453397f..0f14b12 100644 --- a/arch/sparc/kernel/prom.h +++ b/arch/sparc/kernel/prom.h @@ -5,7 +5,7 @@ #include <asm/prom.h> extern struct device_node *allnodes; /* temporary while merging */ -extern rwlock_t devtree_lock; /* temporary while merging */ +extern raw_spinlock_t devtree_lock; /* temporary while merging */ extern void * prom_early_alloc(unsigned long size); extern void irq_trans_init(struct device_node *dp); diff --git a/arch/sparc/kernel/prom_common.c b/arch/sparc/kernel/prom_common.c index d80a65d..a5b1b81 100644 --- a/arch/sparc/kernel/prom_common.c +++ b/arch/sparc/kernel/prom_common.c @@ -80,7 +80,7 @@ int of_set_property(struct device_node *dp, const char *name, void *val, int len err = -ENODEV; mutex_lock(&of_set_property_mutex); - write_lock(&devtree_lock); + 
raw_spin_lock(&devtree_lock); prevp = &dp->properties; while (*prevp) { struct property *prop = *prevp; @@ -107,7 +107,7 @@ int of_set_property(struct device_node *dp, const char *name, void *val, int len } prevp = &(*prevp)->next; } - write_unlock(&devtree_lock); + raw_spin_unlock(&devtree_lock); mutex_unlock(&of_set_property_mutex); /* XXX Upate procfs if necessary... */ diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c index 0d4c09b..e2ca52f 100644 --- a/arch/sparc/kernel/time_32.c +++ b/arch/sparc/kernel/time_32.c @@ -95,7 +95,7 @@ static irqreturn_t timer_interrupt(int dummy, void *dev_id) #endif /* Protect counter clear so that do_gettimeoffset works */ - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); clear_clock_irq(); @@ -111,7 +111,7 @@ static irqreturn_t timer_interrupt(int dummy, void *dev_id) else last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */ } - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); diff --git a/arch/sparc/kernel/tsb.S b/arch/sparc/kernel/tsb.S index 8c91d9b..db15d12 100644 --- a/arch/sparc/kernel/tsb.S +++ b/arch/sparc/kernel/tsb.S @@ -191,10 +191,12 @@ tsb_dtlb_load: tsb_itlb_load: /* Executable bit must be set. 
*/ -661: andcc %g5, _PAGE_EXEC_4U, %g0 - .section .sun4v_1insn_patch, "ax" +661: sethi %hi(_PAGE_EXEC_4U), %g4 + andcc %g5, %g4, %g0 + .section .sun4v_2insn_patch, "ax" .word 661b andcc %g5, _PAGE_EXEC_4V, %g0 + nop .previous be,pn %xcc, tsb_do_fault diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index 7916feb..a9f414c 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -34,7 +34,7 @@ void *kmap_atomic(struct page *page, enum km_type type) unsigned long idx; unsigned long vaddr; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -73,6 +73,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type) if (vaddr < FIXADDR_START) { // FIXME pagefault_enable(); + preempt_enable(); return; } @@ -99,6 +100,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type) #endif pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(kunmap_atomic); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index eb40925..f50f8b0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -124,10 +124,18 @@ config ARCH_MAY_HAVE_PC_FDC def_bool y config RWSEM_GENERIC_SPINLOCK - def_bool !X86_XADD + bool + depends on !X86_XADD || PREEMPT_RT + default y + +config ASM_SEMAPHORES + bool + default y config RWSEM_XCHGADD_ALGORITHM - def_bool X86_XADD + bool + depends on X86_XADD && !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT + default y config ARCH_HAS_CPU_IDLE_WAIT def_bool y @@ -281,6 +289,7 @@ config X86_X2APIC config SPARSE_IRQ bool "Support sparse irq numbering" depends on PCI_MSI || HT_IRQ + depends on !PREEMPT_RT ---help--- This enables support for sparse irqs. 
This is useful for distro kernels that want to define a high CONFIG_NR_CPUS value but still @@ -712,7 +721,7 @@ config IOMMU_API config MAXSMP bool "Configure Maximum number of SMP Processors and NUMA Nodes" - depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL + depends on 0 && X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL select CPUMASK_OFFSTACK default n ---help--- @@ -1898,7 +1907,7 @@ config PCI_MMCONFIG config DMAR bool "Support for DMA Remapping Devices (EXPERIMENTAL)" - depends on PCI_MSI && ACPI && EXPERIMENTAL + depends on PCI_MSI && ACPI && EXPERIMENTAL && !PREEMPT_RT help DMA remapping (DMAR) devices support enables independent address translations for Direct Memory Access (DMA) from devices. @@ -1941,6 +1950,7 @@ config DMAR_FLOPPY_WA config INTR_REMAP bool "Support for Interrupt Remapping (EXPERIMENTAL)" depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL + depends on !PREEMPT_RT ---help--- Supports Interrupt remapping for IO-APIC and MSI devices. To use x2apic mode in the CPU's which support x2APIC enhancements or diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index bc01e3e..93dc8cc 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -76,6 +76,7 @@ config DEBUG_PER_CPU_MAPS bool "Debug access to per_cpu maps" depends on DEBUG_KERNEL depends on SMP + depends on !PREEMPT_RT default n ---help--- Say Y to verify that the per_cpu map being accessed has @@ -126,6 +127,7 @@ config DEBUG_NX_TEST config 4KSTACKS bool "Use 4Kb for kernel stacks instead of 8Kb" depends on X86_32 + default y ---help--- If you say Y here the kernel will use a 4Kb stacksize for the kernel stack attached to each process/thread. 
This facilitates diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 56f462c..ead7e0b 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -50,8 +50,8 @@ #define ACPI_ASM_MACROS #define BREAKPOINT3 -#define ACPI_DISABLE_IRQS() local_irq_disable() -#define ACPI_ENABLE_IRQS() local_irq_enable() +#define ACPI_DISABLE_IRQS() local_irq_disable_nort() +#define ACPI_ENABLE_IRQS() local_irq_enable_nort() #define ACPI_FLUSH_CPU_CACHE() wbinvd() int __acpi_acquire_global_lock(unsigned int *lock); diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index dc5a667..166704f 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -186,10 +186,10 @@ static inline int atomic_add_return(int i, atomic_t *v) #ifdef CONFIG_M386 no_xadd: /* Legacy 386 processor */ - local_irq_save(flags); + raw_local_irq_save(flags); __i = atomic_read(v); atomic_set(v, i + __i); - local_irq_restore(flags); + raw_local_irq_restore(flags); return i + __i; #endif } diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1994d3f..f2ad216 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -170,10 +170,7 @@ static inline void elf_common_init(struct thread_struct *t, } #define ELF_PLAT_INIT(_r, load_addr) \ -do { \ - elf_common_init(¤t->thread, _r, 0); \ - clear_thread_flag(TIF_IA32); \ -} while (0) + elf_common_init(¤t->thread, _r, 0) #define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ elf_common_init(¤t->thread, regs, __USER_DS) diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 014c2b8..433ae1f 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -58,16 +58,22 @@ extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); void *kmap(struct page *page); +extern void kunmap_virt(void *ptr); +extern struct page *kmap_to_page(void *ptr); void kunmap(struct page *page); -void 
*kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); -void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic(void *kvaddr, enum km_type type); -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); + +void *__kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); +void *__kmap_atomic(struct page *page, enum km_type type); +void *__kmap_atomic_direct(struct page *page, enum km_type type); +void __kunmap_atomic(void *kvaddr, enum km_type type); +void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type); +struct page *__kmap_atomic_to_page(void *ptr); + void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); -struct page *kmap_atomic_to_page(void *ptr); #ifndef CONFIG_PARAVIRT -#define kmap_atomic_pte(page, type) kmap_atomic(page, type) +#define kmap_atomic_pte(page, type) kmap_atomic(page, type) +#define kmap_atomic_pte_direct(page, type) kmap_atomic_direct(page, type) #endif #define flush_cache_kmaps() do { } while (0) @@ -75,6 +81,27 @@ struct page *kmap_atomic_to_page(void *ptr); extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn); +/* + * on PREEMPT_RT kmap_atomic() is a wrapper that uses kmap(): + */ +#ifdef CONFIG_PREEMPT_RT +# define kmap_atomic_prot(page, type, prot) ({ pagefault_disable(); kmap(page); }) +# define kmap_atomic(page, type) ({ pagefault_disable(); kmap(page); }) +# define kmap_atomic_pfn(pfn, type) kmap(pfn_to_page(pfn)) +# define kunmap_atomic(kvaddr, type) do { pagefault_enable(); kunmap_virt(kvaddr); } while(0) +# define kmap_atomic_to_page(kvaddr) kmap_to_page(kvaddr) +# define kmap_atomic_direct(page, type) __kmap_atomic_direct(page, type) +# define kunmap_atomic_direct(kvaddr, type) __kunmap_atomic(kvaddr, type) +#else +# define kmap_atomic_prot(page, type, prot) __kmap_atomic_prot(page, type, prot) +# define kmap_atomic(page, type) __kmap_atomic(page, type) +# define kmap_atomic_pfn(pfn, type) 
__kmap_atomic_pfn(pfn, type) +# define kunmap_atomic(kvaddr, type) __kunmap_atomic(kvaddr, type) +# define kmap_atomic_to_page(kvaddr) __kmap_atomic_to_page(kvaddr) +# define kmap_atomic_direct(page, type) __kmap_atomic(page, type) +# define kunmap_atomic_direct(kvaddr, type) __kunmap_atomic(kvaddr, type) +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_HIGHMEM_H */ diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h index 1edbf89..fc1f579 100644 --- a/arch/x86/include/asm/i8253.h +++ b/arch/x86/include/asm/i8253.h @@ -6,7 +6,7 @@ #define PIT_CH0 0x40 #define PIT_CH2 0x42 -extern spinlock_t i8253_lock; +extern raw_spinlock_t i8253_lock; extern struct clock_event_device *global_clock_event; diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 58d7091..7ec65b1 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -24,7 +24,7 @@ extern unsigned int cached_irq_mask; #define SLAVE_ICW4_DEFAULT 0x01 #define PIC_ICW4_AEOI 2 -extern spinlock_t i8259A_lock; +extern raw_spinlock_t i8259A_lock; extern void init_8259A(int auto_eoi); extern void enable_8259A_irq(unsigned int irq); diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 7639dbf..0ec050a 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -14,12 +14,21 @@ #define IRQ_STACK_ORDER 2 #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define STACKFAULT_STACK 1 -#define DOUBLEFAULT_STACK 2 -#define NMI_STACK 3 -#define DEBUG_STACK 4 -#define MCE_STACK 5 -#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#ifdef CONFIG_PREEMPT_RT +# define STACKFAULT_STACK 0 +# define DOUBLEFAULT_STACK 1 +# define NMI_STACK 2 +# define DEBUG_STACK 0 +# define MCE_STACK 3 +# define N_EXCEPTION_STACKS 3 /* hw limit: 7 */ +#else +# define STACKFAULT_STACK 1 +# define DOUBLEFAULT_STACK 2 +# define NMI_STACK 3 +# define DEBUG_STACK 4 +# define MCE_STACK 5 +# define 
N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#endif #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index dd59a85..fffef51 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -442,6 +442,14 @@ static inline void *kmap_atomic_pte(struct page *page, enum km_type type) ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type); return (void *)ret; } + +static inline void *kmap_atomic_pte_direct(struct page *page, enum km_type type) +{ + unsigned long ret; + ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte_direct, + page, type); + return (void *)ret; +} #endif static inline void pte_update(struct mm_struct *mm, unsigned long addr, diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b1e70d5..ca14f6c 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -306,6 +306,7 @@ struct pv_mmu_ops { #ifdef CONFIG_HIGHPTE void *(*kmap_atomic_pte)(struct page *page, enum km_type type); + void *(*kmap_atomic_pte_direct)(struct page *page, enum km_type type); #endif struct pv_lazy_ops lazy_mode; diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index b4bf9a9..567d009 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -83,7 +83,7 @@ struct irq_routing_table { extern unsigned int pcibios_irq_mask; extern int pcibios_scanned; -extern spinlock_t pci_config_lock; +extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 177b016..0e989a1 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -71,6 +71,7 @@ static inline void pud_clear(pud_t 
*pudp) { unsigned long pgd; + preempt_disable(); set_pud(pudp, __pud(0)); /* @@ -86,6 +87,7 @@ static inline void pud_clear(pud_t *pudp) if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD)) write_cr3(pgd); + preempt_enable(); } #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 01fd946..b323c3d 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -59,14 +59,20 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); #define pte_offset_map_nested(dir, address) \ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ pte_index((address))) +#define pte_offset_map_direct(dir, address) \ + ((pte_t *)kmap_atomic_pte_direct(pmd_page(*(dir)), __KM_PTE) + \ + pte_index((address))) #define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) +#define pte_unmap_direct(pte) kunmap_atomic_direct((pte), __KM_PTE) #else #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) #define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address)) +#define pte_offset_map_direct(dir, address) pte_offset_map((dir), (address)) #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) +#define pte_unmap_direct(pte) do { } while (0) #endif /* Clear a kernel PTE and flush it from the TLB */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index c57a301..efc01ae 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -126,8 +126,10 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* x86-64 always has all page tables mapped. 
*/ #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) #define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) -#define pte_unmap(pte) /* NOP */ -#define pte_unmap_nested(pte) /* NOP */ +#define pte_offset_map_direct(dir, address) pte_offset_kernel((dir), (address)) +#define pte_unmap(pte) do { } while (0) +#define pte_unmap_nested(pte) do { } while (0) +#define pte_unmap_direct(pte) do { } while (0) #define update_mmu_cache(vma, address, pte) do { } while (0) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index fc801ba..b753ea5 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -450,6 +450,8 @@ struct thread_struct { struct perf_event *ptrace_bps[HBP_NUM]; /* Debug status used for traps, single steps, etc... */ unsigned long debugreg6; + /* Keep track of the exact dr7 value set by the user */ + unsigned long ptrace_dr7; /* Fault info: */ unsigned long cr2; unsigned long trap_no; diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index ca7517d..6051918 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -44,14 +44,14 @@ struct rwsem_waiter; -extern asmregparm struct rw_semaphore * - rwsem_down_read_failed(struct rw_semaphore *sem); -extern asmregparm struct rw_semaphore * - rwsem_down_write_failed(struct rw_semaphore *sem); -extern asmregparm struct rw_semaphore * - rwsem_wake(struct rw_semaphore *); -extern asmregparm struct rw_semaphore * - rwsem_downgrade_wake(struct rw_semaphore *sem); +extern asmregparm struct rw_anon_semaphore * + rwsem_down_read_failed(struct rw_anon_semaphore *sem); +extern asmregparm struct rw_anon_semaphore * + rwsem_down_write_failed(struct rw_anon_semaphore *sem); +extern asmregparm struct rw_anon_semaphore * + rwsem_wake(struct rw_anon_semaphore *); +extern asmregparm struct rw_anon_semaphore * + rwsem_downgrade_wake(struct rw_anon_semaphore *sem); /* * the semaphore 
definition @@ -64,9 +64,9 @@ extern asmregparm struct rw_semaphore * #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -struct rw_semaphore { +struct rw_anon_semaphore { signed long count; - spinlock_t wait_lock; + raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -74,35 +74,34 @@ struct rw_semaphore { }; #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } #else -# define __RWSEM_DEP_MAP_INIT(lockname) +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) #endif - -#define __RWSEM_INITIALIZER(name) \ +#define __RWSEM_ANON_INITIALIZER(name) \ { \ - RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ + RWSEM_UNLOCKED_VALUE, __RAW_SPIN_LOCK_UNLOCKED((name).wait_lock), \ LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \ } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_ANON_INITIALIZER(name) -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); +extern void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key); -#define init_rwsem(sem) \ +#define init_anon_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __init_anon_rwsem((sem), #sem, &__key); \ } while (0) /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct rw_anon_semaphore *sem) { asm volatile("# beginning down_read\n\t" LOCK_PREFIX " incl (%%eax)\n\t" @@ -119,7 +118,7 @@ static inline void __down_read(struct rw_semaphore *sem) /* * trylock for reading -- returns 1 if successful, 0 if contention */ -static 
inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct rw_anon_semaphore *sem) { __s32 result, tmp; asm volatile("# beginning __down_read_trylock\n\t" @@ -141,7 +140,8 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void +__down_write_nested(struct rw_anon_semaphore *sem, int subclass) { int tmp; @@ -160,7 +160,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) : "memory", "cc"); } -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write(struct rw_anon_semaphore *sem) { __down_write_nested(sem, 0); } @@ -168,7 +168,7 @@ static inline void __down_write(struct rw_semaphore *sem) /* * trylock for writing -- returns 1 if successful, 0 if contention */ -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct rw_anon_semaphore *sem) { signed long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, @@ -181,7 +181,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct rw_anon_semaphore *sem) { __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; asm volatile("# beginning __up_read\n\t" @@ -199,7 +199,7 @@ static inline void __up_read(struct rw_semaphore *sem) /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct rw_anon_semaphore *sem) { asm volatile("# beginning __up_write\n\t" " movl %2,%%edx\n\t" @@ -218,7 +218,7 @@ static inline void __up_write(struct rw_semaphore *sem) /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct rw_anon_semaphore *sem) { asm volatile("# beginning 
__downgrade_write\n\t" LOCK_PREFIX " addl %2,(%%eax)\n\t" @@ -235,7 +235,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct rw_anon_semaphore *sem) { asm volatile(LOCK_PREFIX "addl %1,%0" : "+m" (sem->count) @@ -245,7 +245,7 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct rw_anon_semaphore *sem) { int tmp = delta; @@ -256,10 +256,54 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) return tmp + delta; } +static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) +{ + return (sem->count != 0); +} + +#ifndef CONFIG_PREEMPT_RT + +struct rw_semaphore { + signed long count; + raw_spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +#else +# define __RWSEM_DEP_MAP_INIT(lockname) +#endif + +#define __RWSEM_INITIALIZER(name) \ +{ 0, __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \ + __RWSEM_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +static inline void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + __init_anon_rwsem((struct rw_anon_semaphore *)sem, name, key); +} + +#define init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ +} while (0) + + static inline int rwsem_is_locked(struct rw_semaphore *sem) { return (sem->count != 0); } +#endif #endif /* __KERNEL__ */ #endif 
/* _ASM_X86_RWSEM_H */ diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 5469630..3d20117 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -58,9 +58,9 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) unsigned long long ns; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); ns = __cycles_2_ns(cyc); - local_irq_restore(flags); + raw_local_irq_restore(flags); return ns; } diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 7f3eba0..1c77e81 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -7,6 +7,21 @@ #include <asm/processor.h> #include <asm/system.h> +/* + * TLB-flush needs to be nonpreemptible on PREEMPT_RT due to the + * following complex race scenario: + * + * if the current task is lazy-TLB and does a TLB flush and + * gets preempted after the movl %%cr3, %0 but before the + * movl %0, %%cr3 then its ->active_mm might change and it will + * install the wrong cr3 when it switches back. This is not a + * problem for the lazy-TLB task itself, but if the next task it + * switches to has an ->mm that is also the lazy-TLB task's + * new ->active_mm, then the scheduler will assume that cr3 is + * the new one, while we overwrote it with the old one. The result + * is the wrong cr3 in the new (non-lazy-TLB) task, which typically + * causes an infinite pagefault upon the next userspace access. 
+ */ #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else @@ -17,7 +32,9 @@ static inline void __native_flush_tlb(void) { + preempt_disable(); native_write_cr3(native_read_cr3()); + preempt_enable(); } static inline void __native_flush_tlb_global(void) @@ -95,6 +112,13 @@ static inline void __flush_tlb_one(unsigned long addr) static inline void flush_tlb_mm(struct mm_struct *mm) { + /* + * This is safe on PREEMPT_RT because if we preempt + * right after the check but before the __flush_tlb(), + * and if ->active_mm changes, then we might miss a + * TLB flush, but that TLB flush happened already when + * ->active_mm was changed: + */ if (mm == current->active_mm) __flush_tlb(); } diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 3d61e20..81ccfb3 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -5,7 +5,7 @@ #include <linux/clocksource.h> struct vsyscall_gtod_data { - seqlock_t lock; + raw_seqlock_t lock; /* open coded 'struct timespec' */ time_t wall_time_sec; diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index 133b40a..7a6aa68 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -865,7 +865,21 @@ static struct xor_block_template xor_block_pIII_sse = { #include <asm-generic/xor.h> #undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ +/* + * MMX/SSE ops disable preemption for long periods of time, + * so on PREEMPT_RT use the register-based ops only: + */ +#ifdef CONFIG_PREEMPT_RT +# define XOR_TRY_TEMPLATES \ + do { \ + xor_speed(&xor_block_8regs); \ + xor_speed(&xor_block_8regs_p); \ + xor_speed(&xor_block_32regs); \ + xor_speed(&xor_block_32regs_p); \ + } while (0) +# define XOR_SELECT_TEMPLATE(FASTEST) (FASTEST) +#else +# define XOR_TRY_TEMPLATES \ do { \ xor_speed(&xor_block_8regs); \ xor_speed(&xor_block_8regs_p); \ @@ -882,7 +896,8 @@ do { \ /* We force the use of the SSE xor block because it can write around L2. 
We may also be able to load into the L1 only depending on how the cpu deals with a load to a line that is being prefetched. */ -#define XOR_SELECT_TEMPLATE(FASTEST) \ +# define XOR_SELECT_TEMPLATE(FASTEST) \ (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) +#endif /* CONFIG_PREEMPT_RT */ #endif /* _ASM_X86_XOR_32_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0acbcdf..af1c583 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1344,14 +1344,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { }, { .callback = force_acpi_ht, - .ident = "ASUS P2B-DS", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"), - }, - }, - { - .callback = force_acpi_ht, .ident = "ASUS CUR-DLS", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 53243ca..dced1f6 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -73,8 +73,8 @@ */ int sis_apic_bug = -1; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -406,7 +406,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) struct irq_pin_list *entry; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; int pin; @@ -415,11 +415,11 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? 
*/ if (reg & IO_APIC_REDIR_REMOTE_IRR) { - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return true; } } - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return false; } @@ -433,10 +433,10 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { union entry_union eu; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return eu.entry; } @@ -459,9 +459,9 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __ioapic_write_entry(apic, pin, e); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -474,10 +474,10 @@ static void ioapic_mask_entry(int apic, int pin) unsigned long flags; union entry_union eu = { .entry.mask = 1 }; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x10 + 2*pin, eu.w1); io_apic_write(apic, 0x11 + 2*pin, eu.w2); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -604,9 +604,9 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc) BUG_ON(!cfg); - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __mask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) @@ -614,9 +614,9 @@ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) struct irq_cfg *cfg = 
desc->chip_data; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __unmask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void mask_IO_APIC_irq(unsigned int irq) @@ -1140,12 +1140,12 @@ void lock_vector_lock(void) /* Used to the online set of cpus does not change * during assign_irq_vector. */ - spin_lock(&vector_lock); + raw_spin_lock(&vector_lock); } void unlock_vector_lock(void) { - spin_unlock(&vector_lock); + raw_spin_unlock(&vector_lock); } static int @@ -1232,9 +1232,9 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) int err; unsigned long flags; - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); err = __assign_irq_vector(irq, cfg, mask); - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); return err; } @@ -1601,14 +1601,14 @@ __apicdebuginit(void) print_IO_APIC(void) for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic, 0); reg_01.raw = io_apic_read(apic, 1); if (reg_01.bits.version >= 0x10) reg_02.raw = io_apic_read(apic, 2); if (reg_01.bits.version >= 0x20) reg_03.raw = io_apic_read(apic, 3); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); @@ -1830,7 +1830,7 @@ __apicdebuginit(void) print_PIC(void) printk(KERN_DEBUG "\nprinting PIC contents\n"); - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); v = inb(0xa1) << 8 | inb(0x21); printk(KERN_DEBUG "... 
PIC IMR: %04x\n", v); @@ -1844,7 +1844,7 @@ __apicdebuginit(void) print_PIC(void) outb(0x0a,0xa0); outb(0x0a,0x20); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); printk(KERN_DEBUG "... PIC ISR: %04x\n", v); @@ -1903,9 +1903,9 @@ void __init enable_IO_APIC(void) * The number of IO-APIC IRQ registers (== #pins): */ for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); nr_ioapic_registers[apic] = reg_01.bits.entries+1; } @@ -2045,9 +2045,9 @@ void __init setup_ioapic_ids_from_mpc(void) for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic_id, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); old_id = mp_ioapics[apic_id].apicid; @@ -2106,16 +2106,16 @@ void __init setup_ioapic_ids_from_mpc(void) mp_ioapics[apic_id].apicid); reg_00.bits.ID = mp_ioapics[apic_id].apicid; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic_id, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); /* * Sanity check */ - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic_id, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) printk("could not set ID!\n"); else @@ -2198,7 +2198,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) unsigned long flags; struct irq_cfg *cfg; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, 
flags); if (irq < nr_legacy_irqs) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) @@ -2206,7 +2206,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) } cfg = irq_cfg(irq); __unmask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; } @@ -2217,9 +2217,9 @@ static int ioapic_retrigger_irq(unsigned int irq) struct irq_cfg *cfg = irq_cfg(irq); unsigned long flags; - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; } @@ -2312,14 +2312,14 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) irq = desc->irq; cfg = desc->chip_data; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); ret = set_desc_affinity(desc, mask, &dest); if (!ret) { /* Only the high 8 bits are valid. 
*/ dest = SET_APIC_LOGICAL_ID(dest); __target_IO_APIC_irq(irq, dest, cfg); } - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return ret; } @@ -2554,9 +2554,9 @@ static void eoi_ioapic_irq(struct irq_desc *desc) irq = desc->irq; cfg = desc->chip_data; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void ack_apic_level(unsigned int irq) @@ -2570,7 +2570,8 @@ static void ack_apic_level(unsigned int irq) irq_complete_move(&desc); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(desc->status & IRQ_MOVE_PENDING)) { + if (unlikely(desc->status & IRQ_MOVE_PENDING) && + !(desc->status & IRQ_INPROGRESS)) { do_unmask_irq = 1; mask_IO_APIC_irq_desc(desc); } @@ -2664,6 +2665,16 @@ static void ack_apic_level(unsigned int irq) move_masked_irq(irq); unmask_IO_APIC_irq_desc(desc); } + +#if (defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)) && \ + defined(CONFIG_PREEMPT_HARDIRQS) + /* + * With threaded interrupts, we always have IRQ_INPROGRESS + * when acking. CHECKME !!!!! 
+ */ + else if (unlikely(desc->status & IRQ_MOVE_PENDING)) + move_masked_irq(irq); +#endif } #ifdef CONFIG_INTR_REMAP @@ -3138,13 +3149,13 @@ static int ioapic_resume(struct sys_device *dev) data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(dev->id, 0); if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { reg_00.bits.ID = mp_ioapics[dev->id].apicid; io_apic_write(dev->id, 0, reg_00.raw); } - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); for (i = 0; i < nr_ioapic_registers[dev->id]; i++) ioapic_write_entry(dev->id, i, entry[i]); @@ -3207,7 +3218,6 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) if (irq_want < nr_irqs_gsi) irq_want = nr_irqs_gsi; - spin_lock_irqsave(&vector_lock, flags); for (new = irq_want; new < nr_irqs; new++) { desc_new = irq_to_desc_alloc_node(new, node); if (!desc_new) { @@ -3219,14 +3229,15 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) if (cfg_new->vector != 0) continue; + raw_spin_lock_irqsave(&vector_lock, flags); desc_new = move_irq_desc(desc_new, node); cfg_new = desc_new->chip_data; if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; + raw_spin_unlock_irqrestore(&vector_lock, flags); break; } - spin_unlock_irqrestore(&vector_lock, flags); if (irq > 0) { dynamic_irq_init(irq); @@ -3266,9 +3277,9 @@ void destroy_irq(unsigned int irq) desc->chip_data = cfg; free_irte(irq); - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); } /* @@ -3805,9 +3816,9 @@ int __init io_apic_get_redir_entries (int ioapic) union IO_APIC_reg_01 reg_01; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); 
reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.entries; } @@ -3969,9 +3980,9 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) if (physids_empty(apic_id_map)) apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); if (apic_id >= get_physical_broadcast()) { printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " @@ -4005,10 +4016,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) if (reg_00.bits.ID != apic_id) { reg_00.bits.ID = apic_id; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(ioapic, 0, reg_00.raw); reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); /* Sanity check */ if (reg_00.bits.ID != apic_id) { @@ -4029,9 +4040,9 @@ int __init io_apic_get_version(int ioapic) union IO_APIC_reg_01 reg_01; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.version; } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 08385e0..54e774d 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -106,7 +106,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) unsigned long mask = cpumask_bits(cpumask)[0]; unsigned long flags; - if (WARN_ONCE(!mask, "empty IPI mask")) + if (!mask) return; local_irq_save(flags); diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 
0159a69..1aaf293 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -91,7 +91,9 @@ static inline unsigned int get_timer_irqs(int cpu) */ static __init void nmi_cpu_busy(void *data) { +#ifndef CONFIG_PREEMPT_RT local_irq_enable_in_hardirq(); +#endif /* * Intentionally don't use cpu_relax here. This is * to make sure that the performance counter really ticks, @@ -416,13 +418,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) /* We can be called before check_nmi_watchdog, hence NULL check. */ if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */ - spin_lock(&lock); + raw_spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); show_regs(regs); dump_stack(); - spin_unlock(&lock); + raw_spin_unlock(&lock); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); rc = 1; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index b5b6b23..a920e00 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1224,7 +1224,7 @@ static void reinit_timer(void) #ifdef INIT_TIMER_AFTER_SUSPEND unsigned long flags; - spin_lock_irqsave(&i8253_lock, flags); + raw_spin_lock_irqsave(&i8253_lock, flags); /* set the clock to HZ */ outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ udelay(10); @@ -1232,7 +1232,7 @@ static void reinit_timer(void) udelay(10); outb_pit(LATCH >> 8, PIT_CH0); /* MSB */ udelay(10); - spin_unlock_irqrestore(&i8253_lock, flags); + raw_spin_unlock_irqrestore(&i8253_lock, flags); #endif } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4868e4a..fae7a32 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1013,7 +1013,9 @@ DEFINE_PER_CPU(unsigned int, irq_count) = -1; */ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { [0 ... 
N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, +#if DEBUG_STACK > 0 [DEBUG_STACK - 1] = DEBUG_STKSZ +#endif }; static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 55da0c5..46f40f0 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -570,7 +570,7 @@ static unsigned long set_mtrr_state(void) static unsigned long cr4; -static DEFINE_SPINLOCK(set_atomicity_lock); +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts, @@ -590,7 +590,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) * changes to the way the kernel boots */ - spin_lock(&set_atomicity_lock); + raw_spin_lock(&set_atomicity_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; @@ -627,7 +627,7 @@ static void post_set(void) __releases(set_atomicity_lock) /* Restore value of CR4 */ if (cpu_has_pge) write_cr4(cr4); - spin_unlock(&set_atomicity_lock); + raw_spin_unlock(&set_atomicity_lock); } static void generic_set_all(void) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index ae775ca..04078f1 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -98,6 +98,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, } +#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_EVENT_TRACE) +extern unsigned long worst_stack_left; +#else +# define worst_stack_left -1L +#endif + void show_registers(struct pt_regs *regs) { int i; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 0ad9597..bd49ca5 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -22,10 +22,14 @@ (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) static char x86_stack_ids[][8] = { +#if DEBUG_STACK > 0 [ DEBUG_STACK-1 ] = "#DB", +#endif [ NMI_STACK-1 ] = 
"NMI", [ DOUBLEFAULT_STACK-1 ] = "#DF", +#if STACKFAULT_STACK > 0 [ STACKFAULT_STACK-1 ] = "#SS", +#endif [ MCE_STACK-1 ] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ [ N_EXCEPTION_STACKS ... diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index b9c830c..02aaf8f 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -59,7 +59,7 @@ static void early_vga_write(struct console *con, const char *str, unsigned n) static struct console early_vga_console = { .name = "earlyvga", .write = early_vga_write, - .flags = CON_PRINTBUFFER, + .flags = CON_PRINTBUFFER | CON_ATOMIC, .index = -1, }; @@ -156,7 +156,7 @@ static __init void early_serial_init(char *s) static struct console early_serial_console = { .name = "earlyser", .write = early_serial_write, - .flags = CON_PRINTBUFFER, + .flags = CON_PRINTBUFFER | CON_ATOMIC, .index = -1, }; @@ -166,7 +166,7 @@ static int __initdata early_console_initialized; asmlinkage void early_printk(const char *fmt, ...) { - char buf[512]; + static char buf[512]; int n; va_list ap; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 44a8e0d..57d3849 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -375,13 +375,13 @@ END(ret_from_exception) ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? - jnz restore_all + jnz restore_nocheck need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl - jz restore_all + jz restore_nocheck testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? 
- jz restore_all + jz restore_nocheck call preempt_schedule_irq jmp need_resched END(resume_kernel) @@ -639,12 +639,9 @@ work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: - call schedule + call __schedule LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index b5a9896..8e77347 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -30,7 +30,11 @@ static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); pgd_clear(pgd); - __flush_tlb_all(); + /* + * preempt_disable/enable does not work this early in the + * bootup yet: + */ + write_cr3(read_cr3()); } /* Don't add a printk in there. printk relies on the PDA which is not initialized diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 7fd318b..4fabb36 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -596,6 +596,7 @@ ignore_int: call dump_stack addl $(5*4),%esp + call dump_stack popl %ds popl %es popl %edx diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 05d5fec..bb6006e 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -212,25 +212,6 @@ static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); } -/* - * Store a breakpoint's encoded address, length, and type. - */ -static int arch_store_info(struct perf_event *bp) -{ - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - /* - * For kernel-addresses, either the address or symbol name can be - * specified. 
- */ - if (info->name) - info->address = (unsigned long) - kallsyms_lookup_name(info->name); - if (info->address) - return 0; - - return -EINVAL; -} - int arch_bp_generic_fields(int x86_len, int x86_type, int *gen_len, int *gen_type) { @@ -362,10 +343,13 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, return ret; } - ret = arch_store_info(bp); - - if (ret < 0) - return ret; + /* + * For kernel-addresses, either the address or symbol name can be + * specified. + */ + if (info->name) + info->address = (unsigned long) + kallsyms_lookup_name(info->name); /* * Check that the low-order bits of the address are appropriate * for the alignment implied by len. diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 23c1679..2dfd315 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -16,7 +16,7 @@ #include <asm/hpet.h> #include <asm/smp.h> -DEFINE_SPINLOCK(i8253_lock); +DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); /* @@ -33,7 +33,7 @@ struct clock_event_device *global_clock_event; static void init_pit_timer(enum clock_event_mode mode, struct clock_event_device *evt) { - spin_lock(&i8253_lock); + raw_spin_lock(&i8253_lock); switch (mode) { case CLOCK_EVT_MODE_PERIODIC: @@ -62,7 +62,7 @@ static void init_pit_timer(enum clock_event_mode mode, /* Nothing to do here */ break; } - spin_unlock(&i8253_lock); + raw_spin_unlock(&i8253_lock); } /* @@ -72,10 +72,10 @@ static void init_pit_timer(enum clock_event_mode mode, */ static int pit_next_event(unsigned long delta, struct clock_event_device *evt) { - spin_lock(&i8253_lock); + raw_spin_lock(&i8253_lock); outb_pit(delta & 0xff , PIT_CH0); /* LSB */ outb_pit(delta >> 8 , PIT_CH0); /* MSB */ - spin_unlock(&i8253_lock); + raw_spin_unlock(&i8253_lock); return 0; } @@ -130,7 +130,7 @@ static cycle_t pit_read(struct clocksource *cs) int count; u32 jifs; - spin_lock_irqsave(&i8253_lock, flags); + raw_spin_lock_irqsave(&i8253_lock, flags); /* * Although our caller may have the read 
side of xtime_lock, * this is now a seqlock, and we are cheating in this routine @@ -176,7 +176,7 @@ static cycle_t pit_read(struct clocksource *cs) old_count = count; old_jifs = jifs; - spin_unlock_irqrestore(&i8253_lock, flags); + raw_spin_unlock_irqrestore(&i8253_lock, flags); count = (LATCH - 1) - count; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index df89102..96400fe 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -32,7 +32,7 @@ */ static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); struct irq_chip i8259A_chip = { @@ -68,13 +68,13 @@ void disable_8259A_irq(unsigned int irq) unsigned int mask = 1 << irq; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask |= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); else outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } void enable_8259A_irq(unsigned int irq) @@ -82,13 +82,13 @@ void enable_8259A_irq(unsigned int irq) unsigned int mask = ~(1 << irq); unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask &= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); else outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } int i8259A_irq_pending(unsigned int irq) @@ -97,12 +97,12 @@ int i8259A_irq_pending(unsigned int irq) unsigned long flags; int ret; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); if (irq < 8) ret = inb(PIC_MASTER_CMD) & mask; else ret = inb(PIC_SLAVE_CMD) & (mask >> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); return ret; } @@ -150,7 +150,7 @@ static 
void mask_and_ack_8259A(unsigned int irq) unsigned int irqmask = 1 << irq; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); /* * Lightweight spurious IRQ detection. We do not want * to overdo spurious IRQ handling - it's usually a sign @@ -168,6 +168,8 @@ static void mask_and_ack_8259A(unsigned int irq) */ if (cached_irq_mask & irqmask) goto spurious_8259A_irq; + if (irq & 8) + outb(0x60+(irq&7), PIC_SLAVE_CMD); /* 'Specific EOI' to slave */ cached_irq_mask |= irqmask; handle_real_irq: @@ -183,7 +185,7 @@ handle_real_irq: outb(cached_master_mask, PIC_MASTER_IMR); outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ } - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); return; spurious_8259A_irq: @@ -285,24 +287,24 @@ void mask_8259A(void) { unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } void unmask_8259A(void) { unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } void init_8259A(int auto_eoi) @@ -311,7 +313,7 @@ void init_8259A(int auto_eoi) i8259A_auto_eoi = auto_eoi; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ @@ -328,10 +330,10 @@ void init_8259A(int auto_eoi) /* 8259A-1 (the master) has a slave on IR2 */ outb_pic(1U << PIC_CASCADE_IR, 
PIC_MASTER_IMR); - if (auto_eoi) /* master does Auto EOI */ - outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ - outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + if (!auto_eoi) /* master expects normal EOI */ + outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + else /* master does Auto EOI */ + outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ @@ -356,5 +358,5 @@ void init_8259A(int auto_eoi) outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index d593222..7cdd31c 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -72,6 +72,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id) static struct irqaction fpu_irq = { .handler = math_error_irq, .name = "fpu", + .flags = IRQF_NODELAY, }; #endif @@ -81,6 +82,7 @@ static struct irqaction fpu_irq = { static struct irqaction irq2 = { .handler = no_action, .name = "cascade", + .flags = IRQF_NODELAY, }; DEFINE_PER_CPU(vector_irq_t, vector_irq) = { diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 5b8c750..04bc8c9 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -434,7 +434,7 @@ static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); regs->ip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); + preempt_enable(); return; } #endif @@ -543,7 +543,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) } } /* else: not a kprobe fault; let the kernel handle it */ - preempt_enable_no_resched(); + preempt_enable(); return 0; } @@ -843,7 +843,7 @@ static int 
__kprobes post_kprobe_handler(struct pt_regs *regs) } reset_current_kprobe(); out: - preempt_enable_no_resched(); + preempt_enable(); /* * if somebody else is singlestepping across a probe point, flags @@ -877,7 +877,7 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) restore_previous_kprobe(kcb); else reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); break; case KPROBE_HIT_ACTIVE: case KPROBE_HIT_SSDONE: @@ -1024,7 +1024,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp), kcb->jprobes_stack, MIN_STACK_SIZE(kcb->jprobe_saved_sp)); - preempt_enable_no_resched(); + preempt_enable(); return 1; } return 0; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1b1739d..024d2c1 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -396,6 +396,20 @@ struct pv_apic_ops pv_apic_ops = { #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) #endif +#ifdef CONFIG_HIGHPTE +/* + * kmap_atomic() might be an inline or a macro: + */ +static void *kmap_atomic_func(struct page *page, enum km_type idx) +{ + return kmap_atomic(page, idx); +} +static void *kmap_atomic_direct_func(struct page *page, enum km_type idx) +{ + return kmap_atomic_direct(page, idx); +} +#endif + struct pv_mmu_ops pv_mmu_ops = { .read_cr2 = native_read_cr2, @@ -429,7 +443,8 @@ struct pv_mmu_ops pv_mmu_ops = { .ptep_modify_prot_commit = __ptep_modify_prot_commit, #ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = kmap_atomic, + .kmap_atomic_pte = kmap_atomic_func, + .kmap_atomic_pte_direct = kmap_atomic_direct_func, #endif #if PAGETABLE_LEVELS >= 3 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 37ad1e0..46f5881 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -101,7 +101,6 @@ void cpu_idle(void) tick_nohz_stop_sched_tick(1); while (!need_resched()) { - check_pgt_cache(); rmb(); if 
(cpu_is_offline(cpu)) @@ -113,10 +112,12 @@ void cpu_idle(void) pm_idle(); start_critical_timings(); } + local_irq_disable(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -148,8 +149,10 @@ void __show_regs(struct pt_regs *regs, int all) regs->ax, regs->bx, regs->cx, regs->dx); printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", regs->si, regs->di, regs->bp, sp); - printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", - (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); + printk(KERN_DEFAULT + " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x preempt:%08x\n", + (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, + preempt_count()); if (!all) return; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 41a26a8..bb62edf 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -146,9 +146,11 @@ void cpu_idle(void) } tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -527,6 +529,7 @@ void set_personality_ia32(void) /* Make sure to be in 32bit mode */ set_thread_flag(TIF_IA32); + current->personality |= force_personality32; /* Prepare the first "return" to user space */ current_thread_info()->status |= TS_COMPAT; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 017d937..0c1033d 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -702,7 +702,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) } else if (n == 6) { val = thread->debugreg6; } else if (n == 7) { - val = ptrace_get_dr7(thread->ptrace_bps); + val = thread->ptrace_dr7; } return val; } @@ -778,8 +778,11 @@ int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) return rc; } 
/* All that's left is DR7 */ - if (n == 7) + if (n == 7) { rc = ptrace_write_dr7(tsk, val); + if (!rc) + thread->ptrace_dr7 = val; + } ret_path: return rc; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4fd173c..fccd2c8 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -773,6 +773,13 @@ static void do_signal(struct pt_regs *regs) int signr; sigset_t *oldset; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode. Just return without doing anything diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index ec1de97..a83e38d 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -120,6 +120,16 @@ static void native_smp_send_reschedule(int cpu) apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + apic->send_IPI_allbutself(RESCHEDULE_VECTOR); +} + void native_send_call_func_single_ipi(int cpu) { apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index be25734..fb5cc5e 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -70,11 +70,11 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) * manually to deassert NMI lines for the watchdog if run * on an 82489DX-based system. */ - spin_lock(&i8259A_lock); + raw_spin_lock(&i8259A_lock); outb(0x0c, PIC_MASTER_OCW3); /* Ack the IRQ; AEOI will end it automatically. 
*/ inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); + raw_spin_unlock(&i8259A_lock); } global_clock_event->event_handler(global_clock_event); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 3339917..9288ccf 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -88,9 +88,10 @@ static inline void conditional_sti(struct pt_regs *regs) local_irq_enable(); } -static inline void preempt_conditional_sti(struct pt_regs *regs) +static inline void preempt_conditional_sti(struct pt_regs *regs, int stack) { - inc_preempt_count(); + if (stack) + inc_preempt_count(); if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } @@ -101,11 +102,12 @@ static inline void conditional_cli(struct pt_regs *regs) local_irq_disable(); } -static inline void preempt_conditional_cli(struct pt_regs *regs) +static inline void preempt_conditional_cli(struct pt_regs *regs, int stack) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); - dec_preempt_count(); + if (stack) + dec_preempt_count(); } #ifdef CONFIG_X86_32 @@ -232,9 +234,9 @@ dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) if (notify_die(DIE_TRAP, "stack segment", regs, error_code, 12, SIGBUS) == NOTIFY_STOP) return; - preempt_conditional_sti(regs); + preempt_conditional_sti(regs, STACKFAULT_STACK); do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, STACKFAULT_STACK); } dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) @@ -470,9 +472,9 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) return; #endif - preempt_conditional_sti(regs); + preempt_conditional_sti(regs, DEBUG_STACK); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, DEBUG_STACK); } #ifdef CONFIG_X86_64 @@ -554,7 +556,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) return; /* It's safe to 
allow irq's after DR6 has been saved */ - preempt_conditional_sti(regs); + preempt_conditional_sti(regs, DEBUG_STACK); if (regs->flags & X86_VM_MASK) { handle_vm86_trap((struct kernel_vm86_regs *) regs, @@ -577,7 +579,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS)) send_sigtrap(tsk, regs, error_code, si_code); - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, DEBUG_STACK); return; } diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 34a279a..ccbea12 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -559,7 +559,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) struct irq_desc *desc; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); /* Find out what's interrupting in the PIIX4 master 8259 */ outb(0x0c, 0x20); /* OCW3 Poll command */ @@ -596,7 +596,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) outb(0x60 + realirq, 0x20); } - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); desc = irq_to_desc(realirq); @@ -614,18 +614,20 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) return IRQ_HANDLED; out_unlock: - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); return IRQ_NONE; } static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = IRQF_NODELAY, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = IRQF_NODELAY, }; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5ffb562..8ea7e48 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -137,6 +137,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) local_irq_enable(); if 
(!current->thread.vm86_info) { + local_irq_disable(); printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 9055e58..1bd8ff3 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -59,7 +59,7 @@ int __vgetcpu_mode __section_vgetcpu_mode; struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = { - .lock = SEQLOCK_UNLOCKED, + .lock = __RAW_SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), .sysctl_enabled = 1, }; @@ -67,10 +67,10 @@ void update_vsyscall_tz(void) { unsigned long flags; - write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + write_raw_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); /* sys_tz has changed */ vsyscall_gtod_data.sys_tz = sys_tz; - write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); + write_raw_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, @@ -78,18 +78,45 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, { unsigned long flags; - write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + write_raw_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + + if (likely(vsyscall_gtod_data.sysctl_enabled == 2)) { + struct timespec tmp = *(wall_time); + cycle_t (*vread)(void); + cycle_t now; + + vread = vsyscall_gtod_data.clock.vread; + if (likely(vread)) + now = vread(); + else + now = clock->read(clock); + + /* calculate interval: */ + now = (now - clock->cycle_last) & clock->mask; + /* convert to nsecs: */ + tmp.tv_nsec += ( now * clock->mult) >> clock->shift; + + while (tmp.tv_nsec >= NSEC_PER_SEC) { + tmp.tv_sec += 1; + tmp.tv_nsec -= NSEC_PER_SEC; + } + + vsyscall_gtod_data.wall_time_sec = tmp.tv_sec; + vsyscall_gtod_data.wall_time_nsec = tmp.tv_nsec; + } else { + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; + } + /* copy vsyscall 
data */ vsyscall_gtod_data.clock.vread = clock->vread; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; vsyscall_gtod_data.clock.mask = clock->mask; vsyscall_gtod_data.clock.mult = mult; vsyscall_gtod_data.clock.shift = clock->shift; - vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; - vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); - write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); + write_raw_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } /* RED-PEN may want to readd seq locking, but then the variable should be @@ -125,8 +152,28 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) unsigned seq; unsigned long mult, shift, nsec; cycle_t (*vread)(void); + + if (likely(__vsyscall_gtod_data.sysctl_enabled == 2)) { + struct timeval tmp; + + do { + barrier(); + tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; + tv->tv_usec = __vsyscall_gtod_data.wall_time_nsec; + barrier(); + tmp.tv_sec = __vsyscall_gtod_data.wall_time_sec; + tmp.tv_usec = __vsyscall_gtod_data.wall_time_nsec; + + } while (tmp.tv_usec != tv->tv_usec || + tmp.tv_sec != tv->tv_sec); + + tv->tv_usec /= NSEC_PER_MSEC; + tv->tv_usec *= USEC_PER_MSEC; + return; + } + do { - seq = read_seqbegin(&__vsyscall_gtod_data.lock); + seq = read_raw_seqbegin(&__vsyscall_gtod_data.lock); vread = __vsyscall_gtod_data.clock.vread; if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { @@ -135,6 +182,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) } now = vread(); + base = __vsyscall_gtod_data.clock.cycle_last; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; @@ -142,7 +190,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; nsec = __vsyscall_gtod_data.wall_time_nsec; - } while (read_seqretry(&__vsyscall_gtod_data.lock, 
seq)); + } while (read_raw_seqretry(&__vsyscall_gtod_data.lock, seq)); /* calculate interval: */ cycle_delta = (now - base) & mask; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 15578f1..6a9bee0 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -242,11 +242,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - spin_lock(&ps->inject_lock); + raw_spin_lock(&ps->inject_lock); if (atomic_dec_return(&ps->pit_timer.pending) < 0) atomic_inc(&ps->pit_timer.pending); ps->irq_ack = 1; - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -624,7 +624,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); - spin_lock_init(&pit->pit_state.inject_lock); + raw_spin_lock_init(&pit->pit_state.inject_lock); kvm->arch.vpit = pit; pit->kvm = kvm; @@ -723,12 +723,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) /* Try to inject pending interrupts when * last one has been acked. 
*/ - spin_lock(&ps->inject_lock); + raw_spin_lock(&ps->inject_lock); if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { ps->irq_ack = 0; inject = 1; } - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); if (inject) __inject_pit_timer_intr(kvm); } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index d4c1c7f..900d6b0 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -27,7 +27,7 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - spinlock_t inject_lock; + raw_spinlock_t inject_lock; unsigned long irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index d057c0c..a6a877e 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -32,6 +32,29 @@ #include <linux/kvm_host.h> #include "trace.h" +static void pic_lock(struct kvm_pic *s) + __acquires(&s->lock) +{ + raw_spin_lock(&s->lock); +} + +static void pic_unlock(struct kvm_pic *s) + __releases(&s->lock) +{ + bool wakeup = s->wakeup_needed; + struct kvm_vcpu *vcpu; + + s->wakeup_needed = false; + + raw_spin_unlock(&s->lock); + + if (wakeup) { + vcpu = s->kvm->bsp_vcpu; + if (vcpu) + kvm_vcpu_kick(vcpu); + } +} + static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); @@ -44,18 +67,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) * Other interrupt may be delivered to PIC while lock is dropped but * it should be safe since PIC state is already updated at this stage. 
*/ - spin_unlock(&s->pics_state->lock); + pic_unlock(s->pics_state); kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); - spin_lock(&s->pics_state->lock); + pic_lock(s->pics_state); } void kvm_pic_clear_isr_ack(struct kvm *kvm) { struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + + pic_lock(s); s->pics[0].isr_ack = 0xff; s->pics[1].isr_ack = 0xff; - spin_unlock(&s->lock); + pic_unlock(s); } /* @@ -156,9 +180,9 @@ static void pic_update_irq(struct kvm_pic *s) void kvm_pic_update_irq(struct kvm_pic *s) { - spin_lock(&s->lock); + pic_lock(s); pic_update_irq(s); - spin_unlock(&s->lock); + pic_unlock(s); } int kvm_pic_set_irq(void *opaque, int irq, int level) @@ -166,14 +190,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) struct kvm_pic *s = opaque; int ret = -1; - spin_lock(&s->lock); + pic_lock(s); if (irq >= 0 && irq < PIC_NUM_PINS) { ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); } - spin_unlock(&s->lock); + pic_unlock(s); return ret; } @@ -203,7 +227,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + pic_lock(s); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -228,7 +252,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); - spin_unlock(&s->lock); + pic_unlock(s); return intno; } @@ -442,7 +466,7 @@ static int picdev_write(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte write\n"); return 0; } - spin_lock(&s->lock); + pic_lock(s); switch (addr) { case 0x20: case 0x21: @@ -455,7 +479,7 @@ static int picdev_write(struct kvm_io_device *this, elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } - spin_unlock(&s->lock); + pic_unlock(s); return 0; } @@ -472,7 +496,7 @@ static int picdev_read(struct kvm_io_device *this, printk(KERN_ERR 
"PIC: non byte read\n"); return 0; } - spin_lock(&s->lock); + pic_lock(s); switch (addr) { case 0x20: case 0x21: @@ -486,7 +510,7 @@ static int picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; - spin_unlock(&s->lock); + pic_unlock(s); return 0; } @@ -503,7 +527,7 @@ static void pic_irq_request(void *opaque, int level) s->output = level; if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { s->pics[0].isr_ack &= ~(1 << irq); - kvm_vcpu_kick(vcpu); + s->wakeup_needed = true; } } @@ -520,7 +544,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; - spin_lock_init(&s->lock); + raw_spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index be399e2..4ba8ce2 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -62,7 +62,8 @@ struct kvm_kpic_state { }; struct kvm_pic { - spinlock_t lock; + raw_spinlock_t lock; + bool wakeup_needed; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1e1bc9..43e16bf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2273,18 +2273,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case 
KVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f627779..60a0aa5 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -554,6 +554,7 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) nr = (address - idt_descr.address) >> 3; if (nr == 6) { + zap_rt_locks(); do_invalid_op(regs, 0); return 1; } @@ -1035,7 +1036,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: */ - if (unlikely(in_atomic() || !mm)) { + if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 738e659..5aeae53 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -77,13 +77,13 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, if (write) mask |= _PAGE_RW; - ptep = pte_offset_map(&pmd, addr); + ptep = pte_offset_map_direct(&pmd, addr); do { pte_t pte = gup_get_pte(ptep); struct page *page; if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { - pte_unmap(ptep); + pte_unmap_direct(ptep); return 0; } VM_BUG_ON(!pfn_valid(pte_pfn(pte))); @@ -93,7 +93,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, (*nr)++; } while (ptep++, addr += PAGE_SIZE, addr != end); - pte_unmap(ptep - 1); + pte_unmap_direct(ptep - 1); return 1; } diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 63a6ba6..dcb1899 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,9 +4,9 @@ void *kmap(struct page *page) { - might_sleep(); if (!PageHighMem(page)) return page_address(page); + might_sleep(); return kmap_high(page); } @@ -19,6 +19,27 @@ void kunmap(struct page *page) kunmap_high(page); } +void kunmap_virt(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + 
return; + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + kunmap(page); +} + +struct page *kmap_to_page(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return virt_to_page(ptr); + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + return page; +} +EXPORT_SYMBOL_GPL(kmap_to_page); /* PREEMPT_RT converts some modules to use this */ + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -27,12 +48,13 @@ void kunmap(struct page *page) * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +void *__kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) @@ -42,18 +64,23 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - BUG_ON(!pte_none(*(kmap_pte-idx))); + WARN_ON(!pte_none(*(kmap_pte-idx))); set_pte(kmap_pte-idx, mk_pte(page, prot)); return (void *)vaddr; } -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic_direct(struct page *page, enum km_type type) +{ + return __kmap_atomic_prot(page, type, kmap_prot); +} + +void *__kmap_atomic(struct page *page, enum km_type type) { return kmap_atomic_prot(page, type, kmap_prot); } -void kunmap_atomic(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); @@ -74,19 +101,21 @@ void kunmap_atomic(void *kvaddr, 
enum km_type type) } pagefault_enable(); + preempt_enable(); } /* * This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type) { + preempt_disable(); return kmap_atomic_prot_pfn(pfn, type, kmap_prot); } -EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ +EXPORT_SYMBOL_GPL(__kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ -struct page *kmap_atomic_to_page(void *ptr) +struct page *__kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; @@ -101,10 +130,11 @@ struct page *kmap_atomic_to_page(void *ptr) EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_prot); -EXPORT_SYMBOL(kmap_atomic_to_page); +EXPORT_SYMBOL(kunmap_virt); +EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(__kmap_atomic_prot); +EXPORT_SYMBOL(__kmap_atomic_to_page); void __init set_highmem_pages_init(void) { diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d406c52..9a69721 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -14,8 +14,6 @@ #include <asm/tlb.h> #include <asm/proto.h> -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - unsigned long __initdata e820_table_start; unsigned long __meminitdata e820_table_end; unsigned long __meminitdata e820_table_top; diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 84e236c..715d822 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -60,6 +60,7 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) enum fixed_addresses idx; unsigned long vaddr; + preempt_disable(); pagefault_disable(); debug_kmap_atomic(type); @@ -106,5 +107,6 @@ iounmap_atomic(void *kvaddr, enum km_type type) kpte_clear_flush(kmap_pte-idx, vaddr); 
pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL_GPL(iounmap_atomic); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1d4eb93..a84a759 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -861,8 +861,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, baddr = *addr; } +#if 0 /* Must avoid aliasing mappings in the highmem code */ kmap_flush_unused(); +#endif vm_unmap_aliases(); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ed34f5e..c2ea747 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -132,6 +132,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) reserved at the pmd (PDPT) level. */ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + preempt_disable(); /* * According to Intel App note "TLBs, Paging-Structure Caches, * and Their Invalidation", April 2007, document 317080-001, @@ -140,6 +141,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) */ if (mm == current->active_mm) write_cr3(read_cr3()); + preempt_enable(); } #else /* !CONFIG_X86_PAE */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 65b58e4..426f3a1 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -41,7 +41,7 @@ union smp_flush_state { struct { struct mm_struct *flush_mm; unsigned long flush_va; - spinlock_t tlbstate_lock; + raw_spinlock_t tlbstate_lock; DECLARE_BITMAP(flush_cpumask, NR_CPUS); }; char pad[INTERNODE_CACHE_BYTES]; @@ -181,7 +181,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is * probably not worth checking this for a cache-hot lock. 
*/ - spin_lock(&f->tlbstate_lock); + raw_spin_lock(&f->tlbstate_lock); f->flush_mm = mm; f->flush_va = va; @@ -199,7 +199,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, f->flush_mm = NULL; f->flush_va = 0; - spin_unlock(&f->tlbstate_lock); + raw_spin_unlock(&f->tlbstate_lock); } void native_flush_tlb_others(const struct cpumask *cpumask, @@ -223,7 +223,7 @@ static int __cpuinit init_smp_flush(void) int i; for (i = 0; i < ARRAY_SIZE(flush_state); i++) - spin_lock_init(&flush_state[i].tlbstate_lock); + raw_spin_lock_init(&flush_state[i].tlbstate_lock); return 0; } diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 3347f69..5db38f1 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -321,10 +321,10 @@ static void nmi_cpu_setup(void *dummy) int cpu = smp_processor_id(); struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); nmi_cpu_save_registers(msrs); - spin_lock(&oprofilefs_lock); + raw_spin_lock(&oprofilefs_lock); model->setup_ctrs(model, msrs); nmi_cpu_setup_mux(cpu, msrs); - spin_unlock(&oprofilefs_lock); + raw_spin_unlock(&oprofilefs_lock); per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); apic_write(APIC_LVTPC, APIC_DM_NMI); } diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index d2552c6..b79d322 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -81,7 +81,7 @@ int pcibios_scanned; * This interrupt-safe spinlock protects all accesses to PCI * configuration space. 
*/ -DEFINE_SPINLOCK(pci_config_lock); +DEFINE_RAW_SPINLOCK(pci_config_lock); static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d) { diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index 347d882..2cc4115 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -27,7 +27,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus, return -EINVAL; } - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8); @@ -43,7 +43,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -56,7 +56,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus, if ((bus > 255) || (devfn > 255) || (reg > 4095)) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8); @@ -72,7 +72,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -108,7 +108,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus, if (dev & 0x10) return PCIBIOS_DEVICE_NOT_FOUND; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); outb((u8)(0xF0 | (fn << 1)), 0xCF8); outb((u8)bus, 0xCFA); @@ -127,7 +127,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus, outb(0, 0xCF8); - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -147,7 +147,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus, if (dev & 0x10) return PCIBIOS_DEVICE_NOT_FOUND; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); outb((u8)(0xF0 | 
(fn << 1)), 0xCF8); outb((u8)bus, 0xCFA); @@ -166,7 +166,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus, outb(0, 0xCF8); - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -224,16 +224,23 @@ static int __init pci_check_type1(void) unsigned int tmp; int works = 0; - local_irq_save(flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); outb(0x01, 0xCFB); tmp = inl(0xCF8); outl(0x80000000, 0xCF8); - if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) { - works = 1; + + if (inl(0xCF8) == 0x80000000) { + raw_spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf1)) + works = 1; + + raw_spin_lock_irqsave(&pci_config_lock, flags); } outl(tmp, 0xCF8); - local_irq_restore(flags); + + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return works; } @@ -243,17 +250,19 @@ static int __init pci_check_type2(void) unsigned long flags; int works = 0; - local_irq_save(flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); outb(0x00, 0xCFB); outb(0x00, 0xCF8); outb(0x00, 0xCFA); - if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 && - pci_sanity_check(&pci_direct_conf2)) { - works = 1; - } - local_irq_restore(flags); + if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00) { + raw_spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf2)) + works = 1; + } else + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return works; } diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 90d5fd4..a3d9c54 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -64,7 +64,7 @@ err: *value = -1; if (!base) goto err; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); pci_exp_set_dev_base(base, bus, devfn); @@ -79,7 +79,7 @@ err: *value = -1; *value = mmio_config_readl(mmcfg_virt_addr + reg); break; } - 
spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -97,7 +97,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, if (!base) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); pci_exp_set_dev_base(base, bus, devfn); @@ -112,7 +112,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, mmio_config_writel(mmcfg_virt_addr + reg, value); break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 8eb295e..2dad4dc 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -41,7 +41,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus, if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); write_cf8(bus, devfn, reg); @@ -66,7 +66,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -80,7 +80,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus, if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); write_cf8(bus, devfn, reg); @@ -105,7 +105,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 1c975cc..2daa521 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -161,7 +161,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus, 
if (!value || (bus > 255) || (devfn > 255) || (reg > 255)) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); switch (len) { case 1: @@ -212,7 +212,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return (int)((result & 0xff00) >> 8); } @@ -227,7 +227,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus, if ((bus > 255) || (devfn > 255) || (reg > 255)) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); switch (len) { case 1: @@ -268,7 +268,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return (int)((result & 0xff00) >> 8); } diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index ee55754..e1b68a5 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -47,11 +47,11 @@ notrace static noinline int do_realtime(struct timespec *ts) { unsigned long seq, ns; do { - seq = read_seqbegin(&gtod->lock); + seq = read_raw_seqbegin(&gtod->lock); ts->tv_sec = gtod->wall_time_sec; ts->tv_nsec = gtod->wall_time_nsec; ns = vgetns(); - } while (unlikely(read_seqretry(&gtod->lock, seq))); + } while (unlikely(read_raw_seqretry(&gtod->lock, seq))); timespec_add_ns(ts, ns); return 0; } @@ -76,12 +76,12 @@ notrace static noinline int do_monotonic(struct timespec *ts) { unsigned long seq, ns, secs; do { - seq = read_seqbegin(&gtod->lock); + seq = read_raw_seqbegin(&gtod->lock); secs = gtod->wall_time_sec; ns = gtod->wall_time_nsec + vgetns(); secs += gtod->wall_to_monotonic.tv_sec; ns += gtod->wall_to_monotonic.tv_nsec; - } while (unlikely(read_seqretry(&gtod->lock, seq))); + } while (unlikely(read_raw_seqretry(&gtod->lock, seq)));
vset_normalized_timespec(ts, secs, ns); return 0; } @@ -90,10 +90,10 @@ notrace static noinline int do_realtime_coarse(struct timespec *ts) { unsigned long seq; do { - seq = read_seqbegin(&gtod->lock); + seq = read_raw_seqbegin(&gtod->lock); ts->tv_sec = gtod->wall_time_coarse.tv_sec; ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; - } while (unlikely(read_seqretry(&gtod->lock, seq))); + } while (unlikely(read_raw_seqretry(&gtod->lock, seq))); return 0; } @@ -101,12 +101,12 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts) { unsigned long seq, ns, secs; do { - seq = read_seqbegin(&gtod->lock); + seq = read_raw_seqbegin(&gtod->lock); secs = gtod->wall_time_coarse.tv_sec; ns = gtod->wall_time_coarse.tv_nsec; secs += gtod->wall_to_monotonic.tv_sec; ns += gtod->wall_to_monotonic.tv_nsec; - } while (unlikely(read_seqretry(&gtod->lock, seq))); + } while (unlikely(read_raw_seqretry(&gtod->lock, seq))); vset_normalized_timespec(ts, secs, ns); return 0; } diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h index e39edf5..32c5e28 100644 --- a/arch/xtensa/include/asm/rwsem.h +++ b/arch/xtensa/include/asm/rwsem.h @@ -25,7 +25,7 @@ /* * the semaphore definition */ -struct rw_semaphore { +struct rw_anon_semaphore { signed long count; #define RWSEM_UNLOCKED_VALUE 0x00000000 #define RWSEM_ACTIVE_BIAS 0x00000001 @@ -37,29 +37,37 @@ struct rw_semaphore { struct list_head wait_list; }; -#define __RWSEM_INITIALIZER(name) \ +#define __RWSEM_ANON_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ LIST_HEAD_INIT((name).wait_list) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_ANON_INITIALIZER(name) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); 
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_read_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_write_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore *rwsem_wake(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_downgrade_wake(struct rw_anon_semaphore *sem); -static inline void init_rwsem(struct rw_semaphore *sem) +static inline void init_anon_rwsem(struct rw_anon_semaphore *sem) { sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } +static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) +{ + return (sem->count != 0); +} + /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct rw_anon_semaphore *sem) { if (atomic_add_return(1,(atomic_t *)(&sem->count)) > 0) smp_wmb(); @@ -67,7 +75,7 @@ static inline void __down_read(struct rw_semaphore *sem) rwsem_down_read_failed(sem); } -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct rw_anon_semaphore *sem) { int tmp; @@ -84,7 +92,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write(struct rw_anon_semaphore *sem) { int tmp; @@ -96,7 +104,7 @@ static inline void __down_write(struct rw_semaphore *sem) rwsem_down_write_failed(sem); } -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct rw_anon_semaphore *sem) { int tmp; @@ -109,7 +117,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct rw_anon_semaphore *sem) { int tmp; @@ -122,7 +130,7 @@ static 
inline void __up_read(struct rw_semaphore *sem) /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct rw_anon_semaphore *sem) { smp_wmb(); if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, @@ -133,7 +141,7 @@ static inline void __up_write(struct rw_semaphore *sem) /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct rw_anon_semaphore *sem) { atomic_add(delta, (atomic_t *)(&sem->count)); } @@ -141,7 +149,7 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct rw_anon_semaphore *sem) { int tmp; @@ -154,12 +162,32 @@ static inline void __downgrade_write(struct rw_semaphore *sem) /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct rw_anon_semaphore *sem) { smp_mb(); return atomic_add_return(delta, (atomic_t *)(&sem->count)); } +struct rw_semaphore { + signed long count; + spinlock_t wait_lock; + struct list_head wait_list; +}; + +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ + LIST_HEAD_INIT((name).wait_list) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +static inline void init_rwsem(struct rw_semaphore *sem) +{ + sem->count = RWSEM_UNLOCKED_VALUE; + spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} + static inline int rwsem_is_locked(struct rw_semaphore *sem) { return (sem->count != 0); diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c index 19f7df3..e8184d5 
100644 --- a/arch/xtensa/kernel/time.c +++ b/arch/xtensa/kernel/time.c @@ -101,7 +101,7 @@ again: update_process_times(user_mode(get_irq_regs())); #endif - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); do_timer(1); /* Linux handler in kernel/timer.c */ @@ -110,7 +110,7 @@ again: next += CCOUNT_PER_JIFFY; set_linux_timer(next); - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); } /* Allow platform to do something useful (Wdog). */ diff --git a/block/blk-core.c b/block/blk-core.c index 718897e..800d396 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -202,7 +202,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags); */ void blk_plug_device(struct request_queue *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); /* * don't plug a stopped queue, it must be paired with blk_start_queue() @@ -242,7 +242,7 @@ EXPORT_SYMBOL(blk_plug_device_unlocked); */ int blk_remove_plug(struct request_queue *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) return 0; @@ -334,7 +334,7 @@ EXPORT_SYMBOL(blk_unplug); **/ void blk_start_queue(struct request_queue *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); queue_flag_clear(QUEUE_FLAG_STOPPED, q); __blk_run_queue(q); @@ -1267,7 +1267,7 @@ get_rq: spin_lock_irq(q->queue_lock); if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || bio_flagged(bio, BIO_CPU_AFFINE)) - req->cpu = blk_cpu_to_group(smp_processor_id()); + req->cpu = blk_cpu_to_group(raw_smp_processor_id()); if (queue_should_plug(q) && elv_queue_empty(q)) blk_plug_device(q); add_request(q, req); diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h index 29ba66d..c92eeb0 100644 --- a/drivers/acpi/acpica/acglobal.h +++ b/drivers/acpi/acpica/acglobal.h @@ -201,7 +201,12 @@ ACPI_EXTERN u8 acpi_gbl_global_lock_present; * interrupt level */ ACPI_EXTERN spinlock_t _acpi_gbl_gpe_lock; /* For GPE data structs and registers */ 
-ACPI_EXTERN spinlock_t _acpi_gbl_hardware_lock; /* For ACPI H/W except GPE registers */ + +/* + * Need to be raw because it might be used in acpi_processor_idle(): + */ +ACPI_EXTERN raw_spinlock_t _acpi_gbl_hardware_lock; /* For ACPI H/W except GPE registers */ + #define acpi_gbl_gpe_lock &_acpi_gbl_gpe_lock #define acpi_gbl_hardware_lock &_acpi_gbl_hardware_lock diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c index 15c9ed2..90240ab 100644 --- a/drivers/acpi/acpica/hwregs.c +++ b/drivers/acpi/acpica/hwregs.c @@ -263,7 +263,7 @@ acpi_status acpi_hw_clear_acpi_status(void) ACPI_BITMASK_ALL_FIXED_STATUS, ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address))); - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); /* Clear the fixed events in PM1 A/B */ @@ -278,7 +278,7 @@ acpi_status acpi_hw_clear_acpi_status(void) status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block, NULL); unlock_and_exit: - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); return_ACPI_STATUS(status); } diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c index 647c7b6..2360da1 100644 --- a/drivers/acpi/acpica/hwxface.c +++ b/drivers/acpi/acpica/hwxface.c @@ -386,7 +386,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value) return_ACPI_STATUS(AE_BAD_PARAMETER); } - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); /* * At this point, we know that the parent register is one of the @@ -447,7 +447,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value) unlock_and_exit: - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); return_ACPI_STATUS(status); } diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c index 80bb651..d8699f8 
100644 --- a/drivers/acpi/acpica/utmutex.c +++ b/drivers/acpi/acpica/utmutex.c @@ -84,7 +84,7 @@ acpi_status acpi_ut_mutex_initialize(void) /* Create the spinlocks for use at interrupt level */ spin_lock_init(acpi_gbl_gpe_lock); - spin_lock_init(acpi_gbl_hardware_lock); + raw_spin_lock_init(acpi_gbl_hardware_lock); /* Create the reader/writer lock for namespace access */ @@ -117,11 +117,6 @@ void acpi_ut_mutex_terminate(void) (void)acpi_ut_delete_mutex(i); } - /* Delete the spinlocks */ - - acpi_os_delete_lock(acpi_gbl_gpe_lock); - acpi_os_delete_lock(acpi_gbl_hardware_lock); - /* Delete the reader/writer lock */ acpi_ut_delete_rw_lock(&acpi_gbl_namespace_rw_lock); diff --git a/drivers/acpi/dock.c b/drivers/acpi/dock.c index bbc2c13..b2586f5 100644 --- a/drivers/acpi/dock.c +++ b/drivers/acpi/dock.c @@ -935,6 +935,7 @@ static int dock_add(acpi_handle handle) struct platform_device *dd; id = dock_station_count; + memset(&ds, 0, sizeof(ds)); dd = platform_device_register_data(NULL, "dock", id, &ds, sizeof(ds)); if (IS_ERR(dd)) return PTR_ERR(dd); diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index d6471bb..e497639 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -577,7 +577,19 @@ static u32 acpi_ec_gpe_handler(void *data) advance_transaction(ec, acpi_ec_read_status(ec)); if (ec_transaction_done(ec) && (acpi_ec_read_status(ec) & ACPI_EC_FLAG_IBF) == 0) { +#ifndef CONFIG_PREEMPT_RT wake_up(&ec->wait); +#else + // hack ... 
+ if (waitqueue_active(&ec->wait)) { + struct task_struct *task; + + task = list_entry(ec->wait.task_list.next, + wait_queue_t, task_list)->private; + if (task) + wake_up_process(task); + } +#endif ec_check_sci(ec, acpi_ec_read_status(ec)); } return ACPI_INTERRUPT_HANDLED; diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 7c0441f..3e047b8 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -110,6 +110,14 @@ static struct dmi_system_id __cpuinitdata processor_power_dmi_table[] = { DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), DMI_MATCH(DMI_BIOS_VERSION,"SHE845M0.86C.0013.D.0302131307")}, (void *)2}, + { set_max_cstate, "Pavilion zv5000", { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME,"Pavilion zv5000 (DS502A#ABA)")}, + (void *)1}, + { set_max_cstate, "Asus L8400B", { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), + DMI_MATCH(DMI_PRODUCT_NAME,"L8400B series Notebook PC")}, + (void *)1}, {}, }; @@ -872,12 +880,14 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, return(acpi_idle_enter_c1(dev, state)); local_irq_disable(); - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we test - * NEED_RESCHED: - */ - smp_mb(); + if (cx->entry_method != ACPI_CSTATE_FFH) { + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we test + * NEED_RESCHED: + */ + smp_mb(); + } if (unlikely(need_resched())) { current_thread_info()->status |= TS_POLLING; @@ -917,7 +927,7 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, } static int c3_cpu_count; -static DEFINE_SPINLOCK(c3_lock); +static DEFINE_RAW_SPINLOCK(c3_lock); /** * acpi_idle_enter_bm - enters C3 with proper BM handling @@ -957,12 +967,14 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, } local_irq_disable(); - current_thread_info()->status &= ~TS_POLLING; - /* - * 
TS_POLLING-cleared state must be visible before we test - * NEED_RESCHED: - */ - smp_mb(); + if (cx->entry_method != ACPI_CSTATE_FFH) { + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we test + * NEED_RESCHED: + */ + smp_mb(); + } if (unlikely(need_resched())) { current_thread_info()->status |= TS_POLLING; @@ -992,12 +1004,12 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, * without doing anything. */ if (pr->flags.bm_check && pr->flags.bm_control) { - spin_lock(&c3_lock); + raw_spin_lock(&c3_lock); c3_cpu_count++; /* Disable bus master arbitration when all CPUs are in C3 */ if (c3_cpu_count == num_online_cpus()) acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1); - spin_unlock(&c3_lock); + raw_spin_unlock(&c3_lock); } else if (!pr->flags.bm_check) { ACPI_FLUSH_CPU_CACHE(); } @@ -1006,10 +1018,10 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, /* Re-enable bus master arbitration */ if (pr->flags.bm_check && pr->flags.bm_control) { - spin_lock(&c3_lock); + raw_spin_lock(&c3_lock); acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0); c3_cpu_count--; - spin_unlock(&c3_lock); + raw_spin_unlock(&c3_lock); } kt2 = ktime_get_real(); idle_time = ktime_to_us(ktime_sub(kt2, kt1)); diff --git a/drivers/acpi/processor_pdc.c b/drivers/acpi/processor_pdc.c index 7247819..e306ba9 100644 --- a/drivers/acpi/processor_pdc.c +++ b/drivers/acpi/processor_pdc.c @@ -125,6 +125,8 @@ acpi_processor_eval_pdc(acpi_handle handle, struct acpi_object_list *pdc_in) return status; } +static int early_pdc_done; + void acpi_processor_set_pdc(acpi_handle handle) { struct acpi_object_list *obj_list; @@ -132,6 +134,9 @@ void acpi_processor_set_pdc(acpi_handle handle) if (arch_has_acpi_pdc() == false) return; + if (early_pdc_done) + return; + obj_list = acpi_processor_alloc_pdc(); if (!obj_list) return; @@ -151,6 +156,13 @@ static int set_early_pdc_optin(const struct dmi_system_id *id) return 0; } +static int 
param_early_pdc_optin(char *s) +{ + early_pdc_optin = 1; + return 1; +} +__setup("acpi_early_pdc_eval", param_early_pdc_optin); + static struct dmi_system_id __cpuinitdata early_pdc_optin_table[] = { { set_early_pdc_optin, "HP Envy", { @@ -192,4 +204,6 @@ void __init acpi_early_processor_set_pdc(void) acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX, early_init_pdc, NULL, NULL, NULL); + + early_pdc_done = 1; } diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index 2cabadc..a959f6a 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -413,7 +413,11 @@ static int acpi_processor_get_performance_info(struct acpi_processor *pr) if (result) goto update_bios; - return 0; + /* We need to call _PPC once when cpufreq starts */ + if (ignore_ppc != 1) + result = acpi_processor_get_platform_limit(pr); + + return result; /* * Having _PPC but missing frequencies (_PSS, _PCT) is a very good hint that diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index ff9f622..3e00967 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1336,9 +1336,25 @@ static int acpi_bus_scan(acpi_handle handle, struct acpi_bus_ops *ops, if (child) *child = device; - return 0; + + if (device) + return 0; + else + return -ENODEV; } +/* + * acpi_bus_add and acpi_bus_start + * + * scan a given ACPI tree and (probably recently hot-plugged) + * create and add or starts found devices. + * + * If no devices were found -ENODEV is returned which does not + * mean that this is a real error, there just have been no suitable + * ACPI objects in the table trunk from which the kernel could create + * a device and add/start an appropriate driver. 
+ */ + int acpi_bus_add(struct acpi_device **child, struct acpi_device *parent, acpi_handle handle, int type) @@ -1348,8 +1364,7 @@ acpi_bus_add(struct acpi_device **child, memset(&ops, 0, sizeof(ops)); ops.acpi_op_add = 1; - acpi_bus_scan(handle, &ops, child); - return 0; + return acpi_bus_scan(handle, &ops, child); } EXPORT_SYMBOL(acpi_bus_add); @@ -1357,11 +1372,13 @@ int acpi_bus_start(struct acpi_device *device) { struct acpi_bus_ops ops; + if (!device) + return -EINVAL; + memset(&ops, 0, sizeof(ops)); ops.acpi_op_start = 1; - acpi_bus_scan(device->handle, &ops, NULL); - return 0; + return acpi_bus_scan(device->handle, &ops, NULL); } EXPORT_SYMBOL(acpi_bus_start); diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index f336bca..8a0ed28 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -213,7 +213,7 @@ acpi_table_parse_entries(char *id, unsigned long table_end; acpi_size tbl_size; - if (acpi_disabled) + if (acpi_disabled && !acpi_ht) return -ENODEV; if (!handler) @@ -280,7 +280,7 @@ int __init acpi_table_parse(char *id, acpi_table_handler handler) struct acpi_table_header *table = NULL; acpi_size tbl_size; - if (acpi_disabled) + if (acpi_disabled && !acpi_ht) return -ENODEV; if (!handler) diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index 730ef3c..bc24dfb 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -837,9 +837,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf, unsigned long flags; unsigned int consumed; - local_irq_save(flags); + local_irq_save_nort(flags); consumed = ata_sff_data_xfer(dev, buf, buflen, rw); - local_irq_restore(flags); + local_irq_restore_nort(flags); return consumed; } @@ -878,7 +878,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) unsigned long flags; /* FIXME: use a bounce buffer */ - local_irq_save(flags); + local_irq_save_nort(flags); buf = kmap_atomic(page, KM_IRQ0); /* do the actual data transfer */ @@ -886,7 +886,7 @@ 
static void ata_pio_sector(struct ata_queued_cmd *qc) do_write); kunmap_atomic(buf, KM_IRQ0); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { buf = page_address(page); ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size, @@ -1019,7 +1019,7 @@ next_sg: unsigned long flags; /* FIXME: use bounce buffer */ - local_irq_save(flags); + local_irq_save_nort(flags); buf = kmap_atomic(page, KM_IRQ0); /* do the actual data transfer */ @@ -1027,7 +1027,7 @@ next_sg: count, rw); kunmap_atomic(buf, KM_IRQ0); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { buf = page_address(page); consumed = ap->ops->sff_data_xfer(dev, buf + offset, diff --git a/drivers/base/bus.c b/drivers/base/bus.c index c0c5a43..9d1a0b1 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -173,10 +173,10 @@ static ssize_t driver_unbind(struct device_driver *drv, dev = bus_find_device_by_name(bus, NULL, buf); if (dev && dev->driver == drv) { if (dev->parent) /* Needed for USB */ - down(&dev->parent->sem); + mutex_lock(&dev->parent->mutex); device_release_driver(dev); if (dev->parent) - up(&dev->parent->sem); + mutex_unlock(&dev->parent->mutex); err = count; } put_device(dev); @@ -200,12 +200,12 @@ static ssize_t driver_bind(struct device_driver *drv, dev = bus_find_device_by_name(bus, NULL, buf); if (dev && dev->driver == NULL && driver_match_device(drv, dev)) { if (dev->parent) /* Needed for USB */ - down(&dev->parent->sem); - down(&dev->sem); + mutex_lock(&dev->parent->mutex); + mutex_lock(&dev->mutex); err = driver_probe_device(drv, dev); - up(&dev->sem); + mutex_unlock(&dev->mutex); if (dev->parent) - up(&dev->parent->sem); + mutex_unlock(&dev->parent->mutex); if (err > 0) { /* success */ @@ -744,10 +744,10 @@ static int __must_check bus_rescan_devices_helper(struct device *dev, if (!dev->driver) { if (dev->parent) /* Needed for USB */ - down(&dev->parent->sem); + mutex_lock(&dev->parent->mutex); ret = device_attach(dev); if (dev->parent) - 
up(&dev->parent->sem); + mutex_unlock(&dev->parent->mutex); } return ret < 0 ? ret : 0; } @@ -779,10 +779,10 @@ int device_reprobe(struct device *dev) { if (dev->driver) { if (dev->parent) /* Needed for USB */ - down(&dev->parent->sem); + mutex_lock(&dev->parent->mutex); device_release_driver(dev); if (dev->parent) - up(&dev->parent->sem); + mutex_unlock(&dev->parent->mutex); } return bus_rescan_devices_helper(dev, NULL); } diff --git a/drivers/base/class.c b/drivers/base/class.c index 161746d..6e2c3b0 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -59,6 +59,8 @@ static void class_release(struct kobject *kobj) else pr_debug("class '%s' does not have a release() function, " "be careful\n", class->name); + + kfree(cp); } static struct sysfs_ops class_sysfs_ops = { diff --git a/drivers/base/core.c b/drivers/base/core.c index 2820257..5e2950a 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -20,7 +20,6 @@ #include <linux/notifier.h> #include <linux/genhd.h> #include <linux/kallsyms.h> -#include <linux/semaphore.h> #include <linux/mutex.h> #include <linux/async.h> @@ -564,7 +563,8 @@ void device_initialize(struct device *dev) dev->kobj.kset = devices_kset; kobject_init(&dev->kobj, &device_ktype); INIT_LIST_HEAD(&dev->dma_pools); - init_MUTEX(&dev->sem); + mutex_init(&dev->mutex); + lockdep_set_novalidate_class(&dev->mutex); spin_lock_init(&dev->devres_lock); INIT_LIST_HEAD(&dev->devres_head); device_init_wakeup(dev, 0); diff --git a/drivers/base/dd.c b/drivers/base/dd.c index ee95c76..a68a65b 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -85,7 +85,7 @@ static void driver_sysfs_remove(struct device *dev) * for before calling this. (It is ok to call with no other effort * from a driver's probe() method.) * - * This function must be called with @dev->sem held. + * This function must be called with @dev->mutex held. 
*/ int device_bind_driver(struct device *dev) { @@ -190,8 +190,8 @@ EXPORT_SYMBOL_GPL(wait_for_device_probe); * This function returns -ENODEV if the device is not registered, * 1 if the device is bound successfully and 0 otherwise. * - * This function must be called with @dev->sem held. When called for a - * USB interface, @dev->parent->sem must be held as well. + * This function must be called with @dev->mutex held. When called for a + * USB interface, @dev->parent->mutex must be held as well. */ int driver_probe_device(struct device_driver *drv, struct device *dev) { @@ -233,13 +233,13 @@ static int __device_attach(struct device_driver *drv, void *data) * 0 if no matching driver was found; * -ENODEV if the device is not registered. * - * When called for a USB interface, @dev->parent->sem must be held. + * When called for a USB interface, @dev->parent->mutex must be held. */ int device_attach(struct device *dev) { int ret = 0; - down(&dev->sem); + mutex_lock(&dev->mutex); if (dev->driver) { ret = device_bind_driver(dev); if (ret == 0) @@ -253,7 +253,7 @@ int device_attach(struct device *dev) ret = bus_for_each_drv(dev->bus, NULL, dev, __device_attach); pm_runtime_put_sync(dev); } - up(&dev->sem); + mutex_unlock(&dev->mutex); return ret; } EXPORT_SYMBOL_GPL(device_attach); @@ -276,13 +276,13 @@ static int __driver_attach(struct device *dev, void *data) return 0; if (dev->parent) /* Needed for USB */ - down(&dev->parent->sem); - down(&dev->sem); + mutex_lock(&dev->parent->mutex); + mutex_lock(&dev->mutex); if (!dev->driver) driver_probe_device(drv, dev); - up(&dev->sem); + mutex_unlock(&dev->mutex); if (dev->parent) - up(&dev->parent->sem); + mutex_unlock(&dev->parent->mutex); return 0; } @@ -303,8 +303,8 @@ int driver_attach(struct device_driver *drv) EXPORT_SYMBOL_GPL(driver_attach); /* - * __device_release_driver() must be called with @dev->sem held. - * When called for a USB interface, @dev->parent->sem must be held as well. 
+ * __device_release_driver() must be called with @dev->mutex held. + * When called for a USB interface, @dev->parent->mutex must be held as well. */ static void __device_release_driver(struct device *dev) { @@ -343,7 +343,7 @@ static void __device_release_driver(struct device *dev) * @dev: device. * * Manually detach device from driver. - * When called for a USB interface, @dev->parent->sem must be held. + * When called for a USB interface, @dev->parent->mutex must be held. */ void device_release_driver(struct device *dev) { @@ -352,9 +352,9 @@ void device_release_driver(struct device *dev) * within their ->remove callback for the same device, they * will deadlock right here. */ - down(&dev->sem); + mutex_lock(&dev->mutex); __device_release_driver(dev); - up(&dev->sem); + mutex_unlock(&dev->mutex); } EXPORT_SYMBOL_GPL(device_release_driver); @@ -381,13 +381,13 @@ void driver_detach(struct device_driver *drv) spin_unlock(&drv->p->klist_devices.k_lock); if (dev->parent) /* Needed for USB */ - down(&dev->parent->sem); - down(&dev->sem); + mutex_lock(&dev->parent->mutex); + mutex_lock(&dev->mutex); if (dev->driver == drv) __device_release_driver(dev); - up(&dev->sem); + mutex_unlock(&dev->mutex); if (dev->parent) - up(&dev->parent->sem); + mutex_unlock(&dev->parent->mutex); put_device(dev); } } diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index a5142bd..05848a9 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -34,8 +34,8 @@ * because children are guaranteed to be discovered after parents, and * are inserted at the back of the list on discovery. * - * Since device_pm_add() may be called with a device semaphore held, - * we must never try to acquire a device semaphore while holding + * Since device_pm_add() may be called with a device mutex held, + * we must never try to acquire a device mutex while holding * dpm_list_mutex. 
*/ @@ -476,7 +476,7 @@ static int device_resume(struct device *dev, pm_message_t state) TRACE_DEVICE(dev); TRACE_RESUME(0); - down(&dev->sem); + mutex_lock(&dev->mutex); if (dev->bus) { if (dev->bus->pm) { @@ -509,7 +509,7 @@ static int device_resume(struct device *dev, pm_message_t state) } } End: - up(&dev->sem); + mutex_unlock(&dev->mutex); TRACE_RESUME(error); return error; @@ -564,7 +564,7 @@ static void dpm_resume(pm_message_t state) */ static void device_complete(struct device *dev, pm_message_t state) { - down(&dev->sem); + mutex_lock(&dev->mutex); if (dev->class && dev->class->pm && dev->class->pm->complete) { pm_dev_dbg(dev, state, "completing class "); @@ -581,7 +581,7 @@ static void device_complete(struct device *dev, pm_message_t state) dev->bus->pm->complete(dev); } - up(&dev->sem); + mutex_unlock(&dev->mutex); } /** @@ -740,7 +740,7 @@ static int device_suspend(struct device *dev, pm_message_t state) { int error = 0; - down(&dev->sem); + mutex_lock(&dev->mutex); if (dev->class) { if (dev->class->pm) { @@ -773,7 +773,7 @@ static int device_suspend(struct device *dev, pm_message_t state) } } End: - up(&dev->sem); + mutex_unlock(&dev->mutex); return error; } @@ -828,7 +828,7 @@ static int device_prepare(struct device *dev, pm_message_t state) { int error = 0; - down(&dev->sem); + mutex_lock(&dev->mutex); if (dev->bus && dev->bus->pm && dev->bus->pm->prepare) { pm_dev_dbg(dev, state, "preparing "); @@ -852,7 +852,7 @@ static int device_prepare(struct device *dev, pm_message_t state) suspend_report_result(dev->class->pm->prepare, error); } End: - up(&dev->sem); + mutex_unlock(&dev->mutex); return error; } diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index b61057e..f58e765 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -66,7 +66,7 @@ struct drbd_bitmap { size_t bm_words; size_t bm_number_of_pages; sector_t bm_dev_capacity; - struct semaphore bm_change; /* serializes resize 
operations */ + struct mutex bm_change; /* serializes resize operations */ atomic_t bm_async_io; wait_queue_head_t bm_io_wait; @@ -114,7 +114,7 @@ void drbd_bm_lock(struct drbd_conf *mdev, char *why) return; } - trylock_failed = down_trylock(&b->bm_change); + trylock_failed = !mutex_trylock(&b->bm_change); if (trylock_failed) { dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n", @@ -125,7 +125,7 @@ void drbd_bm_lock(struct drbd_conf *mdev, char *why) b->bm_task == mdev->receiver.task ? "receiver" : b->bm_task == mdev->asender.task ? "asender" : b->bm_task == mdev->worker.task ? "worker" : "?"); - down(&b->bm_change); + mutex_lock(&b->bm_change); } if (__test_and_set_bit(BM_LOCKED, &b->bm_flags)) dev_err(DEV, "FIXME bitmap already locked in bm_lock\n"); @@ -147,7 +147,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev) b->bm_why = NULL; b->bm_task = NULL; - up(&b->bm_change); + mutex_unlock(&b->bm_change); } /* word offset to long pointer */ @@ -295,7 +295,7 @@ int drbd_bm_init(struct drbd_conf *mdev) if (!b) return -ENOMEM; spin_lock_init(&b->bm_lock); - init_MUTEX(&b->bm_change); + mutex_init(&b->bm_change); init_waitqueue_head(&b->bm_io_wait); mdev->bitmap = b; diff --git a/drivers/block/hd.c b/drivers/block/hd.c index d5cdce0..d6efe0c 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -165,12 +165,12 @@ unsigned long read_timer(void) unsigned long t, flags; int i; - spin_lock_irqsave(&i8253_lock, flags); + raw_spin_lock_irqsave(&i8253_lock, flags); t = jiffies * 11932; outb_p(0, 0x43); i = inb_p(0x40); i |= inb(0x40) << 8; - spin_unlock_irqrestore(&i8253_lock, flags); + raw_spin_unlock_irqrestore(&i8253_lock, flags); return(t - i); } #endif diff --git a/drivers/block/paride/pseudo.h b/drivers/block/paride/pseudo.h index bc37032..0fbc78c 100644 --- a/drivers/block/paride/pseudo.h +++ b/drivers/block/paride/pseudo.h @@ -43,7 +43,7 @@ static unsigned long ps_timeout; static int ps_tq_active = 0; static int ps_nice = 0; -static 
DEFINE_SPINLOCK(ps_spinlock __attribute__((unused))); +static __attribute__((unused)) DEFINE_SPINLOCK(ps_spinlock); static DECLARE_DELAYED_WORK(ps_tq, ps_tq_int); diff --git a/drivers/char/bfin_jtag_comm.c b/drivers/char/bfin_jtag_comm.c index 2628c74..6de1f8e 100644 --- a/drivers/char/bfin_jtag_comm.c +++ b/drivers/char/bfin_jtag_comm.c @@ -182,16 +182,16 @@ bfin_jc_circ_write(const unsigned char *buf, int count) } #ifndef CONFIG_BFIN_JTAG_COMM_CONSOLE -# define acquire_console_sem() -# define release_console_sem() +# define acquire_console_mutex() +# define release_console_mutex() #endif static int bfin_jc_write(struct tty_struct *tty, const unsigned char *buf, int count) { int i; - acquire_console_sem(); + acquire_console_mutex(); i = bfin_jc_circ_write(buf, count); - release_console_sem(); + release_console_mutex(); wake_up_process(bfin_jc_kthread); return i; } diff --git a/drivers/char/random.c b/drivers/char/random.c index 2849713..5e54b4b 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -625,8 +625,11 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) preempt_disable(); /* if over the trickle threshold, use only 1 in 4096 samples */ if (input_pool.entropy_count > trickle_thresh && - (__get_cpu_var(trickle_count)++ & 0xfff)) - goto out; + (__get_cpu_var(trickle_count)++ & 0xfff)) { + preempt_enable(); + return; + } + preempt_enable(); sample.jiffies = jiffies; sample.cycles = get_cycles(); @@ -668,8 +671,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) credit_entropy_bits(&input_pool, min_t(int, fls(delta>>1), 11)); } -out: - preempt_enable(); } void add_input_randomness(unsigned int type, unsigned int code, diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c index 95acb8c..c973c0b 100644 --- a/drivers/char/rtc.c +++ b/drivers/char/rtc.c @@ -1195,10 +1195,12 @@ static void rtc_dropped_irq(unsigned long data) spin_unlock_irq(&rtc_lock); +#ifndef CONFIG_PREEMPT_RT if 
(printk_ratelimit()) { printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq); } +#endif /* Now we have new data */ wake_up_interruptible(&rtc_wait); diff --git a/drivers/char/selection.c b/drivers/char/selection.c index f97b9e8..a382302 100644 --- a/drivers/char/selection.c +++ b/drivers/char/selection.c @@ -312,9 +312,9 @@ int paste_selection(struct tty_struct *tty) struct tty_ldisc *ld; DECLARE_WAITQUEUE(wait, current); - acquire_console_sem(); + acquire_console_mutex(); poke_blanked_console(); - release_console_sem(); + release_console_mutex(); ld = tty_ldisc_ref_wait(tty); diff --git a/drivers/char/tty_buffer.c b/drivers/char/tty_buffer.c index 66fa4e1..17dfa02 100644 --- a/drivers/char/tty_buffer.c +++ b/drivers/char/tty_buffer.c @@ -492,10 +492,14 @@ void tty_flip_buffer_push(struct tty_struct *tty) tty->buf.tail->commit = tty->buf.tail->used; spin_unlock_irqrestore(&tty->buf.lock, flags); +#ifndef CONFIG_PREEMPT_RT if (tty->low_latency) flush_to_ldisc(&tty->buf.work.work); else schedule_delayed_work(&tty->buf.work, 1); +#else + flush_to_ldisc(&tty->buf.work.work); +#endif } EXPORT_SYMBOL(tty_flip_buffer_push); diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c index 3f653f7..53b87b5 100644 --- a/drivers/char/tty_ldisc.c +++ b/drivers/char/tty_ldisc.c @@ -71,7 +71,7 @@ static void put_ldisc(struct tty_ldisc *ld) * We really want an "atomic_dec_and_lock_irqsave()", * but we don't have it, so this does it by hand. 
*/ - local_irq_save(flags); + local_irq_save_nort(flags); if (atomic_dec_and_lock(&ld->users, &tty_ldisc_lock)) { struct tty_ldisc_ops *ldo = ld->ops; @@ -82,7 +82,7 @@ static void put_ldisc(struct tty_ldisc *ld) kfree(ld); return; } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /** diff --git a/drivers/char/vc_screen.c b/drivers/char/vc_screen.c index c1791a6..3e92ac2 100644 --- a/drivers/char/vc_screen.c +++ b/drivers/char/vc_screen.c @@ -115,7 +115,7 @@ vcs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) /* Select the proper current console and verify * sanity of the situation under the console lock. */ - acquire_console_sem(); + acquire_console_mutex(); attr = (currcons & 128); currcons = (currcons & 127); @@ -246,9 +246,9 @@ vcs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) * the pagefault handling code may want to call printk(). */ - release_console_sem(); + release_console_mutex(); ret = copy_to_user(buf, con_buf_start, orig_count); - acquire_console_sem(); + acquire_console_mutex(); if (ret) { read += (orig_count - ret); @@ -264,7 +264,7 @@ vcs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (read) ret = read; unlock_out: - release_console_sem(); + release_console_mutex(); mutex_unlock(&con_buf_mtx); return ret; } @@ -289,7 +289,7 @@ vcs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) /* Select the proper current console and verify * sanity of the situation under the console lock. */ - acquire_console_sem(); + acquire_console_mutex(); attr = (currcons & 128); currcons = (currcons & 127); @@ -324,9 +324,9 @@ vcs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) /* Temporarily drop the console lock so that we can read * in the write data from userspace safely. 
*/ - release_console_sem(); + release_console_mutex(); ret = copy_from_user(con_buf, buf, this_round); - acquire_console_sem(); + acquire_console_mutex(); if (ret) { this_round -= ret; @@ -450,7 +450,7 @@ vcs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) ret = written; unlock_out: - release_console_sem(); + release_console_mutex(); mutex_unlock(&con_buf_mtx); diff --git a/drivers/char/vt.c b/drivers/char/vt.c index 50faa1f..2cdf548 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -981,9 +981,9 @@ static int vt_resize(struct tty_struct *tty, struct winsize *ws) struct vc_data *vc = tty->driver_data; int ret; - acquire_console_sem(); + acquire_console_mutex(); ret = vc_do_resize(tty, vc, ws->ws_col, ws->ws_row); - release_console_sem(); + release_console_mutex(); return ret; } @@ -1249,7 +1249,7 @@ static void default_attr(struct vc_data *vc) vc->vc_color = vc->vc_def_color; } -/* console_sem is held */ +/* console_mutex is held */ static void csi_m(struct vc_data *vc) { int i; @@ -1393,7 +1393,7 @@ int mouse_reporting(void) return vc_cons[fg_console].d->vc_report_mouse; } -/* console_sem is held */ +/* console_mutex is held */ static void set_mode(struct vc_data *vc, int on_off) { int i; @@ -1463,7 +1463,7 @@ static void set_mode(struct vc_data *vc, int on_off) } } -/* console_sem is held */ +/* console_mutex is held */ static void setterm_command(struct vc_data *vc) { switch(vc->vc_par[0]) { @@ -1523,7 +1523,7 @@ static void setterm_command(struct vc_data *vc) } } -/* console_sem is held */ +/* console_mutex is held */ static void csi_at(struct vc_data *vc, unsigned int nr) { if (nr > vc->vc_cols - vc->vc_x) @@ -1533,7 +1533,7 @@ static void csi_at(struct vc_data *vc, unsigned int nr) insert_char(vc, nr); } -/* console_sem is held */ +/* console_mutex is held */ static void csi_L(struct vc_data *vc, unsigned int nr) { if (nr > vc->vc_rows - vc->vc_y) @@ -1544,7 +1544,7 @@ static void csi_L(struct vc_data *vc, unsigned int nr) 
vc->vc_need_wrap = 0; } -/* console_sem is held */ +/* console_mutex is held */ static void csi_P(struct vc_data *vc, unsigned int nr) { if (nr > vc->vc_cols - vc->vc_x) @@ -1554,7 +1554,7 @@ static void csi_P(struct vc_data *vc, unsigned int nr) delete_char(vc, nr); } -/* console_sem is held */ +/* console_mutex is held */ static void csi_M(struct vc_data *vc, unsigned int nr) { if (nr > vc->vc_rows - vc->vc_y) @@ -1565,7 +1565,7 @@ static void csi_M(struct vc_data *vc, unsigned int nr) vc->vc_need_wrap = 0; } -/* console_sem is held (except via vc_init->reset_terminal */ +/* console_mutex is held (except via vc_init->reset_terminal */ static void save_cur(struct vc_data *vc) { vc->vc_saved_x = vc->vc_x; @@ -1581,7 +1581,7 @@ static void save_cur(struct vc_data *vc) vc->vc_saved_G1 = vc->vc_G1_charset; } -/* console_sem is held */ +/* console_mutex is held */ static void restore_cur(struct vc_data *vc) { gotoxy(vc, vc->vc_saved_x, vc->vc_saved_y); @@ -1603,7 +1603,7 @@ enum { ESnormal, ESesc, ESsquare, ESgetpars, ESgotpars, ESfunckey, EShash, ESsetG0, ESsetG1, ESpercent, ESignore, ESnonstd, ESpalette }; -/* console_sem is held (except via vc_init()) */ +/* console_mutex is held (except via vc_init()) */ static void reset_terminal(struct vc_data *vc, int do_clear) { vc->vc_top = 0; @@ -1663,7 +1663,7 @@ static void reset_terminal(struct vc_data *vc, int do_clear) csi_J(vc, 2); } -/* console_sem is held */ +/* console_mutex is held */ static void do_con_trol(struct tty_struct *tty, struct vc_data *vc, int c) { /* @@ -2097,7 +2097,7 @@ static int is_double_width(uint32_t ucs) return bisearch(ucs, double_width, ARRAY_SIZE(double_width) - 1); } -/* acquires console_sem */ +/* acquires console_mutex */ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int count) { #ifdef VT_BUF_VRAM_ONLY @@ -2127,11 +2127,11 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co might_sleep(); - acquire_console_sem(); + 
acquire_console_mutex(); vc = tty->driver_data; if (vc == NULL) { printk(KERN_ERR "vt: argh, driver_data is NULL !\n"); - release_console_sem(); + release_console_mutex(); return 0; } @@ -2139,7 +2139,7 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co if (!vc_cons_allocated(currcons)) { /* could this happen? */ printk_once("con_write: tty %d not allocated\n", currcons+1); - release_console_sem(); + release_console_mutex(); return 0; } orig_buf = buf; @@ -2357,7 +2357,7 @@ rescan_last_byte: } FLUSH console_conditional_schedule(); - release_console_sem(); + release_console_mutex(); notify_update(vc); return n; #undef FLUSH @@ -2370,11 +2370,11 @@ rescan_last_byte: * us to do the switches asynchronously (needed when we want * to switch due to a keyboard interrupt). Synchronization * with other console code and prevention of re-entrancy is - * ensured with console_sem. + * ensured with console_mutex. */ static void console_callback(struct work_struct *ignored) { - acquire_console_sem(); + acquire_console_mutex(); if (want_console >= 0) { if (want_console != fg_console && @@ -2404,7 +2404,7 @@ static void console_callback(struct work_struct *ignored) } notify_update(vc_cons[fg_console].d); - release_console_sem(); + release_console_mutex(); } int set_console(int nr) @@ -2575,7 +2575,7 @@ static struct console vt_console_driver = { .write = vt_console_print, .device = vt_console_device, .unblank = unblank_screen, - .flags = CON_PRINTBUFFER, + .flags = CON_PRINTBUFFER | CON_ATOMIC, .index = -1, }; #endif @@ -2585,7 +2585,7 @@ static struct console vt_console_driver = { */ /* - * Generally a bit racy with respect to console_sem(). + * Generally a bit racy with respect to console_mutex(). * * There are some functions which don't need it. 
* @@ -2613,17 +2613,17 @@ int tioclinux(struct tty_struct *tty, unsigned long arg) switch (type) { case TIOCL_SETSEL: - acquire_console_sem(); + acquire_console_mutex(); ret = set_selection((struct tiocl_selection __user *)(p+1), tty); - release_console_sem(); + release_console_mutex(); break; case TIOCL_PASTESEL: ret = paste_selection(tty); break; case TIOCL_UNBLANKSCREEN: - acquire_console_sem(); + acquire_console_mutex(); unblank_screen(); - release_console_sem(); + release_console_mutex(); break; case TIOCL_SELLOADLUT: ret = sel_loadlut(p); @@ -2672,10 +2672,10 @@ int tioclinux(struct tty_struct *tty, unsigned long arg) } break; case TIOCL_BLANKSCREEN: /* until explicitly unblanked, not only poked */ - acquire_console_sem(); + acquire_console_mutex(); ignore_poke = 1; do_blank_screen(0); - release_console_sem(); + release_console_mutex(); break; case TIOCL_BLANKEDSCREEN: ret = console_blanked; @@ -2775,11 +2775,11 @@ static void con_flush_chars(struct tty_struct *tty) return; /* if we race with con_close(), vt may be null */ - acquire_console_sem(); + acquire_console_mutex(); vc = tty->driver_data; if (vc) set_cursor(vc); - release_console_sem(); + release_console_mutex(); } /* @@ -2790,7 +2790,7 @@ static int con_open(struct tty_struct *tty, struct file *filp) unsigned int currcons = tty->index; int ret = 0; - acquire_console_sem(); + acquire_console_mutex(); if (tty->driver_data == NULL) { ret = vc_allocate(currcons); if (ret == 0) { @@ -2798,7 +2798,7 @@ static int con_open(struct tty_struct *tty, struct file *filp) /* Still being freed */ if (vc->vc_tty) { - release_console_sem(); + release_console_mutex(); return -ERESTARTSYS; } tty->driver_data = vc; @@ -2812,11 +2812,11 @@ static int con_open(struct tty_struct *tty, struct file *filp) tty->termios->c_iflag |= IUTF8; else tty->termios->c_iflag &= ~IUTF8; - release_console_sem(); + release_console_mutex(); return ret; } } - release_console_sem(); + release_console_mutex(); return ret; } @@ -2829,9 +2829,9 
@@ static void con_shutdown(struct tty_struct *tty) { struct vc_data *vc = tty->driver_data; BUG_ON(vc == NULL); - acquire_console_sem(); + acquire_console_mutex(); vc->vc_tty = NULL; - release_console_sem(); + release_console_mutex(); tty_shutdown(tty); } @@ -2878,13 +2878,13 @@ static int __init con_init(void) struct vc_data *vc; unsigned int currcons = 0, i; - acquire_console_sem(); + acquire_console_mutex(); if (conswitchp) display_desc = conswitchp->con_startup(); if (!display_desc) { fg_console = 0; - release_console_sem(); + release_console_mutex(); return 0; } @@ -2930,7 +2930,7 @@ static int __init con_init(void) printable = 1; printk("\n"); - release_console_sem(); + release_console_mutex(); #ifdef CONFIG_VT_CONSOLE register_console(&vt_console_driver); @@ -3010,7 +3010,7 @@ static int bind_con_driver(const struct consw *csw, int first, int last, if (!try_module_get(owner)) return -ENODEV; - acquire_console_sem(); + acquire_console_mutex(); /* check if driver is registered */ for (i = 0; i < MAX_NR_CON_DRIVER; i++) { @@ -3095,7 +3095,7 @@ static int bind_con_driver(const struct consw *csw, int first, int last, retval = 0; err: - release_console_sem(); + release_console_mutex(); module_put(owner); return retval; }; @@ -3144,7 +3144,7 @@ int unbind_con_driver(const struct consw *csw, int first, int last, int deflt) if (!try_module_get(owner)) return -ENODEV; - acquire_console_sem(); + acquire_console_mutex(); /* check if driver is registered and if it is unbindable */ for (i = 0; i < MAX_NR_CON_DRIVER; i++) { @@ -3158,7 +3158,7 @@ int unbind_con_driver(const struct consw *csw, int first, int last, int deflt) } if (retval) { - release_console_sem(); + release_console_mutex(); goto err; } @@ -3177,12 +3177,12 @@ int unbind_con_driver(const struct consw *csw, int first, int last, int deflt) } if (retval) { - release_console_sem(); + release_console_mutex(); goto err; } if (!con_is_bound(csw)) { - release_console_sem(); + release_console_mutex(); goto err; } @@ 
-3211,7 +3211,7 @@ int unbind_con_driver(const struct consw *csw, int first, int last, int deflt) if (!con_is_bound(csw)) con_driver->flag &= ~CON_DRIVER_FLAG_INIT; - release_console_sem(); + release_console_mutex(); /* ignore return value, binding should not fail */ bind_con_driver(defcsw, first, last, deflt); err: @@ -3437,7 +3437,7 @@ int register_con_driver(const struct consw *csw, int first, int last) if (!try_module_get(owner)) return -ENODEV; - acquire_console_sem(); + acquire_console_mutex(); for (i = 0; i < MAX_NR_CON_DRIVER; i++) { con_driver = ®istered_con_driver[i]; @@ -3491,7 +3491,7 @@ int register_con_driver(const struct consw *csw, int first, int last) } err: - release_console_sem(); + release_console_mutex(); module_put(owner); return retval; } @@ -3512,7 +3512,7 @@ int unregister_con_driver(const struct consw *csw) { int i, retval = -ENODEV; - acquire_console_sem(); + acquire_console_mutex(); /* cannot unregister a bound driver */ if (con_is_bound(csw)) @@ -3538,7 +3538,7 @@ int unregister_con_driver(const struct consw *csw) } } err: - release_console_sem(); + release_console_mutex(); return retval; } EXPORT_SYMBOL(unregister_con_driver); @@ -3832,9 +3832,9 @@ int con_set_cmap(unsigned char __user *arg) { int rc; - acquire_console_sem(); + acquire_console_mutex(); rc = set_get_cmap (arg,1); - release_console_sem(); + release_console_mutex(); return rc; } @@ -3843,9 +3843,9 @@ int con_get_cmap(unsigned char __user *arg) { int rc; - acquire_console_sem(); + acquire_console_mutex(); rc = set_get_cmap (arg,0); - release_console_sem(); + release_console_mutex(); return rc; } @@ -3892,12 +3892,12 @@ static int con_font_get(struct vc_data *vc, struct console_font_op *op) } else font.data = NULL; - acquire_console_sem(); + acquire_console_mutex(); if (vc->vc_sw->con_font_get) rc = vc->vc_sw->con_font_get(vc, &font); else rc = -ENOSYS; - release_console_sem(); + release_console_mutex(); if (rc) goto out; @@ -3978,12 +3978,12 @@ static int 
con_font_set(struct vc_data *vc, struct console_font_op *op) kfree(font.data); return -EFAULT; } - acquire_console_sem(); + acquire_console_mutex(); if (vc->vc_sw->con_font_set) rc = vc->vc_sw->con_font_set(vc, &font, op->flags); else rc = -ENOSYS; - release_console_sem(); + release_console_mutex(); kfree(font.data); return rc; } @@ -4005,12 +4005,12 @@ static int con_font_default(struct vc_data *vc, struct console_font_op *op) else name[MAX_FONT_NAME - 1] = 0; - acquire_console_sem(); + acquire_console_mutex(); if (vc->vc_sw->con_font_default) rc = vc->vc_sw->con_font_default(vc, &font, s); else rc = -ENOSYS; - release_console_sem(); + release_console_mutex(); if (!rc) { op->width = font.width; op->height = font.height; @@ -4026,7 +4026,7 @@ static int con_font_copy(struct vc_data *vc, struct console_font_op *op) if (vc->vc_mode != KD_TEXT) return -EINVAL; - acquire_console_sem(); + acquire_console_mutex(); if (!vc->vc_sw->con_font_copy) rc = -ENOSYS; else if (con < 0 || !vc_cons_allocated(con)) @@ -4035,7 +4035,7 @@ static int con_font_copy(struct vc_data *vc, struct console_font_op *op) rc = 0; else rc = vc->vc_sw->con_font_copy(vc, con); - release_console_sem(); + release_console_mutex(); return rc; } diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c index 6aa1028..26a7300 100644 --- a/drivers/char/vt_ioctl.c +++ b/drivers/char/vt_ioctl.c @@ -648,12 +648,12 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, /* * explicitly blank/unblank the screen if switching modes */ - acquire_console_sem(); + acquire_console_mutex(); if (arg == KD_TEXT) do_unblank_screen(1); else do_blank_screen(1); - release_console_sem(); + release_console_mutex(); break; case KDGETMODE: @@ -892,7 +892,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, ret = -EINVAL; goto out; } - acquire_console_sem(); + acquire_console_mutex(); vc->vt_mode = tmp; /* the frsig is ignored, so we set it to 0 */ vc->vt_mode.frsig = 0; @@ -900,7 +900,7 @@ int vt_ioctl(struct 
tty_struct *tty, struct file * file, vc->vt_pid = get_pid(task_pid(current)); /* no switch is required -- saw@shade.msu.ru */ vc->vt_newvt = -1; - release_console_sem(); + release_console_mutex(); break; } @@ -909,9 +909,9 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, struct vt_mode tmp; int rc; - acquire_console_sem(); + acquire_console_mutex(); memcpy(&tmp, &vc->vt_mode, sizeof(struct vt_mode)); - release_console_sem(); + release_console_mutex(); rc = copy_to_user(up, &tmp, sizeof(struct vt_mode)); if (rc) @@ -964,9 +964,9 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, ret = -ENXIO; else { arg--; - acquire_console_sem(); + acquire_console_mutex(); ret = vc_allocate(arg); - release_console_sem(); + release_console_mutex(); if (ret) break; set_console(arg); @@ -989,7 +989,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, ret = -ENXIO; else { vsa.console--; - acquire_console_sem(); + acquire_console_mutex(); ret = vc_allocate(vsa.console); if (ret == 0) { struct vc_data *nvc; @@ -1002,7 +1002,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, put_pid(nvc->vt_pid); nvc->vt_pid = get_pid(task_pid(current)); } - release_console_sem(); + release_console_mutex(); if (ret) break; /* Commence switch and lock */ @@ -1043,7 +1043,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, /* * Switching-from response */ - acquire_console_sem(); + acquire_console_mutex(); if (vc->vt_newvt >= 0) { if (arg == 0) /* @@ -1062,7 +1062,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, vc->vt_newvt = -1; ret = vc_allocate(newvt); if (ret) { - release_console_sem(); + release_console_mutex(); break; } /* @@ -1082,7 +1082,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, if (arg != VT_ACKACQ) ret = -EINVAL; } - release_console_sem(); + release_console_mutex(); break; /* @@ -1095,20 +1095,20 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, } if (arg == 0) { /* deallocate all unused consoles, but 
leave 0 */ - acquire_console_sem(); + acquire_console_mutex(); for (i=1; i<MAX_NR_CONSOLES; i++) if (! VT_BUSY(i)) vc_deallocate(i); - release_console_sem(); + release_console_mutex(); } else { /* deallocate a single console, if possible */ arg--; if (VT_BUSY(arg)) ret = -EBUSY; else if (arg) { /* leave 0 */ - acquire_console_sem(); + acquire_console_mutex(); vc_deallocate(arg); - release_console_sem(); + release_console_mutex(); } } break; @@ -1125,7 +1125,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, get_user(cc, &vtsizes->v_cols)) ret = -EFAULT; else { - acquire_console_sem(); + acquire_console_mutex(); for (i = 0; i < MAX_NR_CONSOLES; i++) { vc = vc_cons[i].d; @@ -1134,7 +1134,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, vc_resize(vc_cons[i].d, cc, ll); } } - release_console_sem(); + release_console_mutex(); } break; } @@ -1186,14 +1186,14 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, for (i = 0; i < MAX_NR_CONSOLES; i++) { if (!vc_cons[i].d) continue; - acquire_console_sem(); + acquire_console_mutex(); if (vlin) vc_cons[i].d->vc_scan_lines = vlin; if (clin) vc_cons[i].d->vc_font.height = clin; vc_cons[i].d->vc_resize_user = 1; vc_resize(vc_cons[i].d, cc, ll); - release_console_sem(); + release_console_mutex(); } break; } @@ -1364,7 +1364,7 @@ void vc_SAK(struct work_struct *work) struct vc_data *vc; struct tty_struct *tty; - acquire_console_sem(); + acquire_console_mutex(); vc = vc_con->d; if (vc) { tty = vc->vc_tty; @@ -1376,7 +1376,7 @@ void vc_SAK(struct work_struct *work) __do_SAK(tty); reset_vc(vc); } - release_console_sem(); + release_console_mutex(); } #ifdef CONFIG_COMPAT @@ -1734,10 +1734,10 @@ int vt_move_to_console(unsigned int vt, int alloc) { int prev; - acquire_console_sem(); + acquire_console_mutex(); /* Graphics mode - up to X */ if (disable_vt_switch) { - release_console_sem(); + release_console_mutex(); return 0; } prev = fg_console; @@ -1745,7 +1745,7 @@ int vt_move_to_console(unsigned int vt, 
int alloc) if (alloc && vc_allocate(vt)) { /* we can't have a free VC for now. Too bad, * we don't want to mess the screen for now. */ - release_console_sem(); + release_console_mutex(); return -ENOSPC; } @@ -1755,10 +1755,10 @@ int vt_move_to_console(unsigned int vt, int alloc) * Let the calling function know so it can decide * what to do. */ - release_console_sem(); + release_console_mutex(); return -EIO; } - release_console_sem(); + release_console_mutex(); if (vt_waitactive(vt + 1)) { pr_debug("Suspend: Can't switch VCs."); return -EINTR; @@ -1775,8 +1775,8 @@ int vt_move_to_console(unsigned int vt, int alloc) */ void pm_set_vt_switch(int do_switch) { - acquire_console_sem(); + acquire_console_mutex(); disable_vt_switch = !do_switch; - release_console_sem(); + release_console_mutex(); } EXPORT_SYMBOL(pm_set_vt_switch); diff --git a/drivers/clocksource/cs5535-clockevt.c b/drivers/clocksource/cs5535-clockevt.c index 27d20fa..b314a99 100644 --- a/drivers/clocksource/cs5535-clockevt.c +++ b/drivers/clocksource/cs5535-clockevt.c @@ -21,7 +21,7 @@ #define DRV_NAME "cs5535-clockevt" -static int timer_irq = CONFIG_CS5535_MFGPT_DEFAULT_IRQ; +static int timer_irq; module_param_named(irq, timer_irq, int, 0644); MODULE_PARM_DESC(irq, "Which IRQ to use for the clock source MFGPT ticks."); diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 9d0dfcb..b090e6e 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -762,9 +762,9 @@ static int update_unit(struct device *dev, void *data) struct fw_driver *driver = (struct fw_driver *)dev->driver; if (is_fw_unit(dev) && driver != NULL && driver->update != NULL) { - down(&dev->sem); + mutex_lock(&dev->mutex); driver->update(unit); - up(&dev->sem); + mutex_unlock(&dev->mutex); } return 0; diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c index cbaf420..2d3dc7d 100644 --- a/drivers/firewire/net.c +++ b/drivers/firewire/net.c @@ -893,20 +893,31 @@ static void 
fwnet_receive_broadcast(struct fw_iso_context *context, static struct kmem_cache *fwnet_packet_task_cache; +static void fwnet_free_ptask(struct fwnet_packet_task *ptask) +{ + dev_kfree_skb_any(ptask->skb); + kmem_cache_free(fwnet_packet_task_cache, ptask); +} + static int fwnet_send_packet(struct fwnet_packet_task *ptask); static void fwnet_transmit_packet_done(struct fwnet_packet_task *ptask) { - struct fwnet_device *dev; + struct fwnet_device *dev = ptask->dev; unsigned long flags; - - dev = ptask->dev; + bool free; spin_lock_irqsave(&dev->lock, flags); - list_del(&ptask->pt_link); - spin_unlock_irqrestore(&dev->lock, flags); - ptask->outstanding_pkts--; /* FIXME access inside lock */ + ptask->outstanding_pkts--; + + /* Check whether we or the networking TX soft-IRQ is last user. */ + free = (ptask->outstanding_pkts == 0 && !list_empty(&ptask->pt_link)); + + if (ptask->outstanding_pkts == 0) + list_del(&ptask->pt_link); + + spin_unlock_irqrestore(&dev->lock, flags); if (ptask->outstanding_pkts > 0) { u16 dg_size; @@ -951,10 +962,10 @@ static void fwnet_transmit_packet_done(struct fwnet_packet_task *ptask) ptask->max_payload = skb->len + RFC2374_FRAG_HDR_SIZE; } fwnet_send_packet(ptask); - } else { - dev_kfree_skb_any(ptask->skb); - kmem_cache_free(fwnet_packet_task_cache, ptask); } + + if (free) + fwnet_free_ptask(ptask); } static void fwnet_write_complete(struct fw_card *card, int rcode, @@ -977,6 +988,7 @@ static int fwnet_send_packet(struct fwnet_packet_task *ptask) unsigned tx_len; struct rfc2734_header *bufhdr; unsigned long flags; + bool free; dev = ptask->dev; tx_len = ptask->max_payload; @@ -1022,12 +1034,16 @@ static int fwnet_send_packet(struct fwnet_packet_task *ptask) generation, SCODE_100, 0ULL, ptask->skb->data, tx_len + 8, fwnet_write_complete, ptask); - /* FIXME race? */ spin_lock_irqsave(&dev->lock, flags); - list_add_tail(&ptask->pt_link, &dev->broadcasted_list); + + /* If the AT tasklet already ran, we may be last user. 
*/ + free = (ptask->outstanding_pkts == 0 && list_empty(&ptask->pt_link)); + if (!free) + list_add_tail(&ptask->pt_link, &dev->broadcasted_list); + spin_unlock_irqrestore(&dev->lock, flags); - return 0; + goto out; } fw_send_request(dev->card, &ptask->transaction, @@ -1035,12 +1051,19 @@ static int fwnet_send_packet(struct fwnet_packet_task *ptask) ptask->generation, ptask->speed, ptask->fifo_addr, ptask->skb->data, tx_len, fwnet_write_complete, ptask); - /* FIXME race? */ spin_lock_irqsave(&dev->lock, flags); - list_add_tail(&ptask->pt_link, &dev->sent_list); + + /* If the AT tasklet already ran, we may be last user. */ + free = (ptask->outstanding_pkts == 0 && list_empty(&ptask->pt_link)); + if (!free) + list_add_tail(&ptask->pt_link, &dev->sent_list); + spin_unlock_irqrestore(&dev->lock, flags); dev->netdev->trans_start = jiffies; + out: + if (free) + fwnet_free_ptask(ptask); return 0; } @@ -1298,6 +1321,8 @@ static netdev_tx_t fwnet_tx(struct sk_buff *skb, struct net_device *net) spin_unlock_irqrestore(&dev->lock, flags); ptask->max_payload = max_payload; + INIT_LIST_HEAD(&ptask->pt_link); + fwnet_send_packet(ptask); return NETDEV_TX_OK; diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 2345d41..43ebf33 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -2101,11 +2101,6 @@ static int ohci_queue_iso_transmit(struct fw_iso_context *base, u32 payload_index, payload_end_index, next_page_index; int page, end_page, i, length, offset; - /* - * FIXME: Cycle lost behavior should be configurable: lose - * packet, retransmit or terminate.. - */ - p = packet; payload_index = payload; @@ -2135,6 +2130,14 @@ static int ohci_queue_iso_transmit(struct fw_iso_context *base, if (!p->skip) { d[0].control = cpu_to_le16(DESCRIPTOR_KEY_IMMEDIATE); d[0].req_count = cpu_to_le16(8); + /* + * Link the skip address to this descriptor itself. 
This causes + * a context to skip a cycle whenever lost cycles or FIFO + * overruns occur, without dropping the data. The application + * should then decide whether this is an error condition or not. + * FIXME: Make the context's cycle-lost behaviour configurable? + */ + d[0].branch_address = cpu_to_le32(d_bus | z); header = (__le32 *) &d[1]; header[0] = cpu_to_le32(IT_HEADER_SY(p->sy) | diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index f665b05..ab6c973 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -598,6 +598,50 @@ struct drm_display_mode *drm_mode_std(struct drm_device *dev, return mode; } +/* + * EDID is delightfully ambiguous about how interlaced modes are to be + * encoded. Our internal representation is of frame height, but some + * HDTV detailed timings are encoded as field height. + * + * The format list here is from CEA, in frame size. Technically we + * should be checking refresh rate too. Whatever. + */ +static void +drm_mode_do_interlace_quirk(struct drm_display_mode *mode, + struct detailed_pixel_timing *pt) +{ + int i; + static const struct { + int w, h; + } cea_interlaced[] = { + { 1920, 1080 }, + { 720, 480 }, + { 1440, 480 }, + { 2880, 480 }, + { 720, 576 }, + { 1440, 576 }, + { 2880, 576 }, + }; + static const int n_sizes = + sizeof(cea_interlaced)/sizeof(cea_interlaced[0]); + + if (!(pt->misc & DRM_EDID_PT_INTERLACED)) + return; + + for (i = 0; i < n_sizes; i++) { + if ((mode->hdisplay == cea_interlaced[i].w) && + (mode->vdisplay == cea_interlaced[i].h / 2)) { + mode->vdisplay *= 2; + mode->vsync_start *= 2; + mode->vsync_end *= 2; + mode->vtotal *= 2; + mode->vtotal |= 1; + } + } + + mode->flags |= DRM_MODE_FLAG_INTERLACE; +} + /** * drm_mode_detailed - create a new mode from an EDID detailed timing section * @dev: DRM device (needed to create new mode) @@ -680,8 +724,7 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev, drm_mode_set_name(mode); - if (pt->misc & 
DRM_EDID_PT_INTERLACED) - mode->flags |= DRM_MODE_FLAG_INTERLACE; + drm_mode_do_interlace_quirk(mode, pt); if (quirks & EDID_QUIRK_DETAILED_SYNC_PP) { pt->misc |= DRM_EDID_PT_HSYNC_POSITIVE | DRM_EDID_PT_VSYNC_POSITIVE; diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c index cdec329..2ac074c 100644 --- a/drivers/gpu/drm/drm_mm.c +++ b/drivers/gpu/drm/drm_mm.c @@ -405,7 +405,8 @@ struct drm_mm_node *drm_mm_search_free_in_range(const struct drm_mm *mm, wasted += alignment - tmp; } - if (entry->size >= size + wasted) { + if (entry->size >= size + wasted && + (entry->start + wasted + size) <= end) { if (!best_match) return entry; if (entry->size < best_size) { diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 79beffc..cf4cb3e 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -176,6 +176,8 @@ MODULE_DEVICE_TABLE(pci, pciidlist); static int i915_drm_freeze(struct drm_device *dev) { + struct drm_i915_private *dev_priv = dev->dev_private; + pci_save_state(dev->pdev); /* If KMS is active, we do the leavevt stuff here */ @@ -191,17 +193,12 @@ static int i915_drm_freeze(struct drm_device *dev) i915_save_state(dev); - return 0; -} - -static void i915_drm_suspend(struct drm_device *dev) -{ - struct drm_i915_private *dev_priv = dev->dev_private; - intel_opregion_free(dev, 1); /* Modeset on resume, not lid events */ dev_priv->modeset_on_lid = 0; + + return 0; } static int i915_suspend(struct drm_device *dev, pm_message_t state) @@ -221,8 +218,6 @@ static int i915_suspend(struct drm_device *dev, pm_message_t state) if (error) return error; - i915_drm_suspend(dev); - if (state.event == PM_EVENT_SUSPEND) { /* Shut down the device */ pci_disable_device(dev->pdev); @@ -237,6 +232,10 @@ static int i915_drm_thaw(struct drm_device *dev) struct drm_i915_private *dev_priv = dev->dev_private; int error = 0; + i915_restore_state(dev); + + intel_opregion_init(dev, 1); + /* KMS EnterVT equivalent */ if 
(drm_core_check_feature(dev, DRIVER_MODESET)) { mutex_lock(&dev->struct_mutex); @@ -263,10 +262,6 @@ static int i915_resume(struct drm_device *dev) pci_set_master(dev->pdev); - i915_restore_state(dev); - - intel_opregion_init(dev, 1); - return i915_drm_thaw(dev); } @@ -423,8 +418,6 @@ static int i915_pm_suspend(struct device *dev) if (error) return error; - i915_drm_suspend(drm_dev); - pci_disable_device(pdev); pci_set_power_state(pdev, PCI_D3hot); @@ -464,13 +457,8 @@ static int i915_pm_poweroff(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct drm_device *drm_dev = pci_get_drvdata(pdev); - int error; - - error = i915_drm_freeze(drm_dev); - if (!error) - i915_drm_suspend(drm_dev); - return error; + return i915_drm_freeze(drm_dev); } const struct dev_pm_ops i915_pm_ops = { diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index b1d0acb..c2e8a45 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -636,6 +636,13 @@ static const struct dmi_system_id bad_lid_status[] = { DMI_MATCH(DMI_PRODUCT_NAME, "PC-81005"), }, }, + { + .ident = "Clevo M5x0N", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "CLEVO Co."), + DMI_MATCH(DMI_BOARD_NAME, "M5x0N"), + }, + }, { } }; diff --git a/drivers/gpu/drm/nouveau/nouveau_bios.c b/drivers/gpu/drm/nouveau/nouveau_bios.c index 2cd0fad..0e9cd1d 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bios.c +++ b/drivers/gpu/drm/nouveau/nouveau_bios.c @@ -5861,13 +5861,12 @@ nouveau_bios_run_init_table(struct drm_device *dev, uint16_t table, struct drm_nouveau_private *dev_priv = dev->dev_private; struct nvbios *bios = &dev_priv->VBIOS; struct init_exec iexec = { true, false }; - unsigned long flags; - spin_lock_irqsave(&bios->lock, flags); + mutex_lock(&bios->lock); bios->display.output = dcbent; parse_init_table(bios, table, &iexec); bios->display.output = NULL; - spin_unlock_irqrestore(&bios->lock, flags); + mutex_unlock(&bios->lock); } static bool 
NVInitVBIOS(struct drm_device *dev) @@ -5876,7 +5875,7 @@ static bool NVInitVBIOS(struct drm_device *dev) struct nvbios *bios = &dev_priv->VBIOS; memset(bios, 0, sizeof(struct nvbios)); - spin_lock_init(&bios->lock); + mutex_init(&bios->lock); bios->dev = dev; if (!NVShadowVBIOS(dev, bios->data)) diff --git a/drivers/gpu/drm/nouveau/nouveau_bios.h b/drivers/gpu/drm/nouveau/nouveau_bios.h index 68446fd..fd94bd6 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bios.h +++ b/drivers/gpu/drm/nouveau/nouveau_bios.h @@ -205,7 +205,7 @@ struct nvbios { struct drm_device *dev; struct nouveau_bios_info pub; - spinlock_t lock; + struct mutex lock; uint8_t data[NV_PROM_SIZE]; unsigned int length; diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.c b/drivers/gpu/drm/nouveau/nouveau_drv.c index da3b93b..4ea27a4 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drv.c +++ b/drivers/gpu/drm/nouveau/nouveau_drv.c @@ -219,9 +219,9 @@ nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t pm_state) pci_set_power_state(pdev, PCI_D3hot); } - acquire_console_sem(); + acquire_console_mutex(); fb_set_suspend(dev_priv->fbdev_info, 1); - release_console_sem(); + release_console_mutex(); dev_priv->fbdev_info->flags = fbdev_flags; return 0; @@ -321,9 +321,9 @@ nouveau_pci_resume(struct pci_dev *pdev) nv_crtc->lut.depth = 0; } - acquire_console_sem(); + acquire_console_mutex(); fb_set_suspend(dev_priv->fbdev_info, 0); - release_console_sem(); + release_console_mutex(); nouveau_fbcon_zfill(dev); diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h index 5445cef..1c15ef3 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drv.h +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h @@ -583,6 +583,7 @@ struct drm_nouveau_private { uint64_t vm_end; struct nouveau_gpuobj *vm_vram_pt[NV50_VM_VRAM_NR]; int vm_vram_pt_nr; + uint64_t vram_sys_base; /* the mtrr covering the FB */ int fb_mtrr; diff --git a/drivers/gpu/drm/nouveau/nouveau_mem.c b/drivers/gpu/drm/nouveau/nouveau_mem.c index 
8f3a12f..2dc09db 100644 --- a/drivers/gpu/drm/nouveau/nouveau_mem.c +++ b/drivers/gpu/drm/nouveau/nouveau_mem.c @@ -285,53 +285,50 @@ nv50_mem_vm_bind_linear(struct drm_device *dev, uint64_t virt, uint32_t size, uint32_t flags, uint64_t phys) { struct drm_nouveau_private *dev_priv = dev->dev_private; - struct nouveau_gpuobj **pgt; - unsigned psz, pfl, pages; - - if (virt >= dev_priv->vm_gart_base && - (virt + size) < (dev_priv->vm_gart_base + dev_priv->vm_gart_size)) { - psz = 12; - pgt = &dev_priv->gart_info.sg_ctxdma; - pfl = 0x21; - virt -= dev_priv->vm_gart_base; - } else - if (virt >= dev_priv->vm_vram_base && - (virt + size) < (dev_priv->vm_vram_base + dev_priv->vm_vram_size)) { - psz = 16; - pgt = dev_priv->vm_vram_pt; - pfl = 0x01; - virt -= dev_priv->vm_vram_base; - } else { - NV_ERROR(dev, "Invalid address: 0x%16llx-0x%16llx\n", - virt, virt + size - 1); - return -EINVAL; - } + struct nouveau_gpuobj *pgt; + unsigned block; + int i; - pages = size >> psz; + virt = ((virt - dev_priv->vm_vram_base) >> 16) << 1; + size = (size >> 16) << 1; + + phys |= ((uint64_t)flags << 32); + phys |= 1; + if (dev_priv->vram_sys_base) { + phys += dev_priv->vram_sys_base; + phys |= 0x30; + } dev_priv->engine.instmem.prepare_access(dev, true); - if (flags & 0x80000000) { - while (pages--) { - struct nouveau_gpuobj *pt = pgt[virt >> 29]; - unsigned pte = ((virt & 0x1fffffffULL) >> psz) << 1; + while (size) { + unsigned offset_h = upper_32_bits(phys); + unsigned offset_l = lower_32_bits(phys); + unsigned pte, end; + + for (i = 7; i >= 0; i--) { + block = 1 << (i + 1); + if (size >= block && !(virt & (block - 1))) + break; + } + offset_l |= (i << 7); - nv_wo32(dev, pt, pte++, 0x00000000); - nv_wo32(dev, pt, pte++, 0x00000000); + phys += block << 15; + size -= block; - virt += (1 << psz); - } - } else { - while (pages--) { - struct nouveau_gpuobj *pt = pgt[virt >> 29]; - unsigned pte = ((virt & 0x1fffffffULL) >> psz) << 1; - unsigned offset_h = upper_32_bits(phys) & 0xff; - 
unsigned offset_l = lower_32_bits(phys); + while (block) { + pgt = dev_priv->vm_vram_pt[virt >> 14]; + pte = virt & 0x3ffe; - nv_wo32(dev, pt, pte++, offset_l | pfl); - nv_wo32(dev, pt, pte++, offset_h | flags); + end = pte + block; + if (end > 16384) + end = 16384; + block -= (end - pte); + virt += (end - pte); - phys += (1 << psz); - virt += (1 << psz); + while (pte < end) { + nv_wo32(dev, pgt, pte++, offset_l); + nv_wo32(dev, pgt, pte++, offset_h); + } } } dev_priv->engine.instmem.finish_access(dev); @@ -356,7 +353,41 @@ nv50_mem_vm_bind_linear(struct drm_device *dev, uint64_t virt, uint32_t size, void nv50_mem_vm_unbind(struct drm_device *dev, uint64_t virt, uint32_t size) { - nv50_mem_vm_bind_linear(dev, virt, size, 0x80000000, 0); + struct drm_nouveau_private *dev_priv = dev->dev_private; + struct nouveau_gpuobj *pgt; + unsigned pages, pte, end; + + virt -= dev_priv->vm_vram_base; + pages = (size >> 16) << 1; + + dev_priv->engine.instmem.prepare_access(dev, true); + while (pages) { + pgt = dev_priv->vm_vram_pt[virt >> 29]; + pte = (virt & 0x1ffe0000ULL) >> 15; + + end = pte + pages; + if (end > 16384) + end = 16384; + pages -= (end - pte); + virt += (end - pte) << 15; + + while (pte < end) + nv_wo32(dev, pgt, pte++, 0); + } + dev_priv->engine.instmem.finish_access(dev); + + nv_wr32(dev, 0x100c80, 0x00050001); + if (!nv_wait(0x100c80, 0x00000001, 0x00000000)) { + NV_ERROR(dev, "timeout: (0x100c80 & 1) == 0 (2)\n"); + NV_ERROR(dev, "0x100c80 = 0x%08x\n", nv_rd32(dev, 0x100c80)); + return; + } + + nv_wr32(dev, 0x100c80, 0x00000001); + if (!nv_wait(0x100c80, 0x00000001, 0x00000000)) { + NV_ERROR(dev, "timeout: (0x100c80 & 1) == 0 (2)\n"); + NV_ERROR(dev, "0x100c80 = 0x%08x\n", nv_rd32(dev, 0x100c80)); + } } /* diff --git a/drivers/gpu/drm/nouveau/nv04_dac.c b/drivers/gpu/drm/nouveau/nv04_dac.c index d0e038d..1d73b15 100644 --- a/drivers/gpu/drm/nouveau/nv04_dac.c +++ b/drivers/gpu/drm/nouveau/nv04_dac.c @@ -119,7 +119,7 @@ static enum drm_connector_status 
nv04_dac_detect(struct drm_encoder *encoder, struct drm_connector *connector) { struct drm_device *dev = encoder->dev; - uint8_t saved_seq1, saved_pi, saved_rpc1; + uint8_t saved_seq1, saved_pi, saved_rpc1, saved_cr_mode; uint8_t saved_palette0[3], saved_palette_mask; uint32_t saved_rtest_ctrl, saved_rgen_ctrl; int i; @@ -135,6 +135,9 @@ static enum drm_connector_status nv04_dac_detect(struct drm_encoder *encoder, /* only implemented for head A for now */ NVSetOwner(dev, 0); + saved_cr_mode = NVReadVgaCrtc(dev, 0, NV_CIO_CR_MODE_INDEX); + NVWriteVgaCrtc(dev, 0, NV_CIO_CR_MODE_INDEX, saved_cr_mode | 0x80); + saved_seq1 = NVReadVgaSeq(dev, 0, NV_VIO_SR_CLOCK_INDEX); NVWriteVgaSeq(dev, 0, NV_VIO_SR_CLOCK_INDEX, saved_seq1 & ~0x20); @@ -203,6 +206,7 @@ out: NVWriteVgaCrtc(dev, 0, NV_CIO_CRE_PIXEL_INDEX, saved_pi); NVWriteVgaCrtc(dev, 0, NV_CIO_CRE_RPC1_INDEX, saved_rpc1); NVWriteVgaSeq(dev, 0, NV_VIO_SR_CLOCK_INDEX, saved_seq1); + NVWriteVgaCrtc(dev, 0, NV_CIO_CR_MODE_INDEX, saved_cr_mode); if (blue == 0x18) { NV_INFO(dev, "Load detected on head A\n"); diff --git a/drivers/gpu/drm/nouveau/nv17_tv.c b/drivers/gpu/drm/nouveau/nv17_tv.c index 58b917c..21ac6e4 100644 --- a/drivers/gpu/drm/nouveau/nv17_tv.c +++ b/drivers/gpu/drm/nouveau/nv17_tv.c @@ -579,6 +579,8 @@ static void nv17_tv_restore(struct drm_encoder *encoder) nouveau_encoder(encoder)->restore.output); nv17_tv_state_load(dev, &to_tv_enc(encoder)->saved_state); + + nouveau_encoder(encoder)->last_dpms = NV_DPMS_CLEARED; } static int nv17_tv_create_resources(struct drm_encoder *encoder, diff --git a/drivers/gpu/drm/nouveau/nv50_instmem.c b/drivers/gpu/drm/nouveau/nv50_instmem.c index 94400f7..f0dc4e3 100644 --- a/drivers/gpu/drm/nouveau/nv50_instmem.c +++ b/drivers/gpu/drm/nouveau/nv50_instmem.c @@ -76,6 +76,11 @@ nv50_instmem_init(struct drm_device *dev) for (i = 0x1700; i <= 0x1710; i += 4) priv->save1700[(i-0x1700)/4] = nv_rd32(dev, i); + if (dev_priv->chipset == 0xaa || dev_priv->chipset == 0xac) + 
dev_priv->vram_sys_base = nv_rd32(dev, 0x100e10) << 12; + else + dev_priv->vram_sys_base = 0; + /* Reserve the last MiB of VRAM, we should probably try to avoid * setting up the below tables over the top of the VBIOS image at * some point. @@ -172,16 +177,28 @@ nv50_instmem_init(struct drm_device *dev) * We map the entire fake channel into the start of the PRAMIN BAR */ ret = nouveau_gpuobj_new_ref(dev, chan, NULL, 0, pt_size, 0x1000, - 0, &priv->pramin_pt); + 0, &priv->pramin_pt); if (ret) return ret; - for (i = 0, v = c_offset; i < pt_size; i += 8, v += 0x1000) { - if (v < (c_offset + c_size)) - BAR0_WI32(priv->pramin_pt->gpuobj, i + 0, v | 1); - else - BAR0_WI32(priv->pramin_pt->gpuobj, i + 0, 0x00000009); + v = c_offset | 1; + if (dev_priv->vram_sys_base) { + v += dev_priv->vram_sys_base; + v |= 0x30; + } + + i = 0; + while (v < dev_priv->vram_sys_base + c_offset + c_size) { + BAR0_WI32(priv->pramin_pt->gpuobj, i + 0, v); + BAR0_WI32(priv->pramin_pt->gpuobj, i + 4, 0x00000000); + v += 0x1000; + i += 8; + } + + while (i < pt_size) { + BAR0_WI32(priv->pramin_pt->gpuobj, i + 0, 0x00000000); BAR0_WI32(priv->pramin_pt->gpuobj, i + 4, 0x00000000); + i += 8; } BAR0_WI32(chan->vm_pd, 0x00, priv->pramin_pt->instance | 0x63); @@ -416,7 +433,9 @@ nv50_instmem_bind(struct drm_device *dev, struct nouveau_gpuobj *gpuobj) { struct drm_nouveau_private *dev_priv = dev->dev_private; struct nv50_instmem_priv *priv = dev_priv->engine.instmem.priv; - uint32_t pte, pte_end, vram; + struct nouveau_gpuobj *pramin_pt = priv->pramin_pt->gpuobj; + uint32_t pte, pte_end; + uint64_t vram; if (!gpuobj->im_backing || !gpuobj->im_pramin || gpuobj->im_bound) return -EINVAL; @@ -424,20 +443,24 @@ nv50_instmem_bind(struct drm_device *dev, struct nouveau_gpuobj *gpuobj) NV_DEBUG(dev, "st=0x%0llx sz=0x%0llx\n", gpuobj->im_pramin->start, gpuobj->im_pramin->size); - pte = (gpuobj->im_pramin->start >> 12) << 3; - pte_end = ((gpuobj->im_pramin->size >> 12) << 3) + pte; + pte = 
(gpuobj->im_pramin->start >> 12) << 1; + pte_end = ((gpuobj->im_pramin->size >> 12) << 1) + pte; vram = gpuobj->im_backing_start; NV_DEBUG(dev, "pramin=0x%llx, pte=%d, pte_end=%d\n", gpuobj->im_pramin->start, pte, pte_end); NV_DEBUG(dev, "first vram page: 0x%08x\n", gpuobj->im_backing_start); + vram |= 1; + if (dev_priv->vram_sys_base) { + vram += dev_priv->vram_sys_base; + vram |= 0x30; + } + dev_priv->engine.instmem.prepare_access(dev, true); while (pte < pte_end) { - nv_wo32(dev, priv->pramin_pt->gpuobj, (pte + 0)/4, vram | 1); - nv_wo32(dev, priv->pramin_pt->gpuobj, (pte + 4)/4, 0x00000000); - - pte += 8; + nv_wo32(dev, pramin_pt, pte++, lower_32_bits(vram)); + nv_wo32(dev, pramin_pt, pte++, upper_32_bits(vram)); vram += NV50_INSTMEM_PAGE_SIZE; } dev_priv->engine.instmem.finish_access(dev); @@ -470,14 +493,13 @@ nv50_instmem_unbind(struct drm_device *dev, struct nouveau_gpuobj *gpuobj) if (gpuobj->im_bound == 0) return -EINVAL; - pte = (gpuobj->im_pramin->start >> 12) << 3; - pte_end = ((gpuobj->im_pramin->size >> 12) << 3) + pte; + pte = (gpuobj->im_pramin->start >> 12) << 1; + pte_end = ((gpuobj->im_pramin->size >> 12) << 1) + pte; dev_priv->engine.instmem.prepare_access(dev, true); while (pte < pte_end) { - nv_wo32(dev, priv->pramin_pt->gpuobj, (pte + 0)/4, 0x00000009); - nv_wo32(dev, priv->pramin_pt->gpuobj, (pte + 4)/4, 0x00000000); - pte += 8; + nv_wo32(dev, priv->pramin_pt->gpuobj, pte++, 0x00000000); + nv_wo32(dev, priv->pramin_pt->gpuobj, pte++, 0x00000000); } dev_priv->engine.instmem.finish_access(dev); diff --git a/drivers/gpu/drm/radeon/atom.c b/drivers/gpu/drm/radeon/atom.c index e3b4456..7f152f6 100644 --- a/drivers/gpu/drm/radeon/atom.c +++ b/drivers/gpu/drm/radeon/atom.c @@ -24,6 +24,7 @@ #include <linux/module.h> #include <linux/sched.h> +#include <asm/unaligned.h> #define ATOM_DEBUG @@ -212,7 +213,9 @@ static uint32_t atom_get_src_int(atom_exec_context *ctx, uint8_t attr, case ATOM_ARG_PS: idx = U8(*ptr); (*ptr)++; - val = 
le32_to_cpu(ctx->ps[idx]); + /* get_unaligned_le32 avoids unaligned accesses from atombios + * tables, noticed on a DEC Alpha. */ + val = get_unaligned_le32((u32 *)&ctx->ps[idx]); if (print) DEBUG("PS[0x%02X,0x%04X]", idx, val); break; @@ -640,7 +643,7 @@ static void atom_op_delay(atom_exec_context *ctx, int *ptr, int arg) uint8_t count = U8((*ptr)++); SDEBUG(" count: %d\n", count); if (arg == ATOM_UNIT_MICROSEC) - schedule_timeout_uninterruptible(usecs_to_jiffies(count)); + udelay(count); else schedule_timeout_uninterruptible(msecs_to_jiffies(count)); } diff --git a/drivers/gpu/drm/radeon/atombios_dp.c b/drivers/gpu/drm/radeon/atombios_dp.c index b32eeea..99915a6 100644 --- a/drivers/gpu/drm/radeon/atombios_dp.c +++ b/drivers/gpu/drm/radeon/atombios_dp.c @@ -350,7 +350,7 @@ retry: atom_execute_table(rdev->mode_info.atom_context, index, (uint32_t *)&args); if (args.ucReplyStatus && !args.ucDataOutLen) { - if (args.ucReplyStatus == 0x20 && retry_count < 10) + if (args.ucReplyStatus == 0x20 && retry_count++ < 10) goto retry; DRM_DEBUG("failed to get auxch %02x%02x %02x %02x 0x%02x %02x after %d retries\n", req_bytes[1], req_bytes[0], req_bytes[2], req_bytes[3], diff --git a/drivers/gpu/drm/radeon/r600_blit_kms.c b/drivers/gpu/drm/radeon/r600_blit_kms.c index af1c3ca..446b765 100644 --- a/drivers/gpu/drm/radeon/r600_blit_kms.c +++ b/drivers/gpu/drm/radeon/r600_blit_kms.c @@ -543,9 +543,6 @@ int r600_vb_ib_get(struct radeon_device *rdev) void r600_vb_ib_put(struct radeon_device *rdev) { radeon_fence_emit(rdev, rdev->r600_blit.vb_ib->fence); - mutex_lock(&rdev->ib_pool.mutex); - list_add_tail(&rdev->r600_blit.vb_ib->list, &rdev->ib_pool.scheduled_ibs); - mutex_unlock(&rdev->ib_pool.mutex); radeon_ib_free(rdev, &rdev->r600_blit.vb_ib); } diff --git a/drivers/gpu/drm/radeon/r600_cp.c b/drivers/gpu/drm/radeon/r600_cp.c index 6d5a711..75bcf35 100644 --- a/drivers/gpu/drm/radeon/r600_cp.c +++ b/drivers/gpu/drm/radeon/r600_cp.c @@ -1428,9 +1428,12 @@ static void 
r700_gfx_init(struct drm_device *dev, gb_tiling_config |= R600_BANK_SWAPS(1); - backend_map = r700_get_tile_pipe_to_backend_map(dev_priv->r600_max_tile_pipes, - dev_priv->r600_max_backends, - (0xff << dev_priv->r600_max_backends) & 0xff); + if ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RV740) + backend_map = 0x28; + else + backend_map = r700_get_tile_pipe_to_backend_map(dev_priv->r600_max_tile_pipes, + dev_priv->r600_max_backends, + (0xff << dev_priv->r600_max_backends) & 0xff); gb_tiling_config |= R600_BACKEND_MAP(backend_map); cc_gc_shader_pipe_config = diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index f57480b..c0356bb 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -96,6 +96,7 @@ extern int radeon_audio; * symbol; */ #define RADEON_MAX_USEC_TIMEOUT 100000 /* 100 ms */ +/* RADEON_IB_POOL_SIZE must be a power of 2 */ #define RADEON_IB_POOL_SIZE 16 #define RADEON_DEBUGFS_MAX_NUM_FILES 32 #define RADEONFB_CONN_LIMIT 4 @@ -363,11 +364,12 @@ void radeon_irq_kms_sw_irq_put(struct radeon_device *rdev); */ struct radeon_ib { struct list_head list; - unsigned long idx; + unsigned idx; uint64_t gpu_addr; struct radeon_fence *fence; - uint32_t *ptr; + uint32_t *ptr; uint32_t length_dw; + bool free; }; /* @@ -377,10 +379,9 @@ struct radeon_ib { struct radeon_ib_pool { struct mutex mutex; struct radeon_bo *robj; - struct list_head scheduled_ibs; struct radeon_ib ibs[RADEON_IB_POOL_SIZE]; bool ready; - DECLARE_BITMAP(alloc_bm, RADEON_IB_POOL_SIZE); + unsigned head_id; }; struct radeon_cp { diff --git a/drivers/gpu/drm/radeon/radeon_atombios.c b/drivers/gpu/drm/radeon/radeon_atombios.c index 2dcda61..4d88315 100644 --- a/drivers/gpu/drm/radeon/radeon_atombios.c +++ b/drivers/gpu/drm/radeon/radeon_atombios.c @@ -206,6 +206,15 @@ static bool radeon_atom_apply_quirks(struct drm_device *dev, *connector_type = DRM_MODE_CONNECTOR_DVID; } + /* Asrock RS600 board lists the DVI port as HDMI */ + if 
((dev->pdev->device == 0x7941) && + (dev->pdev->subsystem_vendor == 0x1849) && + (dev->pdev->subsystem_device == 0x7941)) { + if ((*connector_type == DRM_MODE_CONNECTOR_HDMIA) && + (supported_device == ATOM_DEVICE_DFP3_SUPPORT)) + *connector_type = DRM_MODE_CONNECTOR_DVID; + } + /* a-bit f-i90hd - ciaranm on #radeonhd - this board has no DVI */ if ((dev->pdev->device == 0x7941) && (dev->pdev->subsystem_vendor == 0x147b) && diff --git a/drivers/gpu/drm/radeon/radeon_connectors.c b/drivers/gpu/drm/radeon/radeon_connectors.c index 2381885..65f8194 100644 --- a/drivers/gpu/drm/radeon/radeon_connectors.c +++ b/drivers/gpu/drm/radeon/radeon_connectors.c @@ -780,7 +780,7 @@ static enum drm_connector_status radeon_dvi_detect(struct drm_connector *connect * connected and the DVI port disconnected. If the edid doesn't * say HDMI, vice versa. */ - if (radeon_connector->shared_ddc && connector_status_connected) { + if (radeon_connector->shared_ddc && (ret == connector_status_connected)) { struct drm_device *dev = connector->dev; struct drm_connector *list_connector; struct radeon_connector *list_radeon_connector; @@ -1060,8 +1060,7 @@ radeon_add_atom_connector(struct drm_device *dev, return; } if (radeon_connector->ddc_bus && i2c_bus->valid) { - if (memcmp(&radeon_connector->ddc_bus->rec, i2c_bus, - sizeof(struct radeon_i2c_bus_rec)) == 0) { + if (radeon_connector->ddc_bus->rec.i2c_id == i2c_bus->i2c_id) { radeon_connector->shared_ddc = true; shared_ddc = true; } diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index 1190148..e9d0850 100644 --- a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -86,7 +86,7 @@ int radeon_cs_parser_relocs(struct radeon_cs_parser *p) &p->validated); } } - return radeon_bo_list_validate(&p->validated, p->ib->fence); + return radeon_bo_list_validate(&p->validated); } int radeon_cs_parser_init(struct radeon_cs_parser *p, void *data) @@ -189,12 +189,10 @@ static void 
radeon_cs_parser_fini(struct radeon_cs_parser *parser, int error) { unsigned i; - if (error && parser->ib) { - radeon_bo_list_unvalidate(&parser->validated, - parser->ib->fence); - } else { - radeon_bo_list_unreserve(&parser->validated); + if (!error && parser->ib) { + radeon_bo_list_fence(&parser->validated, parser->ib->fence); } + radeon_bo_list_unreserve(&parser->validated); for (i = 0; i < parser->nrelocs; i++) { if (parser->relocs[i].gobj) { mutex_lock(&parser->rdev->ddev->struct_mutex); diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 768b150..f76ae34 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -781,9 +781,9 @@ int radeon_suspend_kms(struct drm_device *dev, pm_message_t state) pci_disable_device(dev->pdev); pci_set_power_state(dev->pdev, PCI_D3hot); } - acquire_console_sem(); + acquire_console_mutex(); fb_set_suspend(rdev->fbdev_info, 1); - release_console_sem(); + release_console_mutex(); return 0; } @@ -791,11 +791,11 @@ int radeon_resume_kms(struct drm_device *dev) { struct radeon_device *rdev = dev->dev_private; - acquire_console_sem(); + acquire_console_mutex(); pci_set_power_state(dev->pdev, PCI_D0); pci_restore_state(dev->pdev); if (pci_enable_device(dev->pdev)) { - release_console_sem(); + release_console_mutex(); return -1; } pci_set_master(dev->pdev); @@ -804,7 +804,7 @@ int radeon_resume_kms(struct drm_device *dev) radeon_resume(rdev); radeon_restore_bios_scratch_regs(rdev); fb_set_suspend(rdev->fbdev_info, 0); - release_console_sem(); + release_console_mutex(); /* reset hpd state */ radeon_hpd_init(rdev); diff --git a/drivers/gpu/drm/radeon/radeon_drv.h b/drivers/gpu/drm/radeon/radeon_drv.h index e137852..c57ad60 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.h +++ b/drivers/gpu/drm/radeon/radeon_drv.h @@ -106,9 +106,10 @@ * 1.29- R500 3D cmd buffer support * 1.30- Add support for occlusion queries * 1.31- Add support for num Z pipes from 
GET_PARAM + * 1.32- fixes for rv740 setup */ #define DRIVER_MAJOR 1 -#define DRIVER_MINOR 31 +#define DRIVER_MINOR 32 #define DRIVER_PATCHLEVEL 0 enum radeon_cp_microcode_version { diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index d72a71b..f1da370 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -306,11 +306,10 @@ void radeon_bo_list_unreserve(struct list_head *head) } } -int radeon_bo_list_validate(struct list_head *head, void *fence) +int radeon_bo_list_validate(struct list_head *head) { struct radeon_bo_list *lobj; struct radeon_bo *bo; - struct radeon_fence *old_fence = NULL; int r; r = radeon_bo_list_reserve(head); @@ -334,32 +333,27 @@ int radeon_bo_list_validate(struct list_head *head, void *fence) } lobj->gpu_offset = radeon_bo_gpu_offset(bo); lobj->tiling_flags = bo->tiling_flags; - if (fence) { - old_fence = (struct radeon_fence *)bo->tbo.sync_obj; - bo->tbo.sync_obj = radeon_fence_ref(fence); - bo->tbo.sync_obj_arg = NULL; - } - if (old_fence) { - radeon_fence_unref(&old_fence); - } } return 0; } -void radeon_bo_list_unvalidate(struct list_head *head, void *fence) +void radeon_bo_list_fence(struct list_head *head, void *fence) { struct radeon_bo_list *lobj; - struct radeon_fence *old_fence; - - if (fence) - list_for_each_entry(lobj, head, list) { - old_fence = to_radeon_fence(lobj->bo->tbo.sync_obj); - if (old_fence == fence) { - lobj->bo->tbo.sync_obj = NULL; - radeon_fence_unref(&old_fence); - } + struct radeon_bo *bo; + struct radeon_fence *old_fence = NULL; + + list_for_each_entry(lobj, head, list) { + bo = lobj->bo; + spin_lock(&bo->tbo.lock); + old_fence = (struct radeon_fence *)bo->tbo.sync_obj; + bo->tbo.sync_obj = radeon_fence_ref(fence); + bo->tbo.sync_obj_arg = NULL; + spin_unlock(&bo->tbo.lock); + if (old_fence) { + radeon_fence_unref(&old_fence); } - radeon_bo_list_unreserve(head); + } } int radeon_bo_fbdev_mmap(struct radeon_bo *bo, diff 
--git a/drivers/gpu/drm/radeon/radeon_object.h b/drivers/gpu/drm/radeon/radeon_object.h index a02f180..7ab43de 100644 --- a/drivers/gpu/drm/radeon/radeon_object.h +++ b/drivers/gpu/drm/radeon/radeon_object.h @@ -156,8 +156,8 @@ extern void radeon_bo_list_add_object(struct radeon_bo_list *lobj, struct list_head *head); extern int radeon_bo_list_reserve(struct list_head *head); extern void radeon_bo_list_unreserve(struct list_head *head); -extern int radeon_bo_list_validate(struct list_head *head, void *fence); -extern void radeon_bo_list_unvalidate(struct list_head *head, void *fence); +extern int radeon_bo_list_validate(struct list_head *head); +extern void radeon_bo_list_fence(struct list_head *head, void *fence); extern int radeon_bo_fbdev_mmap(struct radeon_bo *bo, struct vm_area_struct *vma); extern int radeon_bo_set_tiling_flags(struct radeon_bo *bo, diff --git a/drivers/gpu/drm/radeon/radeon_ring.c b/drivers/gpu/drm/radeon/radeon_ring.c index 4d12b2d..6579eb4 100644 --- a/drivers/gpu/drm/radeon/radeon_ring.c +++ b/drivers/gpu/drm/radeon/radeon_ring.c @@ -41,68 +41,55 @@ int radeon_ib_get(struct radeon_device *rdev, struct radeon_ib **ib) { struct radeon_fence *fence; struct radeon_ib *nib; - unsigned long i; - int r = 0; + int r = 0, i, c; *ib = NULL; r = radeon_fence_create(rdev, &fence); if (r) { - DRM_ERROR("failed to create fence for new IB\n"); + dev_err(rdev->dev, "failed to create fence for new IB\n"); return r; } mutex_lock(&rdev->ib_pool.mutex); - i = find_first_zero_bit(rdev->ib_pool.alloc_bm, RADEON_IB_POOL_SIZE); - if (i < RADEON_IB_POOL_SIZE) { - set_bit(i, rdev->ib_pool.alloc_bm); - rdev->ib_pool.ibs[i].length_dw = 0; - *ib = &rdev->ib_pool.ibs[i]; - mutex_unlock(&rdev->ib_pool.mutex); - goto out; + for (i = rdev->ib_pool.head_id, c = 0, nib = NULL; c < RADEON_IB_POOL_SIZE; c++, i++) { + i &= (RADEON_IB_POOL_SIZE - 1); + if (rdev->ib_pool.ibs[i].free) { + nib = &rdev->ib_pool.ibs[i]; + break; + } } - if (list_empty(&rdev->ib_pool.scheduled_ibs)) 
{ - /* we go do nothings here */ + if (nib == NULL) { + /* This should never happen, it means we allocated all + * IB and haven't scheduled one yet, return EBUSY to + * userspace hoping that on ioctl recall we get better + * luck + */ + dev_err(rdev->dev, "no free indirect buffer !\n"); mutex_unlock(&rdev->ib_pool.mutex); - DRM_ERROR("all IB allocated none scheduled.\n"); - r = -EINVAL; - goto out; + radeon_fence_unref(&fence); + return -EBUSY; } - /* get the first ib on the scheduled list */ - nib = list_entry(rdev->ib_pool.scheduled_ibs.next, - struct radeon_ib, list); - if (nib->fence == NULL) { - /* we go do nothings here */ + rdev->ib_pool.head_id = (nib->idx + 1) & (RADEON_IB_POOL_SIZE - 1); + nib->free = false; + if (nib->fence) { mutex_unlock(&rdev->ib_pool.mutex); - DRM_ERROR("IB %lu scheduled without a fence.\n", nib->idx); - r = -EINVAL; - goto out; - } - mutex_unlock(&rdev->ib_pool.mutex); - - r = radeon_fence_wait(nib->fence, false); - if (r) { - DRM_ERROR("radeon: IB(%lu:0x%016lX:%u)\n", nib->idx, - (unsigned long)nib->gpu_addr, nib->length_dw); - DRM_ERROR("radeon: GPU lockup detected, fail to get a IB\n"); - goto out; + r = radeon_fence_wait(nib->fence, false); + if (r) { + dev_err(rdev->dev, "error waiting fence of IB(%u:0x%016lX:%u)\n", + nib->idx, (unsigned long)nib->gpu_addr, nib->length_dw); + mutex_lock(&rdev->ib_pool.mutex); + nib->free = true; + mutex_unlock(&rdev->ib_pool.mutex); + radeon_fence_unref(&fence); + return r; + } + mutex_lock(&rdev->ib_pool.mutex); } radeon_fence_unref(&nib->fence); - + nib->fence = fence; nib->length_dw = 0; - - /* scheduled list is accessed here */ - mutex_lock(&rdev->ib_pool.mutex); - list_del(&nib->list); - INIT_LIST_HEAD(&nib->list); mutex_unlock(&rdev->ib_pool.mutex); - *ib = nib; -out: - if (r) { - radeon_fence_unref(&fence); - } else { - (*ib)->fence = fence; - } - return r; + return 0; } void radeon_ib_free(struct radeon_device *rdev, struct radeon_ib **ib) @@ -113,19 +100,10 @@ void 
radeon_ib_free(struct radeon_device *rdev, struct radeon_ib **ib) if (tmp == NULL) { return; } - mutex_lock(&rdev->ib_pool.mutex); - if (!list_empty(&tmp->list) && !radeon_fence_signaled(tmp->fence)) { - /* IB is scheduled & not signaled don't do anythings */ - mutex_unlock(&rdev->ib_pool.mutex); - return; - } - list_del(&tmp->list); - INIT_LIST_HEAD(&tmp->list); - if (tmp->fence) + if (!tmp->fence->emited) radeon_fence_unref(&tmp->fence); - - tmp->length_dw = 0; - clear_bit(tmp->idx, rdev->ib_pool.alloc_bm); + mutex_lock(&rdev->ib_pool.mutex); + tmp->free = true; mutex_unlock(&rdev->ib_pool.mutex); } @@ -135,7 +113,7 @@ int radeon_ib_schedule(struct radeon_device *rdev, struct radeon_ib *ib) if (!ib->length_dw || !rdev->cp.ready) { /* TODO: Nothings in the ib we should report. */ - DRM_ERROR("radeon: couldn't schedule IB(%lu).\n", ib->idx); + DRM_ERROR("radeon: couldn't schedule IB(%u).\n", ib->idx); return -EINVAL; } @@ -148,7 +126,8 @@ int radeon_ib_schedule(struct radeon_device *rdev, struct radeon_ib *ib) radeon_ring_ib_execute(rdev, ib); radeon_fence_emit(rdev, ib->fence); mutex_lock(&rdev->ib_pool.mutex); - list_add_tail(&ib->list, &rdev->ib_pool.scheduled_ibs); + /* once scheduled IB is considered free and protected by the fence */ + ib->free = true; mutex_unlock(&rdev->ib_pool.mutex); radeon_ring_unlock_commit(rdev); return 0; @@ -164,7 +143,6 @@ int radeon_ib_pool_init(struct radeon_device *rdev) if (rdev->ib_pool.robj) return 0; /* Allocate 1M object buffer */ - INIT_LIST_HEAD(&rdev->ib_pool.scheduled_ibs); r = radeon_bo_create(rdev, NULL, RADEON_IB_POOL_SIZE*64*1024, true, RADEON_GEM_DOMAIN_GTT, &rdev->ib_pool.robj); @@ -195,9 +173,9 @@ int radeon_ib_pool_init(struct radeon_device *rdev) rdev->ib_pool.ibs[i].ptr = ptr + offset; rdev->ib_pool.ibs[i].idx = i; rdev->ib_pool.ibs[i].length_dw = 0; - INIT_LIST_HEAD(&rdev->ib_pool.ibs[i].list); + rdev->ib_pool.ibs[i].free = true; } - bitmap_zero(rdev->ib_pool.alloc_bm, RADEON_IB_POOL_SIZE); + 
rdev->ib_pool.head_id = 0; rdev->ib_pool.ready = true; DRM_INFO("radeon: ib pool ready.\n"); if (radeon_debugfs_ib_init(rdev)) { @@ -214,7 +192,6 @@ void radeon_ib_pool_fini(struct radeon_device *rdev) return; } mutex_lock(&rdev->ib_pool.mutex); - bitmap_zero(rdev->ib_pool.alloc_bm, RADEON_IB_POOL_SIZE); if (rdev->ib_pool.robj) { r = radeon_bo_reserve(rdev->ib_pool.robj, false); if (likely(r == 0)) { @@ -363,7 +340,7 @@ static int radeon_debugfs_ib_info(struct seq_file *m, void *data) if (ib == NULL) { return 0; } - seq_printf(m, "IB %04lu\n", ib->idx); + seq_printf(m, "IB %04u\n", ib->idx); seq_printf(m, "IB fence %p\n", ib->fence); seq_printf(m, "IB size %05u dwords\n", ib->length_dw); for (i = 0; i < ib->length_dw; i++) { diff --git a/drivers/gpu/drm/radeon/rv770.c b/drivers/gpu/drm/radeon/rv770.c index 5943d56..0302167 100644 --- a/drivers/gpu/drm/radeon/rv770.c +++ b/drivers/gpu/drm/radeon/rv770.c @@ -549,9 +549,12 @@ static void rv770_gpu_init(struct radeon_device *rdev) gb_tiling_config |= BANK_SWAPS(1); - backend_map = r700_get_tile_pipe_to_backend_map(rdev->config.rv770.max_tile_pipes, - rdev->config.rv770.max_backends, - (0xff << rdev->config.rv770.max_backends) & 0xff); + if (rdev->family == CHIP_RV740) + backend_map = 0x28; + else + backend_map = r700_get_tile_pipe_to_backend_map(rdev->config.rv770.max_tile_pipes, + rdev->config.rv770.max_backends, + (0xff << rdev->config.rv770.max_backends) & 0xff); gb_tiling_config |= BACKEND_MAP(backend_map); cc_gc_shader_pipe_config = diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 1a3e909..c7320ce 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -1020,6 +1020,12 @@ static int ttm_bo_mem_compat(struct ttm_placement *placement, struct ttm_mem_reg *mem) { int i; + struct drm_mm_node *node = mem->mm_node; + + if (node && placement->lpfn != 0 && + (node->start < placement->fpfn || + node->start + node->size > placement->lpfn)) + return -1; for (i = 0; i < 
placement->num_placement; i++) { if ((placement->placement[i] & mem->placement & diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c index e2123af..3d47a2c 100644 --- a/drivers/gpu/drm/ttm/ttm_tt.c +++ b/drivers/gpu/drm/ttm/ttm_tt.c @@ -196,14 +196,15 @@ EXPORT_SYMBOL(ttm_tt_populate); #ifdef CONFIG_X86 static inline int ttm_tt_set_page_caching(struct page *p, - enum ttm_caching_state c_state) + enum ttm_caching_state c_old, + enum ttm_caching_state c_new) { int ret = 0; if (PageHighMem(p)) return 0; - if (get_page_memtype(p) != -1) { + if (c_old != tt_cached) { /* p isn't in the default caching state, set it to * writeback first to free its current memtype. */ @@ -212,16 +213,17 @@ static inline int ttm_tt_set_page_caching(struct page *p, return ret; } - if (c_state == tt_wc) + if (c_new == tt_wc) ret = set_memory_wc((unsigned long) page_address(p), 1); - else if (c_state == tt_uncached) + else if (c_new == tt_uncached) ret = set_pages_uc(p, 1); return ret; } #else /* CONFIG_X86 */ static inline int ttm_tt_set_page_caching(struct page *p, - enum ttm_caching_state c_state) + enum ttm_caching_state c_old, + enum ttm_caching_state c_new) { return 0; } @@ -254,7 +256,9 @@ static int ttm_tt_set_caching(struct ttm_tt *ttm, for (i = 0; i < ttm->num_pages; ++i) { cur_page = ttm->pages[i]; if (likely(cur_page != NULL)) { - ret = ttm_tt_set_page_caching(cur_page, c_state); + ret = ttm_tt_set_page_caching(cur_page, + ttm->caching_state, + c_state); if (unlikely(ret != 0)) goto out_err; } @@ -268,7 +272,7 @@ out_err: for (j = 0; j < i; ++j) { cur_page = ttm->pages[j]; if (likely(cur_page != NULL)) { - (void)ttm_tt_set_page_caching(cur_page, + (void)ttm_tt_set_page_caching(cur_page, c_state, ttm->caching_state); } } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index a6e8f68..0c9c081 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -348,22 +348,19 @@ static int 
vmw_driver_load(struct drm_device *dev, unsigned long chipset) */ DRM_INFO("It appears like vesafb is loaded. " - "Ignore above error if any. Entering stealth mode.\n"); + "Ignore above error if any.\n"); ret = pci_request_region(dev->pdev, 2, "vmwgfx stealth probe"); if (unlikely(ret != 0)) { DRM_ERROR("Failed reserving the SVGA MMIO resource.\n"); goto out_no_device; } - vmw_kms_init(dev_priv); - vmw_overlay_init(dev_priv); - } else { - ret = vmw_request_device(dev_priv); - if (unlikely(ret != 0)) - goto out_no_device; - vmw_kms_init(dev_priv); - vmw_overlay_init(dev_priv); - vmw_fb_init(dev_priv); } + ret = vmw_request_device(dev_priv); + if (unlikely(ret != 0)) + goto out_no_device; + vmw_kms_init(dev_priv); + vmw_overlay_init(dev_priv); + vmw_fb_init(dev_priv); dev_priv->pm_nb.notifier_call = vmwgfx_pm_notifier; register_pm_notifier(&dev_priv->pm_nb); @@ -406,17 +403,15 @@ static int vmw_driver_unload(struct drm_device *dev) unregister_pm_notifier(&dev_priv->pm_nb); - if (!dev_priv->stealth) { - vmw_fb_close(dev_priv); - vmw_kms_close(dev_priv); - vmw_overlay_close(dev_priv); - vmw_release_device(dev_priv); - pci_release_regions(dev->pdev); - } else { - vmw_kms_close(dev_priv); - vmw_overlay_close(dev_priv); + vmw_fb_close(dev_priv); + vmw_kms_close(dev_priv); + vmw_overlay_close(dev_priv); + vmw_release_device(dev_priv); + if (dev_priv->stealth) pci_release_region(dev->pdev, 2); - } + else + pci_release_regions(dev->pdev); + if (dev_priv->capabilities & SVGA_CAP_IRQMASK) drm_irq_uninstall(dev_priv->dev); if (dev->devname == vmw_devname) @@ -585,11 +580,6 @@ static int vmw_master_set(struct drm_device *dev, int ret = 0; DRM_INFO("Master set.\n"); - if (dev_priv->stealth) { - ret = vmw_request_device(dev_priv); - if (unlikely(ret != 0)) - return ret; - } if (active) { BUG_ON(active != &dev_priv->fbdev_master); @@ -649,18 +639,11 @@ static void vmw_master_drop(struct drm_device *dev, ttm_lock_set_kill(&vmaster->lock, true, SIGTERM); - if (dev_priv->stealth) { - 
ret = ttm_bo_evict_mm(&dev_priv->bdev, TTM_PL_VRAM); - if (unlikely(ret != 0)) - DRM_ERROR("Unable to clean VRAM on master drop.\n"); - vmw_release_device(dev_priv); - } dev_priv->active_master = &dev_priv->fbdev_master; ttm_lock_set_kill(&dev_priv->fbdev_master.lock, false, SIGTERM); ttm_vt_unlock(&dev_priv->fbdev_master.lock); - if (!dev_priv->stealth) - vmw_fb_on(dev_priv); + vmw_fb_on(dev_priv); } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c index d69caf9..0897359 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c @@ -182,25 +182,19 @@ static int vmw_cmd_present_check(struct vmw_private *dev_priv, return vmw_cmd_sid_check(dev_priv, sw_context, &cmd->body.sid); } -static int vmw_cmd_dma(struct vmw_private *dev_priv, - struct vmw_sw_context *sw_context, - SVGA3dCmdHeader *header) +static int vmw_translate_guest_ptr(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + SVGAGuestPtr *ptr, + struct vmw_dma_buffer **vmw_bo_p) { - uint32_t handle; struct vmw_dma_buffer *vmw_bo = NULL; struct ttm_buffer_object *bo; - struct vmw_surface *srf = NULL; - struct vmw_dma_cmd { - SVGA3dCmdHeader header; - SVGA3dCmdSurfaceDMA dma; - } *cmd; + uint32_t handle = ptr->gmrId; struct vmw_relocation *reloc; - int ret; uint32_t cur_validate_node; struct ttm_validate_buffer *val_buf; + int ret; - cmd = container_of(header, struct vmw_dma_cmd, header); - handle = cmd->dma.guest.ptr.gmrId; ret = vmw_user_dmabuf_lookup(sw_context->tfile, handle, &vmw_bo); if (unlikely(ret != 0)) { DRM_ERROR("Could not find or use GMR region.\n"); @@ -209,14 +203,14 @@ static int vmw_cmd_dma(struct vmw_private *dev_priv, bo = &vmw_bo->base; if (unlikely(sw_context->cur_reloc >= VMWGFX_MAX_RELOCATIONS)) { - DRM_ERROR("Max number of DMA commands per submission" + DRM_ERROR("Max number relocations per submission" " exceeded\n"); ret = -EINVAL; goto out_no_reloc; } reloc = 
&sw_context->relocs[sw_context->cur_reloc++]; - reloc->location = &cmd->dma.guest.ptr; + reloc->location = ptr; cur_validate_node = vmw_dmabuf_validate_node(bo, sw_context->cur_val_buf); if (unlikely(cur_validate_node >= VMWGFX_MAX_GMRS)) { @@ -234,7 +228,89 @@ static int vmw_cmd_dma(struct vmw_private *dev_priv, list_add_tail(&val_buf->head, &sw_context->validate_nodes); ++sw_context->cur_val_buf; } + *vmw_bo_p = vmw_bo; + return 0; + +out_no_reloc: + vmw_dmabuf_unreference(&vmw_bo); + vmw_bo_p = NULL; + return ret; +} + +static int vmw_cmd_end_query(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + SVGA3dCmdHeader *header) +{ + struct vmw_dma_buffer *vmw_bo; + struct vmw_query_cmd { + SVGA3dCmdHeader header; + SVGA3dCmdEndQuery q; + } *cmd; + int ret; + + cmd = container_of(header, struct vmw_query_cmd, header); + ret = vmw_cmd_cid_check(dev_priv, sw_context, header); + if (unlikely(ret != 0)) + return ret; + + ret = vmw_translate_guest_ptr(dev_priv, sw_context, + &cmd->q.guestResult, + &vmw_bo); + if (unlikely(ret != 0)) + return ret; + + vmw_dmabuf_unreference(&vmw_bo); + return 0; +} +static int vmw_cmd_wait_query(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + SVGA3dCmdHeader *header) +{ + struct vmw_dma_buffer *vmw_bo; + struct vmw_query_cmd { + SVGA3dCmdHeader header; + SVGA3dCmdWaitForQuery q; + } *cmd; + int ret; + + cmd = container_of(header, struct vmw_query_cmd, header); + ret = vmw_cmd_cid_check(dev_priv, sw_context, header); + if (unlikely(ret != 0)) + return ret; + + ret = vmw_translate_guest_ptr(dev_priv, sw_context, + &cmd->q.guestResult, + &vmw_bo); + if (unlikely(ret != 0)) + return ret; + + vmw_dmabuf_unreference(&vmw_bo); + return 0; +} + + +static int vmw_cmd_dma(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + SVGA3dCmdHeader *header) +{ + struct vmw_dma_buffer *vmw_bo = NULL; + struct ttm_buffer_object *bo; + struct vmw_surface *srf = NULL; + struct vmw_dma_cmd { + SVGA3dCmdHeader 
header; + SVGA3dCmdSurfaceDMA dma; + } *cmd; + int ret; + + cmd = container_of(header, struct vmw_dma_cmd, header); + ret = vmw_translate_guest_ptr(dev_priv, sw_context, + &cmd->dma.guest.ptr, + &vmw_bo); + if (unlikely(ret != 0)) + return ret; + + bo = &vmw_bo->base; ret = vmw_user_surface_lookup_handle(dev_priv, sw_context->tfile, cmd->dma.host.sid, &srf); if (ret) { @@ -379,8 +455,8 @@ static vmw_cmd_func vmw_cmd_funcs[SVGA_3D_CMD_MAX] = { VMW_CMD_DEF(SVGA_3D_CMD_DRAW_PRIMITIVES, &vmw_cmd_draw), VMW_CMD_DEF(SVGA_3D_CMD_SETSCISSORRECT, &vmw_cmd_cid_check), VMW_CMD_DEF(SVGA_3D_CMD_BEGIN_QUERY, &vmw_cmd_cid_check), - VMW_CMD_DEF(SVGA_3D_CMD_END_QUERY, &vmw_cmd_cid_check), - VMW_CMD_DEF(SVGA_3D_CMD_WAIT_FOR_QUERY, &vmw_cmd_cid_check), + VMW_CMD_DEF(SVGA_3D_CMD_END_QUERY, &vmw_cmd_end_query), + VMW_CMD_DEF(SVGA_3D_CMD_WAIT_FOR_QUERY, &vmw_cmd_wait_query), VMW_CMD_DEF(SVGA_3D_CMD_PRESENT_READBACK, &vmw_cmd_ok), VMW_CMD_DEF(SVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN, &vmw_cmd_blt_surf_screen_check) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fb.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fb.c index 4f4f643..a933670 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fb.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fb.c @@ -559,6 +559,9 @@ int vmw_fb_init(struct vmw_private *vmw_priv) info->pixmap.scan_align = 1; #endif + info->aperture_base = vmw_priv->vram_start; + info->aperture_size = vmw_priv->vram_size; + /* * Dirty & Deferred IO */ diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c index 24b56dc..2f6cf69 100644 --- a/drivers/gpu/vga/vgaarb.c +++ b/drivers/gpu/vga/vgaarb.c @@ -961,7 +961,7 @@ static ssize_t vga_arb_write(struct file *file, const char __user * buf, remaining -= 7; pr_devel("client 0x%p called 'target'\n", priv); /* if target is default */ - if (!strncmp(kbuf, "default", 7)) + if (!strncmp(curr_pos, "default", 7)) pdev = pci_dev_get(vga_default_device()); else { if (!vga_pci_str_to_vars(curr_pos, remaining, diff --git a/drivers/hwmon/s3c-hwmon.c 
b/drivers/hwmon/s3c-hwmon.c index 3f3f9a4..05248f2 100644 --- a/drivers/hwmon/s3c-hwmon.c +++ b/drivers/hwmon/s3c-hwmon.c @@ -51,7 +51,7 @@ struct s3c_hwmon_attr { * @attr: The holders for the channel attributes. */ struct s3c_hwmon { - struct semaphore lock; + struct mutex lock; struct s3c_adc_client *client; struct device *hwmon_dev; @@ -73,14 +73,14 @@ static int s3c_hwmon_read_ch(struct device *dev, { int ret; - ret = down_interruptible(&hwmon->lock); + ret = mutex_lock_interruptible(&hwmon->lock); if (ret < 0) return ret; dev_dbg(dev, "reading channel %d\n", channel); ret = s3c_adc_read(hwmon->client, channel); - up(&hwmon->lock); + mutex_unlock(&hwmon->lock); return ret; } @@ -296,7 +296,7 @@ static int __devinit s3c_hwmon_probe(struct platform_device *dev) platform_set_drvdata(dev, hwmon); - init_MUTEX(&hwmon->lock); + mutex_init(&hwmon->lock); /* Register with the core ADC driver. */ diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index 0abc43f..ebad3df 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -80,7 +80,7 @@ static void ali_set_pio_mode(ide_drive_t *drive, const u8 pio) if (r_clc >= 16) r_clc = 0; } - local_irq_save(flags); + local_irq_save_nort(flags); /* * PIO mode => ATA FIFO on, ATAPI FIFO off @@ -102,7 +102,7 @@ static void ali_set_pio_mode(ide_drive_t *drive, const u8 pio) pci_write_config_byte(dev, port, s_clc); pci_write_config_byte(dev, port + unit + 2, (a_clc << 4) | r_clc); - local_irq_restore(flags); + local_irq_restore_nort(flags); } /** @@ -213,7 +213,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev) isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL); - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision < 0xC2) { /* @@ -304,7 +304,7 @@ out: } pci_dev_put(north); pci_dev_put(isa_dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } @@ -366,7 +366,7 @@ static u8 ali_cable_detect(ide_hwif_t *hwif) unsigned long flags; u8 cbl = 
ATA_CBL_PATA40, tmpbyte; - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision >= 0xC2) { /* @@ -387,7 +387,7 @@ static u8 ali_cable_detect(ide_hwif_t *hwif) } } - local_irq_restore(flags); + local_irq_restore_nort(flags); return cbl; } diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index 4d90ac2..360fd47 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -1237,7 +1237,7 @@ static int __devinit init_dma_hpt366(ide_hwif_t *hwif, dma_old = inb(base + 2); - local_irq_save(flags); + local_irq_save_nort(flags); dma_new = dma_old; pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma); @@ -1248,7 +1248,7 @@ static int __devinit init_dma_hpt366(ide_hwif_t *hwif, if (dma_new != dma_old) outb(dma_new, base + 2); - local_irq_restore(flags); + local_irq_restore_nort(flags); printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n", hwif->name, base, base + 7); diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 46721c4..b6f114a 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -174,7 +174,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned long uninitialized_var(flags); if ((io_32bit & 2) && !mmio) { - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(io_ports->nsect_addr); } @@ -185,7 +185,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, insl(data_addr, buf, words); if ((io_32bit & 2) && !mmio) - local_irq_restore(flags); + local_irq_restore_nort(flags); if (((len + 1) & 3) < 2) return; @@ -218,7 +218,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned long uninitialized_var(flags); if ((io_32bit & 2) && !mmio) { - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(io_ports->nsect_addr); } @@ -229,7 +229,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, outsl(data_addr, buf, words); if ((io_32bit & 2) && !mmio) - 
local_irq_restore(flags); + local_irq_restore_nort(flags); if (((len + 1) & 3) < 2) return; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index db96138..b8b3ab6 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -667,7 +667,7 @@ void ide_timer_expiry (unsigned long data) /* disable_irq_nosync ?? */ disable_irq(hwif->irq); /* local CPU only, as if we were handling an interrupt */ - local_irq_disable(); + local_irq_disable_nort(); if (hwif->polling) { startstop = handler(drive); } else if (drive_is_ready(drive)) { diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 222c1ef..53a085c 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad, if ((stat & ATA_BUSY) == 0) break; - local_irq_restore(flags); + local_irq_restore_nort(flags); *rstat = stat; return -EBUSY; } } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* * Allow status to settle, then read it again. 
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 4d76ba4..5b39ce3 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id) int bswap = 1; /* local CPU only; some systems need this */ - local_irq_save(flags); + local_irq_save_nort(flags); /* read 512 bytes of id info */ hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE); - local_irq_restore(flags); + local_irq_restore_nort(flags); drive->dev_flags |= IDE_DFLAG_ID_READ; #ifdef DEBUG diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index cc8633c..7253867 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -250,7 +250,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, page_is_high = PageHighMem(page); if (page_is_high) - local_irq_save(flags); + local_irq_save_nort(flags); buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; @@ -271,7 +271,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, kunmap_atomic(buf, KM_BIO_SRC_IRQ); if (page_is_high) - local_irq_restore(flags); + local_irq_restore_nort(flags); len -= nr_bytes; } @@ -414,7 +414,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive, } if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0) - local_irq_disable(); + local_irq_disable_nort(); ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE); diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c index 5122b5a..bcc3d32 100644 --- a/drivers/ieee1394/nodemgr.c +++ b/drivers/ieee1394/nodemgr.c @@ -1397,9 +1397,9 @@ static int update_pdrv(struct device *dev, void *data) pdrv = container_of(drv, struct hpsb_protocol_driver, driver); if (pdrv->update) { - down(&ud->device.sem); + mutex_lock(&ud->device.mutex); error = pdrv->update(ud); - up(&ud->device.sem); + mutex_unlock(&ud->device.mutex); } if (error) device_release_driver(&ud->device); diff --git a/drivers/infiniband/core/user_mad.c 
b/drivers/infiniband/core/user_mad.c index 7de0296..8caa9e2 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -1004,7 +1004,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, port->ib_dev = device; port->port_num = port_num; - init_MUTEX(&port->sm_sem); + sema_init(&port->sm_sem, 1); mutex_init(&port->file_mutex); INIT_LIST_HEAD(&port->file_list); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 8763c1e..aeebf74 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -796,7 +796,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) ipoib_mcast_stop_thread(dev, 0); - local_irq_save(flags); + local_irq_save_nort(flags); netif_addr_lock(dev); spin_lock(&priv->lock); @@ -880,7 +880,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) spin_unlock(&priv->lock); netif_addr_unlock(dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); /* We have to cancel outside of the spinlock */ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c index ac11be0..8a3e940 100644 --- a/drivers/input/gameport/gameport.c +++ b/drivers/input/gameport/gameport.c @@ -20,6 +20,7 @@ #include <linux/slab.h> #include <linux/delay.h> #include <linux/kthread.h> +#include <linux/interrupt.h> #include <linux/sched.h> /* HZ */ #include <linux/mutex.h> #include <linux/freezer.h> @@ -57,11 +58,11 @@ static unsigned int get_time_pit(void) unsigned long flags; unsigned int count; - spin_lock_irqsave(&i8253_lock, flags); + raw_spin_lock_irqsave(&i8253_lock, flags); outb_p(0x00, 0x43); count = inb_p(0x40); count |= inb_p(0x40) << 8; - spin_unlock_irqrestore(&i8253_lock, flags); + raw_spin_unlock_irqrestore(&i8253_lock, flags); return count; } @@ -87,12 +88,12 @@ static int gameport_measure_speed(struct 
gameport *gameport) tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); GET_TIME(t1); for (t = 0; t < 50; t++) gameport_read(gameport); GET_TIME(t2); GET_TIME(t3); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t; } @@ -111,11 +112,11 @@ static int gameport_measure_speed(struct gameport *gameport) tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); rdtscl(t1); for (t = 0; t < 50; t++) gameport_read(gameport); rdtscl(t2); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if (t2 - t1 < tx) tx = t2 - t1; } diff --git a/drivers/input/input-polldev.c b/drivers/input/input-polldev.c index aa6713b..291d939 100644 --- a/drivers/input/input-polldev.c +++ b/drivers/input/input-polldev.c @@ -100,6 +100,12 @@ static void input_close_polled_device(struct input_dev *input) struct input_polled_dev *dev = input_get_drvdata(input); cancel_delayed_work_sync(&dev->work); + /* + * Clean up work struct to remove references to the workqueue. + * It may be destroyed by the next call. This causes problems + * at next device open-close in case of poll_interval == 0. 
+ */ + INIT_DELAYED_WORK(&dev->work, dev->work.work.func); input_polldev_stop_workqueue(); if (dev->close) diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c index 1c0b529..4afe0a3 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -146,11 +146,11 @@ static unsigned int get_time_pit(void) unsigned long flags; unsigned int count; - spin_lock_irqsave(&i8253_lock, flags); + raw_spin_lock_irqsave(&i8253_lock, flags); outb_p(0x00, 0x43); count = inb_p(0x40); count |= inb_p(0x40) << 8; - spin_unlock_irqrestore(&i8253_lock, flags); + raw_spin_unlock_irqrestore(&i8253_lock, flags); return count; } diff --git a/drivers/input/misc/hp_sdc_rtc.c b/drivers/input/misc/hp_sdc_rtc.c index ad730e1..e44f277 100644 --- a/drivers/input/misc/hp_sdc_rtc.c +++ b/drivers/input/misc/hp_sdc_rtc.c @@ -53,7 +53,7 @@ MODULE_LICENSE("Dual BSD/GPL"); static unsigned long epoch = 2000; -static struct semaphore i8042tregs; +static DEFINE_SEMAPHORE(i8042tregs, 1); static hp_sdc_irqhook hp_sdc_rtc_isr; @@ -83,7 +83,7 @@ static void hp_sdc_rtc_isr (int irq, void *dev_id, static int hp_sdc_rtc_do_read_bbrtc (struct rtc_time *rtctm) { - struct semaphore tsem; + DEFINE_SEMAPHORE(tsem, 0); hp_sdc_transaction t; uint8_t tseq[91]; int i; @@ -103,8 +103,7 @@ static int hp_sdc_rtc_do_read_bbrtc (struct rtc_time *rtctm) t.endidx = 91; t.seq = tseq; t.act.semaphore = &tsem; - init_MUTEX_LOCKED(&tsem); - + if (hp_sdc_enqueue_transaction(&t)) return -1; down_interruptible(&tsem); /* Put ourselves to sleep for results. 
*/ @@ -684,8 +683,6 @@ static int __init hp_sdc_rtc_init(void) return -ENODEV; #endif - init_MUTEX(&i8042tregs); - if ((ret = hp_sdc_request_timer_irq(&hp_sdc_rtc_isr))) return ret; if (misc_register(&hp_sdc_rtc_dev) != 0) diff --git a/drivers/input/misc/pcspkr.c b/drivers/input/misc/pcspkr.c index ea4e1fd..f080dd3 100644 --- a/drivers/input/misc/pcspkr.c +++ b/drivers/input/misc/pcspkr.c @@ -30,7 +30,7 @@ MODULE_ALIAS("platform:pcspkr"); #include <asm/i8253.h> #else #include <asm/8253pit.h> -static DEFINE_SPINLOCK(i8253_lock); +static DEFINE_RAW_SPINLOCK(i8253_lock); #endif static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) @@ -50,7 +50,7 @@ static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int c if (value > 20 && value < 32767) count = PIT_TICK_RATE / value; - spin_lock_irqsave(&i8253_lock, flags); + raw_spin_lock_irqsave(&i8253_lock, flags); if (count) { /* set command for counter 2, 2 byte write */ @@ -65,7 +65,7 @@ static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int c outb(inb_p(0x61) & 0xFC, 0x61); } - spin_unlock_irqrestore(&i8253_lock, flags); + raw_spin_unlock_irqrestore(&i8253_lock, flags); return 0; } diff --git a/drivers/input/serio/hil_mlc.c b/drivers/input/serio/hil_mlc.c index 6cd03eb..9e78665 100644 --- a/drivers/input/serio/hil_mlc.c +++ b/drivers/input/serio/hil_mlc.c @@ -914,15 +914,15 @@ int hil_mlc_register(hil_mlc *mlc) mlc->ostarted = 0; rwlock_init(&mlc->lock); - init_MUTEX(&mlc->osem); + sema_init(&mlc->osem, 1); - init_MUTEX(&mlc->isem); + sema_init(&mlc->isem, 1); mlc->icount = -1; mlc->imatch = 0; mlc->opercnt = 0; - init_MUTEX_LOCKED(&(mlc->csem)); + sema_init(&mlc->csem, 0); hil_mlc_clear_di_scratch(mlc); hil_mlc_clear_di_map(mlc, 0); diff --git a/drivers/input/serio/hp_sdc.c b/drivers/input/serio/hp_sdc.c index bcc2d30..8c0b51c 100644 --- a/drivers/input/serio/hp_sdc.c +++ b/drivers/input/serio/hp_sdc.c @@ -905,7 +905,7 @@ static int 
__init hp_sdc_init(void) ts_sync[1] = 0x0f; ts_sync[2] = ts_sync[3] = ts_sync[4] = ts_sync[5] = 0; t_sync.act.semaphore = &s_sync; - init_MUTEX_LOCKED(&s_sync); + sema_init(&s_sync, 0); hp_sdc_enqueue_transaction(&t_sync); down(&s_sync); /* Wait for t_sync to complete */ @@ -1039,7 +1039,7 @@ static int __init hp_sdc_register(void) return hp_sdc.dev_err; } - init_MUTEX_LOCKED(&tq_init_sem); + sema_init(&tq_init_sem, 0); tq_init.actidx = 0; tq_init.idx = 1; diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c index d84a36e..b54aee7 100644 --- a/drivers/input/serio/i8042.c +++ b/drivers/input/serio/i8042.c @@ -1161,9 +1161,17 @@ static int i8042_pm_restore(struct device *dev) return 0; } +static int i8042_pm_thaw(struct device *dev) +{ + i8042_interrupt(0, NULL); + + return 0; +} + static const struct dev_pm_ops i8042_pm_ops = { .suspend = i8042_pm_reset, .resume = i8042_pm_restore, + .thaw = i8042_pm_thaw, .poweroff = i8042_pm_reset, .restore = i8042_pm_restore, }; diff --git a/drivers/input/touchscreen/usbtouchscreen.c b/drivers/input/touchscreen/usbtouchscreen.c index 09a5e73..5256123 100644 --- a/drivers/input/touchscreen/usbtouchscreen.c +++ b/drivers/input/touchscreen/usbtouchscreen.c @@ -618,8 +618,8 @@ static int idealtek_read_data(struct usbtouch_usb *dev, unsigned char *pkt) #ifdef CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH static int general_touch_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { - dev->x = ((pkt[2] & 0x0F) << 8) | pkt[1] ; - dev->y = ((pkt[4] & 0x0F) << 8) | pkt[3] ; + dev->x = (pkt[2] << 8) | pkt[1]; + dev->y = (pkt[4] << 8) | pkt[3]; dev->press = pkt[5] & 0xff; dev->touch = pkt[0] & 0x01; @@ -809,9 +809,9 @@ static struct usbtouch_device_info usbtouch_dev_info[] = { #ifdef CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH [DEVTYPE_GENERAL_TOUCH] = { .min_xc = 0x0, - .max_xc = 0x0500, + .max_xc = 0x7fff, .min_yc = 0x0, - .max_yc = 0x0500, + .max_yc = 0x7fff, .rept_size = 7, .read_data = general_touch_read_data, }, diff --git 
a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c index 23741ce..14910a0 100644 --- a/drivers/macintosh/adb.c +++ b/drivers/macintosh/adb.c @@ -83,7 +83,7 @@ static struct adb_driver *adb_controller; BLOCKING_NOTIFIER_HEAD(adb_client_list); static int adb_got_sleep; static int adb_inited; -static DECLARE_MUTEX(adb_probe_mutex); +static DEFINE_SEMAPHORE(adb_probe_mutex, 1); static int sleepy_trackpad; static int autopoll_devs; int __adb_probe_sync; diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 54abf9e..f1c8cae 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c @@ -172,11 +172,15 @@ int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type, { int r = 0; size_t dummy = 0; - int overhead_size = - sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg); + int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg); struct dm_ulog_request *tfr = prealloced_ulog_tfr; struct receiving_pkg pkg; + /* + * Given the space needed to hold the 'struct cn_msg' and + * 'struct dm_ulog_request' - do we have enough payload + * space remaining? + */ if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { DMINFO("Size of tfr exceeds preallocated size"); return -EINVAL; @@ -191,7 +195,7 @@ resend: */ mutex_lock(&dm_ulog_lock); - memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size); + memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); memcpy(tfr->uuid, uuid, DM_UUID_LEN); tfr->luid = luid; tfr->seq = dm_ulog_seq++; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index ad779bd..6c1046d 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -724,7 +724,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) /* * Dispatch io. 
*/ - if (unlikely(ms->log_failure)) { + if (unlikely(ms->log_failure) && errors_handled(ms)) { spin_lock_irq(&ms->lock); bio_list_merge(&ms->failures, &sync); spin_unlock_irq(&ms->lock); diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 5f19ceb..168bd38 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -660,10 +660,9 @@ void dm_rh_recovery_end(struct dm_region *reg, int success) spin_lock_irq(&rh->region_lock); if (success) list_add(®->list, ®->rh->recovered_regions); - else { - reg->state = DM_RH_NOSYNC; + else list_add(®->list, ®->rh->failed_recovered_regions); - } + spin_unlock_irq(&rh->region_lock); rh->wakeup_workers(rh->context); diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 7d08879..c097d8a 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -254,7 +254,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, * Issue the synchronous I/O from a different thread * to avoid generic_make_request recursion. 
*/ - INIT_WORK(&req.work, do_metadata); + INIT_WORK_ON_STACK(&req.work, do_metadata); queue_work(ps->metadata_wq, &req.work); flush_workqueue(ps->metadata_wq); diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index e0efc1a..bd58703 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -110,7 +110,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) } stripes = simple_strtoul(argv[0], &end, 10); - if (*end) { + if (!stripes || *end) { ti->error = "Invalid stripe count"; return -EINVAL; } diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index f53392d..f91b409 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -80,20 +80,12 @@ static struct sysfs_ops dm_sysfs_ops = { }; /* - * The sysfs structure is embedded in md struct, nothing to do here - */ -static void dm_sysfs_release(struct kobject *kobj) -{ -} - -/* * dm kobject is embedded in mapped_device structure * no need to define release function here */ static struct kobj_type dm_ktype = { .sysfs_ops = &dm_sysfs_ops, .default_attrs = dm_attrs, - .release = dm_sysfs_release }; /* diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3167480..aa4e2aa 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1595,10 +1595,15 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) return BLKPREP_OK; } -static void map_request(struct dm_target *ti, struct request *clone, - struct mapped_device *md) +/* + * Returns: + * 0 : the request has been processed (not requeued) + * !0 : the request has been requeued + */ +static int map_request(struct dm_target *ti, struct request *clone, + struct mapped_device *md) { - int r; + int r, requeued = 0; struct dm_rq_target_io *tio = clone->end_io_data; /* @@ -1625,6 +1630,7 @@ static void map_request(struct dm_target *ti, struct request *clone, case DM_MAPIO_REQUEUE: /* The target wants to requeue the I/O */ dm_requeue_unmapped_request(clone); + requeued = 1; break; default: if (r > 0) { @@ -1636,6 
+1642,8 @@ static void map_request(struct dm_target *ti, struct request *clone, dm_kill_unmapped_request(clone, r); break; } + + return requeued; } /* @@ -1677,12 +1685,17 @@ static void dm_request_fn(struct request_queue *q) atomic_inc(&md->pending[rq_data_dir(clone)]); spin_unlock(q->queue_lock); - map_request(ti, clone, md); + if (map_request(ti, clone, md)) + goto requeued; + spin_lock_irq(q->queue_lock); } goto out; +requeued: + spin_lock_irq(q->queue_lock); + plug_and_out: if (!elv_queue_empty(q)) /* Some requests still remain, retry later */ diff --git a/drivers/media/dvb/dvb-core/dvb_frontend.c b/drivers/media/dvb/dvb-core/dvb_frontend.c index 0746122..e650983 100644 --- a/drivers/media/dvb/dvb-core/dvb_frontend.c +++ b/drivers/media/dvb/dvb-core/dvb_frontend.c @@ -694,7 +694,7 @@ static void dvb_frontend_stop(struct dvb_frontend *fe) kthread_stop(fepriv->thread); - init_MUTEX (&fepriv->sem); + sema_init(&fepriv->sem, 1); fepriv->state = FESTATE_IDLE; /* paranoia check in case a signal arrived */ @@ -2054,7 +2054,7 @@ int dvb_register_frontend(struct dvb_adapter* dvb, } fepriv = fe->frontend_priv; - init_MUTEX (&fepriv->sem); + sema_init(&fepriv->sem, 1); init_waitqueue_head (&fepriv->wait_queue); init_waitqueue_head (&fepriv->events.wait_queue); mutex_init(&fepriv->events.mtx); diff --git a/drivers/media/dvb/dvb-usb/Kconfig b/drivers/media/dvb/dvb-usb/Kconfig index 1b24989..465295b 100644 --- a/drivers/media/dvb/dvb-usb/Kconfig +++ b/drivers/media/dvb/dvb-usb/Kconfig @@ -112,11 +112,13 @@ config DVB_USB_CXUSB select DVB_MT352 if !DVB_FE_CUSTOMISE select DVB_ZL10353 if !DVB_FE_CUSTOMISE select DVB_DIB7000P if !DVB_FE_CUSTOMISE - select DVB_LGS8GL5 if !DVB_FE_CUSTOMISE select DVB_TUNER_DIB0070 if !DVB_FE_CUSTOMISE + select DVB_ATBM8830 if !DVB_FE_CUSTOMISE + select DVB_LGS8GXX if !DVB_FE_CUSTOMISE select MEDIA_TUNER_SIMPLE if !MEDIA_TUNER_CUSTOMISE select MEDIA_TUNER_XC2028 if !MEDIA_TUNER_CUSTOMISE select MEDIA_TUNER_MXL5005S if !MEDIA_TUNER_CUSTOMISE + 
select MEDIA_TUNER_MAX2165 if !MEDIA_TUNER_CUSTOMISE help Say Y here to support the Conexant USB2.0 hybrid reference design. Currently, only DVB and ATSC modes are supported, analog mode diff --git a/drivers/media/dvb/frontends/l64781.c b/drivers/media/dvb/frontends/l64781.c index 3051b64..445fa10 100644 --- a/drivers/media/dvb/frontends/l64781.c +++ b/drivers/media/dvb/frontends/l64781.c @@ -192,8 +192,8 @@ static int apply_frontend_param (struct dvb_frontend* fe, struct dvb_frontend_pa spi_bias *= qam_tab[p->constellation]; spi_bias /= p->code_rate_HP + 1; spi_bias /= (guard_tab[p->guard_interval] + 32); - spi_bias *= 1000ULL; - spi_bias /= 1000ULL + ppm/1000; + spi_bias *= 1000; + spi_bias /= 1000 + ppm/1000; spi_bias *= p->code_rate_HP; val0x04 = (p->transmission_mode << 2) | p->guard_interval; diff --git a/drivers/media/video/bt8xx/bttv-driver.c b/drivers/media/video/bt8xx/bttv-driver.c index 3182a40..ae08b07 100644 --- a/drivers/media/video/bt8xx/bttv-driver.c +++ b/drivers/media/video/bt8xx/bttv-driver.c @@ -4461,6 +4461,7 @@ static int __devinit bttv_probe(struct pci_dev *dev, request_modules(btv); } + init_bttv_i2c_ir(btv); bttv_input_init(btv); /* everything is fine */ diff --git a/drivers/media/video/bt8xx/bttv-i2c.c b/drivers/media/video/bt8xx/bttv-i2c.c index 63aa31a..407fa61 100644 --- a/drivers/media/video/bt8xx/bttv-i2c.c +++ b/drivers/media/video/bt8xx/bttv-i2c.c @@ -388,7 +388,12 @@ int __devinit init_bttv_i2c(struct bttv *btv) if (0 == btv->i2c_rc && i2c_scan) do_i2c_scan(btv->c.v4l2_dev.name, &btv->i2c_client); - /* Instantiate the IR receiver device, if present */ + return btv->i2c_rc; +} + +/* Instantiate the I2C IR receiver device, if present */ +void __devinit init_bttv_i2c_ir(struct bttv *btv) +{ if (0 == btv->i2c_rc) { struct i2c_board_info info; /* The external IR receiver is at i2c address 0x34 (0x35 for @@ -408,7 +413,6 @@ int __devinit init_bttv_i2c(struct bttv *btv) strlcpy(info.type, "ir_video", I2C_NAME_SIZE); 
i2c_new_probed_device(&btv->c.i2c_adap, &info, addr_list); } - return btv->i2c_rc; } int __devexit fini_bttv_i2c(struct bttv *btv) diff --git a/drivers/media/video/bt8xx/bttvp.h b/drivers/media/video/bt8xx/bttvp.h index a1d0e9c..6cccc2a 100644 --- a/drivers/media/video/bt8xx/bttvp.h +++ b/drivers/media/video/bt8xx/bttvp.h @@ -279,6 +279,7 @@ extern unsigned int bttv_debug; extern unsigned int bttv_gpio; extern void bttv_gpio_tracking(struct bttv *btv, char *comment); extern int init_bttv_i2c(struct bttv *btv); +extern void init_bttv_i2c_ir(struct bttv *btv); extern int fini_bttv_i2c(struct bttv *btv); #define bttv_printk if (bttv_verbose) printk diff --git a/drivers/media/video/mt9t112.c b/drivers/media/video/mt9t112.c index fc4dd60..7438f8d 100644 --- a/drivers/media/video/mt9t112.c +++ b/drivers/media/video/mt9t112.c @@ -514,7 +514,7 @@ static int mt9t112_init_pll(const struct i2c_client *client) /* poll to verify out of standby. Must Poll this bit */ for (i = 0; i < 100; i++) { mt9t112_reg_read(data, client, 0x0018); - if (0x4000 & data) + if (!(0x4000 & data)) break; mdelay(10); diff --git a/drivers/media/video/pwc/pwc-ctrl.c b/drivers/media/video/pwc/pwc-ctrl.c index 50b415e..f7f7e04 100644 --- a/drivers/media/video/pwc/pwc-ctrl.c +++ b/drivers/media/video/pwc/pwc-ctrl.c @@ -753,7 +753,7 @@ int pwc_set_shutter_speed(struct pwc_device *pdev, int mode, int value) buf[0] = 0xff; /* fixed */ ret = send_control_msg(pdev, - SET_LUM_CTL, SHUTTER_MODE_FORMATTER, &buf, sizeof(buf)); + SET_LUM_CTL, SHUTTER_MODE_FORMATTER, &buf, 1); if (!mode && ret >= 0) { if (value < 0) diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c index 252b741..b281217 100644 --- a/drivers/mfd/ucb1x00-core.c +++ b/drivers/mfd/ucb1x00-core.c @@ -27,6 +27,7 @@ #include <linux/mutex.h> #include <linux/mfd/ucb1x00.h> #include <linux/gpio.h> +#include <linux/semaphore.h> #include <mach/dma.h> #include <mach/hardware.h> diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 
e3551d2..9eaa647 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -90,6 +90,35 @@ config IBM_ASM information on the specific driver level and support statement for your IBM server. +config HWLAT_DETECTOR + tristate "Testing module to detect hardware-induced latencies" + depends on DEBUG_FS + depends on RING_BUFFER + default m + ---help--- + A simple hardware latency detector. Use this module to detect + large latencies introduced by the behavior of the underlying + system firmware external to Linux. We do this using periodic + use of stop_machine to grab all available CPUs and measure + for unexplainable gaps in the CPU timestamp counter(s). By + default, the module is not enabled until the "enable" file + within the "hwlat_detector" debugfs directory is toggled. + + This module is often used to detect SMI (System Management + Interrupts) on x86 systems, though is not x86 specific. To + this end, we default to using a sample window of 1 second, + during which we will sample for 0.5 seconds. If an SMI or + similar event occurs during that time, it is recorded + into an 8K samples global ring buffer until retrieved. + + WARNING: This software should never be enabled (it can be built + but should not be turned on after it is loaded) in a production + environment where high latencies are a concern since the + sampling mechanism actually introduces latencies for + regular tasks while the CPU(s) are being held. 
+ + If unsure, say N + config PHANTOM tristate "Sensable PHANToM (PCI)" depends on PCI diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 049ff24..d200c8c 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -28,3 +28,4 @@ obj-$(CONFIG_C2PORT) += c2port/ obj-$(CONFIG_IWMC3200TOP) += iwmc3200top/ obj-y += eeprom/ obj-y += cb710/ +obj-$(CONFIG_HWLAT_DETECTOR) += hwlat_detector.o diff --git a/drivers/misc/hwlat_detector.c b/drivers/misc/hwlat_detector.c new file mode 100644 index 0000000..953783c --- /dev/null +++ b/drivers/misc/hwlat_detector.c @@ -0,0 +1,1210 @@ +/* + * hwlat_detector.c - A simple Hardware Latency detector. + * + * Use this module to detect large system latencies induced by the behavior of + * certain underlying system hardware or firmware, independent of Linux itself. + * The code was developed originally to detect the presence of SMIs on Intel + * and AMD systems, although there is no dependency upon x86 herein. + * + * The classical example usage of this module is in detecting the presence of + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a + * somewhat special form of hardware interrupt spawned from earlier CPU debug + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge + * LPC (or other device) to generate a special interrupt under certain + * circumstances, for example, upon expiration of a special SMI timer device, + * due to certain external thermal readings, on certain I/O address accesses, + * and other situations. An SMI hits a special CPU pin, triggers a special + * SMI mode (complete with special memory map), and the OS is unaware. 
+ * + * Although certain hardware-inducing latencies are necessary (for example, + * a modern system often requires an SMI handler for correct thermal control + * and remote management) they can wreak havoc upon any OS-level performance + * guarantees toward low-latency, especially when the OS is not even made + * aware of the presence of these interrupts. For this reason, we need a + * somewhat brute force mechanism to detect these interrupts. In this case, + * we do it by hogging all of the CPU(s) for configurable timer intervals, + * sampling the built-in CPU timer, looking for discontiguous readings. + * + * WARNING: This implementation necessarily introduces latencies. Therefore, + * you should NEVER use this module in a production environment + * requiring any kind of low-latency performance guarantee(s). + * + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com> + * + * Includes useful feedback from Clark Williams <clark@redhat.com> + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. 
+ */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/ring_buffer.h> +#include <linux/stop_machine.h> +#include <linux/time.h> +#include <linux/hrtimer.h> +#include <linux/kthread.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/uaccess.h> +#include <linux/version.h> +#include <linux/delay.h> + +#define BUF_SIZE_DEFAULT 262144UL /* 8K*(sizeof(entry)) */ +#define BUF_FLAGS (RB_FL_OVERWRITE) /* no block on full */ +#define U64STR_SIZE 22 /* 20 digits max */ + +#define VERSION "1.0.0" +#define BANNER "hwlat_detector: " +#define DRVNAME "hwlat_detector" +#define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */ +#define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */ +#define DEFAULT_LAT_THRESHOLD 10 /* 10us */ + +/* Module metadata */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jon Masters <jcm@redhat.com>"); +MODULE_DESCRIPTION("A simple hardware latency detector"); +MODULE_VERSION(VERSION); + +/* Module parameters */ + +static int debug; +static int enabled; +static int threshold; + +module_param(debug, int, 0); /* enable debug */ +module_param(enabled, int, 0); /* enable detector */ +module_param(threshold, int, 0); /* latency threshold */ + +/* Buffering and sampling */ + +static struct ring_buffer *ring_buffer; /* sample buffer */ +static DEFINE_MUTEX(ring_buffer_mutex); /* lock changes */ +static unsigned long buf_size = BUF_SIZE_DEFAULT; +static struct task_struct *kthread; /* sampling thread */ + +/* DebugFS filesystem entries */ + +static struct dentry *debug_dir; /* debugfs directory */ +static struct dentry *debug_max; /* maximum TSC delta */ +static struct dentry *debug_count; /* total detect count */ +static struct dentry *debug_sample_width; /* sample width us */ +static struct dentry *debug_sample_window; /* sample window us */ +static struct dentry *debug_sample; /* raw samples us */ +static struct dentry *debug_threshold; /* threshold us */ +static struct dentry *debug_enable; /* enable/disable */ + +/* Individual samples 
and global state */ + +struct sample; /* latency sample */ +struct data; /* Global state */ + +/* Sampling functions */ +static int __buffer_add_sample(struct sample *sample); +static struct sample *buffer_get_sample(struct sample *sample); +static int get_sample(void *unused); + +/* Threading and state */ +static int kthread_fn(void *unused); +static int start_kthread(void); +static int stop_kthread(void); +static void __reset_stats(void); +static int init_stats(void); + +/* Debugfs interface */ +static ssize_t simple_data_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos, const u64 *entry); +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, u64 *entry); +static int debug_sample_fopen(struct inode *inode, struct file *filp); +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); +static int debug_sample_release(struct inode *inode, struct file *filp); +static int debug_enable_fopen(struct inode *inode, struct file *filp); +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); +static ssize_t debug_enable_fwrite(struct file *file, + const char __user *user_buffer, + size_t user_size, loff_t *offset); + +/* Initialization functions */ +static int init_debugfs(void); +static void free_debugfs(void); +static int detector_init(void); +static void detector_exit(void); + +/* Individual latency samples are stored here when detected and packed into + * the ring_buffer circular buffer, where they are overwritten when + * more than buf_size/sizeof(sample) samples are received. */ +struct sample { + u64 seqnum; /* unique sequence */ + u64 duration; /* ktime delta */ + struct timespec timestamp; /* wall time */ +}; + +/* keep the global state somewhere. Mostly used under stop_machine. 
*/ +static struct data { + + struct mutex lock; /* protect changes */ + + u64 count; /* total since reset */ + u64 max_sample; /* max hardware latency */ + u64 threshold; /* sample threshold level */ + + u64 sample_window; /* total sampling window (on+off) */ + u64 sample_width; /* active sampling portion of window */ + + atomic_t sample_open; /* whether the sample file is open */ + + wait_queue_head_t wq; /* waitqueue for new sample values */ + +} data; + +/** + * __buffer_add_sample - add a new latency sample recording to the ring buffer + * @sample: The new latency sample value + * + * This receives a new latency sample and records it in a global ring buffer. + * No additional locking is used in this case - suited for stop_machine use. + */ +static int __buffer_add_sample(struct sample *sample) +{ + return ring_buffer_write(ring_buffer, + sizeof(struct sample), sample); +} + +/** + * buffer_get_sample - remove a hardware latency sample from the ring buffer + * @sample: Pre-allocated storage for the sample + * + * This retrieves a hardware latency sample from the global circular buffer + */ +static struct sample *buffer_get_sample(struct sample *sample) +{ + struct ring_buffer_event *e = NULL; + struct sample *s = NULL; + unsigned int cpu = 0; + + if (!sample) + return NULL; + + mutex_lock(&ring_buffer_mutex); + for_each_online_cpu(cpu) { + e = ring_buffer_consume(ring_buffer, cpu, NULL); + if (e) + break; + } + + if (e) { + s = ring_buffer_event_data(e); + memcpy(sample, s, sizeof(struct sample)); + } else + sample = NULL; + mutex_unlock(&ring_buffer_mutex); + + return sample; +} + +/** + * get_sample - sample the CPU TSC and look for likely hardware latencies + * @unused: This is not used but is a part of the stop_machine API + * + * Used to repeatedly capture the CPU TSC (or similar), looking for potential + * hardware-induced latency. Called under stop_machine, with data.lock held. 
+ */ +static int get_sample(void *unused) +{ + ktime_t start, t1, t2; + s64 diff, total = 0; + u64 sample = 0; + int ret = 1; + + start = ktime_get(); /* start timestamp */ + + do { + + t1 = ktime_get(); /* we'll look for a discontinuity */ + t2 = ktime_get(); + + total = ktime_to_us(ktime_sub(t2, start)); /* sample width */ + diff = ktime_to_us(ktime_sub(t2, t1)); /* current diff */ + + /* This shouldn't happen */ + if (diff < 0) { + printk(KERN_ERR BANNER "time running backwards\n"); + goto out; + } + + if (diff > sample) + sample = diff; /* only want highest value */ + + } while (total <= data.sample_width); + + /* If we exceed the threshold value, we have found a hardware latency */ + if (sample > data.threshold) { + struct sample s; + + data.count++; + s.seqnum = data.count; + s.duration = sample; + s.timestamp = CURRENT_TIME; + __buffer_add_sample(&s); + + /* Keep a running maximum ever recorded hardware latency */ + if (sample > data.max_sample) + data.max_sample = sample; + } + + ret = 0; +out: + return ret; +} + +/* + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread + * @unused: A required part of the kthread API. + * + * Used to periodically sample the CPU TSC via a call to get_sample. We + * use stop_machine, which does (intentionally) introduce latency since we + * need to ensure nothing else might be running (and thus pre-empting). + * Obviously this should never be used in production environments. + * + * stop_machine will schedule us typically only on CPU0 which is fine for + * almost every real-world hardware latency situation - but we might later + * generalize this if we find there are any actual systems with alternate + * SMI delivery or other non CPU0 hardware latencies. 
+ */ +static int kthread_fn(void *unused) +{ + int err = 0; + u64 interval = 0; + + while (!kthread_should_stop()) { + + mutex_lock(&data.lock); + + err = stop_machine(get_sample, unused, 0); + if (err) { + /* Houston, we have a problem */ + mutex_unlock(&data.lock); + goto err_out; + } + + wake_up(&data.wq); /* wake up reader(s) */ + + interval = data.sample_window - data.sample_width; + do_div(interval, USEC_PER_MSEC); /* modifies interval value */ + + mutex_unlock(&data.lock); + + if (msleep_interruptible(interval)) + goto out; + } + goto out; +err_out: + printk(KERN_ERR BANNER "could not call stop_machine, disabling\n"); + enabled = 0; +out: + return err; + +} + +/** + * start_kthread - Kick off the hardware latency sampling/detector kthread + * + * This starts a kernel thread that will sit and sample the CPU timestamp + * counter (TSC or similar) and look for potential hardware latencies. + */ +static int start_kthread(void) +{ + kthread = kthread_run(kthread_fn, NULL, + DRVNAME); + if (IS_ERR(kthread)) { + printk(KERN_ERR BANNER "could not start sampling thread\n"); + enabled = 0; + return -ENOMEM; + } + + return 0; +} + +/** + * stop_kthread - Inform the hardware latency sampling/detector kthread to stop + * + * This kicks the running hardware latency sampling/detector kernel thread and + * tells it to stop sampling now. Use this on unload and at system shutdown. + */ +static int stop_kthread(void) +{ + int ret; + + ret = kthread_stop(kthread); + + return ret; +} + +/** + * __reset_stats - Reset statistics for the hardware latency detector + * + * We use data to store various statistics and global state. We call this + * function in order to reset those when "enable" is toggled on or off, and + * also at initialization. Should be called with data.lock held. 
+ */ +static void __reset_stats(void) +{ + data.count = 0; + data.max_sample = 0; + ring_buffer_reset(ring_buffer); /* flush out old sample entries */ +} + +/** + * init_stats - Setup global state statistics for the hardware latency detector + * + * We use data to store various statistics and global state. We also use + * a global ring buffer (ring_buffer) to keep raw samples of detected hardware + * induced system latencies. This function initializes these structures and + * allocates the global ring buffer also. + */ +static int init_stats(void) +{ + int ret = -ENOMEM; + + mutex_init(&data.lock); + init_waitqueue_head(&data.wq); + atomic_set(&data.sample_open, 0); + + ring_buffer = ring_buffer_alloc(buf_size, BUF_FLAGS); + + if (WARN(!ring_buffer, KERN_ERR BANNER + "failed to allocate ring buffer!\n")) + goto out; + + __reset_stats(); + data.threshold = DEFAULT_LAT_THRESHOLD; /* threshold us */ + data.sample_window = DEFAULT_SAMPLE_WINDOW; /* window us */ + data.sample_width = DEFAULT_SAMPLE_WIDTH; /* width us */ + + ret = 0; + +out: + return ret; + +} + +/* + * simple_data_read - Wrapper read function for global state debugfs entries + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * @entry: The entry to read from + * + * This function provides a generic read implementation for the global state + * "data" structure debugfs filesystem entries. It would be nice to use + * simple_attr_read directly, but we need to make sure that the data.lock + * mutex is held during the actual read (even though we likely won't ever + * actually race here as the updater runs under a stop_machine context). 
+ */ +static ssize_t simple_data_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos, const u64 *entry) +{ + char buf[U64STR_SIZE]; + u64 val = 0; + int len = 0; + + memset(buf, 0, sizeof(buf)); + + if (!entry) + return -EFAULT; + + mutex_lock(&data.lock); + val = *entry; + mutex_unlock(&data.lock); + + len = snprintf(buf, sizeof(buf), "%llu\n", (unsigned long long)val); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); + +} + +/* + * simple_data_write - Wrapper write function for global state debugfs entries + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to write value from + * @cnt: The maximum number of bytes to write + * @ppos: The current "file" position + * @entry: The entry to write to + * + * This function provides a generic write implementation for the global state + * "data" structure debugfs filesystem entries. It would be nice to use + * simple_attr_write directly, but we need to make sure that the data.lock + * mutex is held during the actual write (even though we likely won't ever + * actually race here as the updater runs under a stop_machine context). 
+ */ +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, u64 *entry) +{ + char buf[U64STR_SIZE]; + int csize = min(cnt, sizeof(buf)); + u64 val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[U64STR_SIZE-1] = '\0'; /* just in case */ + err = strict_strtoull(buf, 10, &val); + if (err) + return -EINVAL; + + mutex_lock(&data.lock); + *entry = val; + mutex_unlock(&data.lock); + + return csize; +} + +/** + * debug_count_fopen - Open function for "count" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "count" debugfs + * interface to the hardware latency detector. + */ +static int debug_count_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_count_fread - Read function for "count" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "count" debugfs + * interface to the hardware latency detector. Can be used to read the + * number of latency readings exceeding the configured threshold since + * the detector was last reset (e.g. by writing a zero into "count"). 
+ */ +static ssize_t debug_count_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.count); +} + +/** + * debug_count_fwrite - Write function for "count" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "count" debugfs + * interface to the hardware latency detector. Can be used to write a + * desired value, especially to zero the total count. + */ +static ssize_t debug_count_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + return simple_data_write(filp, ubuf, cnt, ppos, &data.count); +} + +/** + * debug_enable_fopen - Dummy open function for "enable" debugfs interface + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "enable" debugfs + * interface to the hardware latency detector. + */ +static int debug_enable_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_enable_fread - Read function for "enable" debugfs interface + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "enable" debugfs + * interface to the hardware latency detector. Can be used to determine + * whether the detector is currently enabled ("0\n" or "1\n" returned). 
+ */ +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[4]; + + if ((cnt < sizeof(buf)) || (*ppos)) + return 0; + + buf[0] = enabled ? '1' : '0'; + buf[1] = '\n'; + buf[2] = '\0'; + if (copy_to_user(ubuf, buf, strlen(buf))) + return -EFAULT; + return *ppos = strlen(buf); +} + +/** + * debug_enable_fwrite - Write function for "enable" debugfs interface + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "enable" debugfs + * interface to the hardware latency detector. Can be used to enable or + * disable the detector, which will have the side-effect of possibly + * also resetting the global stats and kicking off the measuring + * kthread (on an enable) or the converse (upon a disable). 
+ */ +static ssize_t debug_enable_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + char buf[4]; + int csize = min(cnt, sizeof(buf)); + long val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[sizeof(buf)-1] = '\0'; /* just in case */ + err = strict_strtoul(buf, 10, &val); + if (0 != err) + return -EINVAL; + + if (val) { + if (enabled) + goto unlock; + enabled = 1; + __reset_stats(); + if (start_kthread()) + return -EFAULT; + } else { + if (!enabled) + goto unlock; + enabled = 0; + err = stop_kthread(); + if (err) { + printk(KERN_ERR BANNER "cannot stop kthread\n"); + return -EFAULT; + } + wake_up(&data.wq); /* reader(s) should return */ + } +unlock: + return csize; +} + +/** + * debug_max_fopen - Open function for "max" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "max" debugfs + * interface to the hardware latency detector. + */ +static int debug_max_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_max_fread - Read function for "max" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "max" debugfs + * interface to the hardware latency detector. Can be used to determine + * the maximum latency value observed since it was last reset. 
+ */ +static ssize_t debug_max_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.max_sample); +} + +/** + * debug_max_fwrite - Write function for "max" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "max" debugfs + * interface to the hardware latency detector. Can be used to reset the + * maximum or set it to some other desired value - if, then, subsequent + * measurements exceed this value, the maximum will be updated. + */ +static ssize_t debug_max_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + return simple_data_write(filp, ubuf, cnt, ppos, &data.max_sample); +} + + +/** + * debug_sample_fopen - An open function for "sample" debugfs interface + * @inode: The in-kernel inode representation of this debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function handles opening the "sample" file within the hardware + * latency detector debugfs directory interface. This file is used to read + * raw samples from the global ring_buffer and allows the user to see a + * running latency history. Can be opened blocking or non-blocking, + * affecting whether it behaves as a buffer read pipe, or does not. + * Implements simple locking to prevent multiple simultaneous use. 
+ */ +static int debug_sample_fopen(struct inode *inode, struct file *filp) +{ + if (!atomic_add_unless(&data.sample_open, 1, 1)) + return -EBUSY; + else + return 0; +} + +/** + * debug_sample_fread - A read function for "sample" debugfs interface + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that will contain the samples read + * @cnt: The maximum bytes to read from the debugfs "file" + * @ppos: The current position in the debugfs "file" + * + * This function handles reading from the "sample" file within the hardware + * latency detector debugfs directory interface. This file is used to read + * raw samples from the global ring_buffer and allows the user to see a + * running latency history. By default this will block pending a new + * value written into the sample buffer, unless there are already a + * number of value(s) waiting in the buffer, or the sample file was + * previously opened in a non-blocking mode of operation. + */ +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int len = 0; + char buf[64]; + struct sample *sample = NULL; + + if (!enabled) + return 0; + + sample = kzalloc(sizeof(struct sample), GFP_KERNEL); + if (!sample) + return -ENOMEM; + + while (!buffer_get_sample(sample)) { + + DEFINE_WAIT(wait); + + if (filp->f_flags & O_NONBLOCK) { + len = -EAGAIN; + goto out; + } + + prepare_to_wait(&data.wq, &wait, TASK_INTERRUPTIBLE); + schedule(); + finish_wait(&data.wq, &wait); + + if (signal_pending(current)) { + len = -EINTR; + goto out; + } + + if (!enabled) { /* enable was toggled */ + len = 0; + goto out; + } + } + + len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\n", + sample->timestamp.tv_sec, + sample->timestamp.tv_nsec, + sample->duration); + + + /* handling partial reads is more trouble than it's worth, so a buffer + * too small for one whole record is rejected (never return > cnt) */ + if (len > cnt) + len = -EINVAL; + else if (copy_to_user(ubuf, buf, len)) + len = -EFAULT; + +out: + kfree(sample); + return len; +} + +/** 
+ * debug_sample_release - Release function for "sample" debugfs interface + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function completes the close of the debugfs interface "sample" file. + * Frees the sample_open "lock" so that other users may open the interface. + */ +static int debug_sample_release(struct inode *inode, struct file *filp) +{ + atomic_dec(&data.sample_open); + + return 0; +} + +/** + * debug_threshold_fopen - Open function for "threshold" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "threshold" debugfs + * interface to the hardware latency detector. + */ +static int debug_threshold_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_threshold_fread - Read function for "threshold" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "threshold" debugfs + * interface to the hardware latency detector. It can be used to determine + * the current threshold level at which a latency will be recorded in the + * global ring buffer, typically on the order of 10us. 
+ */ +static ssize_t debug_threshold_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.threshold); +} + +/** + * debug_threshold_fwrite - Write function for "threshold" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "threshold" debugfs + * interface to the hardware latency detector. It can be used to configure + * the threshold level at which any subsequently detected latencies will + * be recorded into the global ring buffer. + */ +static ssize_t debug_threshold_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + int ret; + + ret = simple_data_write(filp, ubuf, cnt, ppos, &data.threshold); + + if (enabled) + wake_up_process(kthread); + + return ret; +} + +/** + * debug_width_fopen - Open function for "width" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "width" debugfs + * interface to the hardware latency detector. + */ +static int debug_width_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_width_fread - Read function for "width" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "width" debugfs + * interface to the hardware latency detector. 
It can be used to determine + * for how many us of the total window us we will actively sample for any + * hardware-induced latency periods. Obviously, it is not possible to + * sample constantly and have the system respond to a sample reader, or, + * worse, without having the system appear to have gone out to lunch. + */ +static ssize_t debug_width_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_width); +} + +/** + * debug_width_fwrite - Write function for "width" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "width" debugfs + * interface to the hardware latency detector. It can be used to configure + * for how many us of the total window us we will actively sample for any + * hardware-induced latency periods. Obviously, it is not possible to + * sample constantly and have the system respond to a sample reader, or, + * worse, without having the system appear to have gone out to lunch. It + * is enforced that width is less than the total window size. 
+ */ +static ssize_t debug_width_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + char buf[U64STR_SIZE]; + int csize = min(cnt, sizeof(buf)); + u64 val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[U64STR_SIZE-1] = '\0'; /* just in case */ + err = strict_strtoull(buf, 10, &val); + if (0 != err) + return -EINVAL; + + mutex_lock(&data.lock); + if (val < data.sample_window) + data.sample_width = val; + else { + mutex_unlock(&data.lock); + return -EINVAL; + } + mutex_unlock(&data.lock); + + if (enabled) + wake_up_process(kthread); + + return csize; +} + +/** + * debug_window_fopen - Open function for "window" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "window" debugfs + * interface to the hardware latency detector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. + */ +static int debug_window_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_window_fread - Read function for "window" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "window" debugfs + * interface to the hardware latency detector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. Can be used to read the total window size. 
+ */ +static ssize_t debug_window_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_window); +} + +/** + * debug_window_fwrite - Write function for "window" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "window" debugfs + * interface to the hardware latency detector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. Can be used to write a new total window size. It + * is enforced that any value written must be greater than the sample width + * size, or -EINVAL is returned. + */ +static ssize_t debug_window_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + char buf[U64STR_SIZE]; + int csize = min(cnt, sizeof(buf)); /* never copy more than buf can hold */ + u64 val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[U64STR_SIZE-1] = '\0'; /* just in case */ + err = strict_strtoull(buf, 10, &val); + if (0 != err) + return -EINVAL; + + mutex_lock(&data.lock); + if (data.sample_width < val) /* window must stay strictly above the width */ + data.sample_window = val; + else { + mutex_unlock(&data.lock); + return -EINVAL; + } + mutex_unlock(&data.lock); + + return csize; /* bytes consumed, capped at sizeof(buf) */ +} + +/* + * Function pointers for the "count" debugfs file operations + */ +static const struct file_operations count_fops = { + .open = debug_count_fopen, + .read = debug_count_fread, + .write = debug_count_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "enable" debugfs file operations + */ +static const struct file_operations enable_fops = { + .open = 
debug_enable_fopen, + .read = debug_enable_fread, + .write = debug_enable_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "max" debugfs file operations + */ +static const struct file_operations max_fops = { + .open = debug_max_fopen, + .read = debug_max_fread, + .write = debug_max_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "sample" debugfs file operations (no .write handler) + */ +static const struct file_operations sample_fops = { + .open = debug_sample_fopen, + .read = debug_sample_fread, + .release = debug_sample_release, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "threshold" debugfs file operations + */ +static const struct file_operations threshold_fops = { + .open = debug_threshold_fopen, + .read = debug_threshold_fread, + .write = debug_threshold_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "width" debugfs file operations + */ +static const struct file_operations width_fops = { + .open = debug_width_fopen, + .read = debug_width_fread, + .write = debug_width_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "window" debugfs file operations + */ +static const struct file_operations window_fops = { + .open = debug_window_fopen, + .read = debug_window_fread, + .write = debug_window_fwrite, + .owner = THIS_MODULE, +}; + +/** + * init_debugfs - A function to initialize the debugfs interface files + * + * This function creates entries in debugfs for "hwlat_detector", including + * files to read values from the detector, current samples, and the + * maximum sample that has been captured since the hardware latency + * detector was started. Returns 0 on success, or -ENOMEM after removing the entries already created. 
+ */ +static int init_debugfs(void) +{ + int ret = -ENOMEM; + + debug_dir = debugfs_create_dir(DRVNAME, NULL); + if (!debug_dir) + goto err_debug_dir; + + debug_sample = debugfs_create_file("sample", 0444, + debug_dir, NULL, + &sample_fops); + if (!debug_sample) + goto err_sample; + + debug_count = debugfs_create_file("count", 0444, /* NOTE(review): read-only mode although count_fops supplies .write - confirm intended */ + debug_dir, NULL, + &count_fops); + if (!debug_count) + goto err_count; + + debug_max = debugfs_create_file("max", 0444, /* NOTE(review): read-only mode although max_fops supplies .write - confirm intended */ + debug_dir, NULL, + &max_fops); + if (!debug_max) + goto err_max; + + debug_sample_window = debugfs_create_file("window", 0644, + debug_dir, NULL, + &window_fops); + if (!debug_sample_window) + goto err_window; + + debug_sample_width = debugfs_create_file("width", 0644, + debug_dir, NULL, + &width_fops); + if (!debug_sample_width) + goto err_width; + + debug_threshold = debugfs_create_file("threshold", 0644, + debug_dir, NULL, + &threshold_fops); + if (!debug_threshold) + goto err_threshold; + + debug_enable = debugfs_create_file("enable", 0644, + debug_dir, &enabled, + &enable_fops); + if (!debug_enable) + goto err_enable; + + else { + ret = 0; + goto out; + } + +err_enable: /* unwind in reverse creation order; each label removes the entry created just before the failing one */ + debugfs_remove(debug_threshold); +err_threshold: + debugfs_remove(debug_sample_width); +err_width: + debugfs_remove(debug_sample_window); +err_window: + debugfs_remove(debug_max); +err_max: + debugfs_remove(debug_count); +err_count: + debugfs_remove(debug_sample); +err_sample: + debugfs_remove(debug_dir); +err_debug_dir: +out: + return ret; +} + +/** + * free_debugfs - A function to cleanup the debugfs file interface + */ +static void free_debugfs(void) +{ + /* could also use a debugfs_remove_recursive */ + debugfs_remove(debug_enable); /* files first, directory last */ + debugfs_remove(debug_threshold); + debugfs_remove(debug_sample_width); + debugfs_remove(debug_sample_window); + debugfs_remove(debug_max); + debugfs_remove(debug_count); + debugfs_remove(debug_sample); + debugfs_remove(debug_dir); +} + +/** + * detector_init - Standard module initialization code + */ +static 
int detector_init(void) +{ + int ret = -ENOMEM; + + printk(KERN_INFO BANNER "version %s\n", VERSION); + + ret = init_stats(); + if (0 != ret) + goto out; + + ret = init_debugfs(); + if (0 != ret) + goto err_stats; + + if (enabled) + ret = start_kthread(); /* NOTE(review): a failure here is returned without the err_stats cleanup and without removing the debugfs entries - confirm */ + + goto out; + +err_stats: + ring_buffer_free(ring_buffer); +out: + return ret; + +} + +/** + * detector_exit - Standard module cleanup code + */ +static void detector_exit(void) +{ + int err; + + if (enabled) { + enabled = 0; /* clear the flag before asking the sampling thread to stop */ + err = stop_kthread(); + if (err) + printk(KERN_ERR BANNER "cannot stop kthread\n"); + } + + free_debugfs(); + ring_buffer_free(ring_buffer); /* free up the ring buffer */ + +} + +module_init(detector_init); +module_exit(detector_exit); diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c index c5a7a85..a525eb8 100644 --- a/drivers/mmc/card/queue.c +++ b/drivers/mmc/card/queue.c @@ -195,7 +195,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, spinlock_t *lock sg_init_table(mq->sg, host->max_phys_segs); } - init_MUTEX(&mq->thread_sem); + sema_init(&mq->thread_sem, 1); mq->thread = kthread_run(mmc_queue_thread, mq, "mmcqd"); if (IS_ERR(mq->thread)) { diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c index 36c4191..ca43ce6 100644 --- a/drivers/net/3c527.c +++ b/drivers/net/3c527.c @@ -522,7 +522,7 @@ static int __init mc32_probe1(struct net_device *dev, int slot) lp->tx_len = lp->exec_box->data[9]; /* Transmit list count */ lp->rx_len = lp->exec_box->data[11]; /* Receive list count */ - init_MUTEX_LOCKED(&lp->cmd_mutex); + sema_init(&lp->cmd_mutex, 0); init_completion(&lp->execution_cmd); init_completion(&lp->xceiver_cmd); diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c index 39db0e9..43dcc78 100644 --- a/drivers/net/3c59x.c +++ b/drivers/net/3c59x.c @@ -797,9 +797,9 @@ static void poll_vortex(struct net_device *dev) { struct vortex_private *vp = netdev_priv(dev); unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); 
(vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #endif @@ -1764,6 +1764,7 @@ vortex_timer(unsigned long data) int next_tick = 60*HZ; int ok = 0; int media_status, old_window; + unsigned long flags; if (vortex_debug > 2) { pr_debug("%s: Media selection timer tick happened, %s.\n", @@ -1771,7 +1772,7 @@ vortex_timer(unsigned long data) pr_debug("dev->watchdog_timeo=%d\n", dev->watchdog_timeo); } - disable_irq_lockdep(dev->irq); + spin_lock_irqsave(&vp->lock, flags); old_window = ioread16(ioaddr + EL3_CMD) >> 13; EL3WINDOW(4); media_status = ioread16(ioaddr + Wn4_Media); @@ -1794,10 +1795,7 @@ vortex_timer(unsigned long data) case XCVR_MII: case XCVR_NWAY: { ok = 1; - /* Interrupts are already disabled */ - spin_lock(&vp->lock); vortex_check_media(dev, 0); - spin_unlock(&vp->lock); } break; default: /* Other media types handled by Tx timeouts. */ @@ -1851,7 +1849,7 @@ leave_media_alone: dev->name, media_tbl[dev->if_port].name); EL3WINDOW(old_window); - enable_irq_lockdep(dev->irq); + spin_unlock_irqrestore(&vp->lock, flags); mod_timer(&vp->timer, RUN_AT(next_tick)); if (vp->deferred) iowrite16(FakeIntr, ioaddr + EL3_CMD); @@ -1885,12 +1883,12 @@ static void vortex_tx_timeout(struct net_device *dev) * Block interrupts because vortex_interrupt does a bare spin_lock() */ unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); if (vp->full_bus_master_tx) boomerang_interrupt(dev->irq, dev); else vortex_interrupt(dev->irq, dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); } } diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c index 25f7339..3b8ff0d 100644 --- a/drivers/net/8139too.c +++ b/drivers/net/8139too.c @@ -2195,7 +2195,11 @@ static irqreturn_t rtl8139_interrupt (int irq, void *dev_instance) */ static void rtl8139_poll_controller(struct net_device *dev) { - disable_irq(dev->irq); + /* + * use _nosync() variant - might be used by 
netconsole + * from atomic contexts: + */ + disable_irq_nosync(dev->irq); rtl8139_interrupt(dev->irq, dev); enable_irq(dev->irq); } diff --git a/drivers/net/arm/at91_ether.c b/drivers/net/arm/at91_ether.c index c8bc60a..42230fc 100644 --- a/drivers/net/arm/at91_ether.c +++ b/drivers/net/arm/at91_ether.c @@ -198,7 +198,9 @@ static irqreturn_t at91ether_phy_interrupt(int irq, void *dev_id) struct net_device *dev = (struct net_device *) dev_id; struct at91_private *lp = netdev_priv(dev); unsigned int phy; + unsigned long flags; + spin_lock_irqsave(&lp->lock, flags); /* * This hander is triggered on both edges, but the PHY chips expect * level-triggering. We therefore have to check if the PHY actually has @@ -240,6 +242,7 @@ static irqreturn_t at91ether_phy_interrupt(int irq, void *dev_id) done: disable_mdi(); + spin_unlock_irqrestore(&lp->lock, flags); return IRQ_HANDLED; } @@ -396,9 +399,11 @@ static void at91ether_check_link(unsigned long dev_id) struct net_device *dev = (struct net_device *) dev_id; struct at91_private *lp = netdev_priv(dev); + spin_lock_irq(&lp->lock); enable_mdi(); update_linkspeed(dev, 1); disable_mdi(); + spin_unlock_irq(&lp->lock); mod_timer(&lp->check_timer, jiffies + LINK_POLL_INTERVAL); } diff --git a/drivers/net/atl1c/atl1c_main.c b/drivers/net/atl1c/atl1c_main.c index 2f4be59..5b44d14 100644 --- a/drivers/net/atl1c/atl1c_main.c +++ b/drivers/net/atl1c/atl1c_main.c @@ -2064,11 +2064,8 @@ static netdev_tx_t atl1c_xmit_frame(struct sk_buff *skb, } tpd_req = atl1c_cal_tpd_req(skb); - if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) { - if (netif_msg_pktdata(adapter)) - dev_info(&adapter->pdev->dev, "tx locked\n"); - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&adapter->tx_lock, flags); + if (skb->mark == 0x01) type = atl1c_trans_high; else diff --git a/drivers/net/atl1e/atl1e_main.c b/drivers/net/atl1e/atl1e_main.c index 08f8c09..5df9cfd 100644 --- a/drivers/net/atl1e/atl1e_main.c +++ b/drivers/net/atl1e/atl1e_main.c @@ -1823,8 
+1823,7 @@ static netdev_tx_t atl1e_xmit_frame(struct sk_buff *skb, return NETDEV_TX_OK; } tpd_req = atl1e_cal_tdp_req(skb); - if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) - return NETDEV_TX_LOCKED; + spin_lock_irqsave(&adapter->tx_lock, flags); if (atl1e_tpd_avail(adapter) < tpd_req) { /* no enough descriptor, just stop queue */ diff --git a/drivers/net/benet/be_cmds.c b/drivers/net/benet/be_cmds.c index fee6eee..006cb2e 100644 --- a/drivers/net/benet/be_cmds.c +++ b/drivers/net/benet/be_cmds.c @@ -296,6 +296,7 @@ static void be_cmd_hdr_prepare(struct be_cmd_req_hdr *req_hdr, req_hdr->opcode = opcode; req_hdr->subsystem = subsystem; req_hdr->request_length = cpu_to_le32(cmd_len - sizeof(*req_hdr)); + req_hdr->version = 0; } static void be_cmd_page_addrs_prepare(struct phys_addr *pages, u32 max_pages, diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 65df1de..d0bf40d 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -2863,7 +2863,7 @@ bnx2_tx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget) if (unlikely(netif_tx_queue_stopped(txq)) && (bnx2_tx_avail(bp, txr) > bp->tx_wake_thresh)) { - __netif_tx_lock(txq, smp_processor_id()); + __netif_tx_lock(txq); if ((netif_tx_queue_stopped(txq)) && (bnx2_tx_avail(bp, txr) > bp->tx_wake_thresh)) netif_tx_wake_queue(txq); diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c index 109d278..abd372a 100644 --- a/drivers/net/chelsio/sge.c +++ b/drivers/net/chelsio/sge.c @@ -1671,8 +1671,7 @@ static int t1_sge_tx(struct sk_buff *skb, struct adapter *adapter, struct cmdQ *q = &sge->cmdQ[qid]; unsigned int credits, pidx, genbit, count, use_sched_skb = 0; - if (!spin_trylock(&q->lock)) - return NETDEV_TX_LOCKED; + spin_lock(&q->lock); reclaim_completed_tx(sge, q); diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index d29bb53..7655436 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -4006,11 +4006,21 @@ check_page: } } - if 
(!buffer_info->dma) + if (!buffer_info->dma) { buffer_info->dma = pci_map_page(pdev, buffer_info->page, 0, buffer_info->length, PCI_DMA_FROMDEVICE); + if (pci_dma_mapping_error(pdev, buffer_info->dma)) { + put_page(buffer_info->page); + dev_kfree_skb(skb); + buffer_info->page = NULL; + buffer_info->skb = NULL; + buffer_info->dma = 0; + adapter->alloc_rx_buff_failed++; + break; /* while !buffer_info->skb */ + } + } rx_desc = E1000_RX_DESC(*rx_ring, i); rx_desc->buffer_addr = cpu_to_le64(buffer_info->dma); @@ -4101,6 +4111,13 @@ map_skb: skb->data, buffer_info->length, PCI_DMA_FROMDEVICE); + if (pci_dma_mapping_error(pdev, buffer_info->dma)) { + dev_kfree_skb(skb); + buffer_info->skb = NULL; + buffer_info->dma = 0; + adapter->alloc_rx_buff_failed++; + break; /* while !buffer_info->skb */ + } /* * XXX if it was allocated cleanly it will never map to a diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 689b9bd..04cddd9 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -607,7 +607,7 @@ static int sixpack_open(struct tty_struct *tty) spin_lock_init(&sp->lock); atomic_set(&sp->refcnt, 1); - init_MUTEX_LOCKED(&sp->dead_sem); + sema_init(&sp->dead_sem, 0); /* !!! length of the buffers. MTU is IP MTU, not PACLEN! 
*/ diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index 7db0a1c..e29e9eb 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -746,7 +746,7 @@ static int mkiss_open(struct tty_struct *tty) spin_lock_init(&ax->buflock); atomic_set(&ax->refcnt, 1); - init_MUTEX_LOCKED(&ax->dead_sem); + sema_init(&ax->dead_sem, 0); ax->tty = tty; tty->disc_data = ax; diff --git a/drivers/net/irda/sir_dev.c b/drivers/net/irda/sir_dev.c index 4b2a1a9..8153eb6 100644 --- a/drivers/net/irda/sir_dev.c +++ b/drivers/net/irda/sir_dev.c @@ -909,7 +909,7 @@ struct sir_dev * sirdev_get_instance(const struct sir_driver *drv, const char *n dev->tx_skb = NULL; spin_lock_init(&dev->tx_lock); - init_MUTEX(&dev->fsm.sem); + sema_init(&dev->fsm.sem, 1); dev->drv = drv; dev->netdev = ndev; diff --git a/drivers/net/ixgbe/ixgbe_82598.c b/drivers/net/ixgbe/ixgbe_82598.c index 3103f41..35a06b4 100644 --- a/drivers/net/ixgbe/ixgbe_82598.c +++ b/drivers/net/ixgbe/ixgbe_82598.c @@ -357,12 +357,34 @@ static s32 ixgbe_fc_enable_82598(struct ixgbe_hw *hw, s32 packetbuf_num) u32 fctrl_reg; u32 rmcs_reg; u32 reg; + u32 link_speed = 0; + bool link_up; #ifdef CONFIG_DCB if (hw->fc.requested_mode == ixgbe_fc_pfc) goto out; #endif /* CONFIG_DCB */ + /* + * On 82598 having Rx FC on causes resets while doing 1G + * so if it's on turn it off once we know link_speed. For + * more details see 82598 Specification update. 
+ */ + hw->mac.ops.check_link(hw, &link_speed, &link_up, false); + if (link_up && link_speed == IXGBE_LINK_SPEED_1GB_FULL) { + switch (hw->fc.requested_mode) { + case ixgbe_fc_full: + hw->fc.requested_mode = ixgbe_fc_tx_pause; + break; + case ixgbe_fc_rx_pause: + hw->fc.requested_mode = ixgbe_fc_none; + break; + default: + /* no change */ + break; + } + } + /* Negotiate the fc mode to use */ ret_val = ixgbe_fc_autoneg(hw); if (ret_val) diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index 7b7c848..951b73c 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -5763,6 +5763,10 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev, if (err) goto err_sw_init; + /* Make it possible the adapter to be woken up via WOL */ + if (adapter->hw.mac.type == ixgbe_mac_82599EB) + IXGBE_WRITE_REG(&adapter->hw, IXGBE_WUS, ~0); + /* * If there is a fan on this device and it has failed log the * failure. diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index b9fcc98..d3fee9a 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -65,6 +65,14 @@ struct pcpu_lstats { unsigned long drops; }; +#ifdef CONFIG_PREEMPT_RT +# define xmit_get_cpu() get_cpu() +# define xmit_put_cpu() put_cpu() +#else +# define xmit_get_cpu() smp_processor_id() +# define xmit_put_cpu() do { } while (0) +#endif + /* * The higher levels take care of making this non-reentrant (it's * called with bh's disabled). 
@@ -73,23 +81,24 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb, struct net_device *dev) { struct pcpu_lstats *pcpu_lstats, *lb_stats; - int len; + int len, res; skb_orphan(skb); skb->protocol = eth_type_trans(skb, dev); + len = skb->len; + res = netif_rx_ni(skb) ; - /* it's OK to use per_cpu_ptr() because BHs are off */ pcpu_lstats = dev->ml_priv; - lb_stats = this_cpu_ptr(pcpu_lstats); + lb_stats = per_cpu_ptr(pcpu_lstats, xmit_get_cpu()); - len = skb->len; - if (likely(netif_rx(skb) == NET_RX_SUCCESS)) { + if (likely(res == NET_RX_SUCCESS)) { lb_stats->bytes += len; lb_stats->packets++; } else lb_stats->drops++; + xmit_put_cpu(); return NETDEV_TX_OK; } diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index bc72d6e..13343e8 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -40,6 +40,7 @@ #include <linux/mutex.h> #include <linux/radix-tree.h> #include <linux/timer.h> +#include <linux/semaphore.h> #include <linux/workqueue.h> #include <linux/mlx4/device.h> diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index af67af5..3e01ec6 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -508,7 +508,7 @@ static void txq_maybe_wake(struct tx_queue *txq) struct netdev_queue *nq = netdev_get_tx_queue(mp->dev, txq->index); if (netif_tx_queue_stopped(nq)) { - __netif_tx_lock(nq, smp_processor_id()); + __netif_tx_lock(nq); if (txq->tx_ring_size - txq->tx_desc_count >= MAX_SKB_FRAGS + 1) netif_tx_wake_queue(nq); __netif_tx_unlock(nq); @@ -901,7 +901,7 @@ static void txq_kick(struct tx_queue *txq) u32 hw_desc_ptr; u32 expected_ptr; - __netif_tx_lock(nq, smp_processor_id()); + __netif_tx_lock(nq); if (rdlp(mp, TXQ_COMMAND) & (1 << txq->index)) goto out; @@ -925,7 +925,7 @@ static int txq_reclaim(struct tx_queue *txq, int budget, int force) struct netdev_queue *nq = netdev_get_tx_queue(mp->dev, txq->index); int reclaimed; - __netif_tx_lock(nq, smp_processor_id()); + __netif_tx_lock(nq); reclaimed = 0; 
while (reclaimed < budget && txq->tx_desc_count > 0) { diff --git a/drivers/net/netxen/netxen_nic_init.c b/drivers/net/netxen/netxen_nic_init.c index 64cff68..c4e38ca 100644 --- a/drivers/net/netxen/netxen_nic_init.c +++ b/drivers/net/netxen/netxen_nic_init.c @@ -1625,7 +1625,7 @@ int netxen_process_cmd_ring(struct netxen_adapter *adapter) smp_mb(); if (netif_queue_stopped(netdev) && netif_carrier_ok(netdev)) { - __netif_tx_lock(tx_ring->txq, smp_processor_id()); + __netif_tx_lock(tx_ring->txq); if (netxen_tx_avail(tx_ring) > TX_STOP_THRESH) { netif_wake_queue(netdev); adapter->tx_timeo_cnt = 0; diff --git a/drivers/net/niu.c b/drivers/net/niu.c index 2aed2b3..1cd2408 100644 --- a/drivers/net/niu.c +++ b/drivers/net/niu.c @@ -3677,7 +3677,7 @@ static void niu_tx_work(struct niu *np, struct tx_ring_info *rp) out: if (unlikely(netif_tx_queue_stopped(txq) && (niu_tx_avail(rp) > NIU_TX_WAKEUP_THRESH(rp)))) { - __netif_tx_lock(txq, smp_processor_id()); + __netif_tx_lock(txq); if (netif_tx_queue_stopped(txq) && (niu_tx_avail(rp) > NIU_TX_WAKEUP_THRESH(rp))) netif_tx_wake_queue(txq); diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c index 6a375ea..fd2ae3b 100644 --- a/drivers/net/ppp_async.c +++ b/drivers/net/ppp_async.c @@ -183,7 +183,7 @@ ppp_asynctty_open(struct tty_struct *tty) tasklet_init(&ap->tsk, ppp_async_process, (unsigned long) ap); atomic_set(&ap->refcnt, 1); - init_MUTEX_LOCKED(&ap->dead_sem); + sema_init(&ap->dead_sem, 0); ap->chan.private = ap; ap->chan.ops = &async_ops; diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index ede937e..53345da 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -175,11 +175,7 @@ static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev) u16 destid; unsigned long flags; - local_irq_save(flags); - if (!spin_trylock(&rnet->tx_lock)) { - local_irq_restore(flags); - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&rnet->tx_lock, flags); if ((rnet->tx_cnt + 1) > RIONET_TX_RING_SIZE) 
{ netif_stop_queue(ndev); diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c index 3c4836d..3b40857 100644 --- a/drivers/net/s2io.c +++ b/drivers/net/s2io.c @@ -4083,7 +4083,6 @@ static netdev_tx_t s2io_xmit(struct sk_buff *skb, struct net_device *dev) unsigned long flags = 0; u16 vlan_tag = 0; struct fifo_info *fifo = NULL; - int do_spin_lock = 1; int offload_type; int enable_per_list_interrupt = 0; struct config_param *config = &sp->config; @@ -4136,7 +4135,6 @@ static netdev_tx_t s2io_xmit(struct sk_buff *skb, struct net_device *dev) queue += sp->udp_fifo_idx; if (skb->len > 1024) enable_per_list_interrupt = 1; - do_spin_lock = 0; } } } @@ -4146,12 +4144,7 @@ static netdev_tx_t s2io_xmit(struct sk_buff *skb, struct net_device *dev) [skb->priority & (MAX_TX_FIFOS - 1)]; fifo = &mac_control->fifos[queue]; - if (do_spin_lock) - spin_lock_irqsave(&fifo->tx_lock, flags); - else { - if (unlikely(!spin_trylock_irqsave(&fifo->tx_lock, flags))) - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&fifo->tx_lock, flags); if (sp->config.multiq) { if (__netif_subqueue_stopped(dev, fifo->fifo_no)) { diff --git a/drivers/net/sfc/efx.c b/drivers/net/sfc/efx.c index 103e8b0..46997e1 100644 --- a/drivers/net/sfc/efx.c +++ b/drivers/net/sfc/efx.c @@ -2284,6 +2284,7 @@ static int __devinit efx_pci_probe(struct pci_dev *pci_dev, fail2: efx_fini_struct(efx); fail1: + WARN_ON(rc > 0); EFX_LOG(efx, "initialisation failed. rc=%d\n", rc); free_netdev(net_dev); return rc; diff --git a/drivers/net/sfc/falcon_boards.c b/drivers/net/sfc/falcon_boards.c index bf0b96a..5712fdd 100644 --- a/drivers/net/sfc/falcon_boards.c +++ b/drivers/net/sfc/falcon_boards.c @@ -29,6 +29,15 @@ #define FALCON_BOARD_SFN4111T 0x51 #define FALCON_BOARD_SFN4112F 0x52 +/* Board temperature is about 15°C above ambient when air flow is + * limited. 
*/ +#define FALCON_BOARD_TEMP_BIAS 15 + +/* SFC4000 datasheet says: 'The maximum permitted junction temperature + * is 125°C; the thermal design of the environment for the SFC4000 + * should aim to keep this well below 100°C.' */ +#define FALCON_JUNC_TEMP_MAX 90 + /***************************************************************************** * Support for LM87 sensor chip used on several boards */ @@ -548,16 +557,16 @@ fail_hwmon: static u8 sfe4002_lm87_channel = 0x03; /* use AIN not FAN inputs */ static const u8 sfe4002_lm87_regs[] = { - LM87_IN_LIMITS(0, 0x83, 0x91), /* 2.5V: 1.8V +/- 5% */ - LM87_IN_LIMITS(1, 0x51, 0x5a), /* Vccp1: 1.2V +/- 5% */ - LM87_IN_LIMITS(2, 0xb6, 0xca), /* 3.3V: 3.3V +/- 5% */ - LM87_IN_LIMITS(3, 0xb0, 0xc9), /* 5V: 4.6-5.2V */ - LM87_IN_LIMITS(4, 0xb0, 0xe0), /* 12V: 11-14V */ - LM87_IN_LIMITS(5, 0x44, 0x4b), /* Vccp2: 1.0V +/- 5% */ - LM87_AIN_LIMITS(0, 0xa0, 0xb2), /* AIN1: 1.66V +/- 5% */ - LM87_AIN_LIMITS(1, 0x91, 0xa1), /* AIN2: 1.5V +/- 5% */ - LM87_TEMP_INT_LIMITS(10, 60), /* board */ - LM87_TEMP_EXT1_LIMITS(10, 70), /* Falcon */ + LM87_IN_LIMITS(0, 0x7c, 0x99), /* 2.5V: 1.8V +/- 10% */ + LM87_IN_LIMITS(1, 0x4c, 0x5e), /* Vccp1: 1.2V +/- 10% */ + LM87_IN_LIMITS(2, 0xac, 0xd4), /* 3.3V: 3.3V +/- 10% */ + LM87_IN_LIMITS(3, 0xac, 0xd4), /* 5V: 5.0V +/- 10% */ + LM87_IN_LIMITS(4, 0xac, 0xe0), /* 12V: 10.8-14V */ + LM87_IN_LIMITS(5, 0x3f, 0x4f), /* Vccp2: 1.0V +/- 10% */ + LM87_AIN_LIMITS(0, 0x98, 0xbb), /* AIN1: 1.66V +/- 10% */ + LM87_AIN_LIMITS(1, 0x8a, 0xa9), /* AIN2: 1.5V +/- 10% */ + LM87_TEMP_INT_LIMITS(0, 80 + FALCON_BOARD_TEMP_BIAS), + LM87_TEMP_EXT1_LIMITS(0, FALCON_JUNC_TEMP_MAX), 0 }; @@ -619,14 +628,14 @@ static int sfe4002_init(struct efx_nic *efx) static u8 sfn4112f_lm87_channel = 0x03; /* use AIN not FAN inputs */ static const u8 sfn4112f_lm87_regs[] = { - LM87_IN_LIMITS(0, 0x83, 0x91), /* 2.5V: 1.8V +/- 5% */ - LM87_IN_LIMITS(1, 0x51, 0x5a), /* Vccp1: 1.2V +/- 5% */ - LM87_IN_LIMITS(2, 0xb6, 0xca), /* 3.3V: 3.3V +/- 
5% */ - LM87_IN_LIMITS(4, 0xb0, 0xe0), /* 12V: 11-14V */ - LM87_IN_LIMITS(5, 0x44, 0x4b), /* Vccp2: 1.0V +/- 5% */ - LM87_AIN_LIMITS(1, 0x91, 0xa1), /* AIN2: 1.5V +/- 5% */ - LM87_TEMP_INT_LIMITS(10, 60), /* board */ - LM87_TEMP_EXT1_LIMITS(10, 70), /* Falcon */ + LM87_IN_LIMITS(0, 0x7c, 0x99), /* 2.5V: 1.8V +/- 10% */ + LM87_IN_LIMITS(1, 0x4c, 0x5e), /* Vccp1: 1.2V +/- 10% */ + LM87_IN_LIMITS(2, 0xac, 0xd4), /* 3.3V: 3.3V +/- 10% */ + LM87_IN_LIMITS(4, 0xac, 0xe0), /* 12V: 10.8-14V */ + LM87_IN_LIMITS(5, 0x3f, 0x4f), /* Vccp2: 1.0V +/- 10% */ + LM87_AIN_LIMITS(1, 0x8a, 0xa9), /* AIN2: 1.5V +/- 10% */ + LM87_TEMP_INT_LIMITS(0, 60 + FALCON_BOARD_TEMP_BIAS), + LM87_TEMP_EXT1_LIMITS(0, FALCON_JUNC_TEMP_MAX), 0 }; diff --git a/drivers/net/sfc/mcdi.c b/drivers/net/sfc/mcdi.c index 9f035b9..f66b3da 100644 --- a/drivers/net/sfc/mcdi.c +++ b/drivers/net/sfc/mcdi.c @@ -127,7 +127,7 @@ static int efx_mcdi_poll(struct efx_nic *efx) efx_dword_t reg; /* Check for a reboot atomically with respect to efx_mcdi_copyout() */ - rc = efx_mcdi_poll_reboot(efx); + rc = -efx_mcdi_poll_reboot(efx); if (rc) goto out; diff --git a/drivers/net/sfc/qt202x_phy.c b/drivers/net/sfc/qt202x_phy.c index e0d13a4..67eec7a 100644 --- a/drivers/net/sfc/qt202x_phy.c +++ b/drivers/net/sfc/qt202x_phy.c @@ -320,7 +320,7 @@ static int qt202x_reset_phy(struct efx_nic *efx) falcon_board(efx)->type->init_phy(efx); - return rc; + return 0; fail: EFX_ERR(efx, "PHY reset timed out\n"); diff --git a/drivers/net/sungem.c b/drivers/net/sungem.c index b571a1b..c3a1180 100644 --- a/drivers/net/sungem.c +++ b/drivers/net/sungem.c @@ -1034,10 +1034,8 @@ static netdev_tx_t gem_start_xmit(struct sk_buff *skb, (csum_stuff_off << 21)); } - if (!spin_trylock_irqsave(&gp->tx_lock, flags)) { - /* Tell upper layer to requeue */ - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&gp->tx_lock, flags); + /* We raced with gem_do_stop() */ if (!gp->running) { spin_unlock_irqrestore(&gp->tx_lock, flags); diff --git 
a/drivers/net/tc35815.c b/drivers/net/tc35815.c index 75a669d..d71c197 100644 --- a/drivers/net/tc35815.c +++ b/drivers/net/tc35815.c @@ -1437,7 +1437,6 @@ static int tc35815_do_interrupt(struct net_device *dev, u32 status, int limit) /* Transmit complete. */ lp->lstats.tx_ints++; tc35815_txdone(dev); - netif_wake_queue(dev); if (ret < 0) ret = 0; } diff --git a/drivers/net/tehuti.c b/drivers/net/tehuti.c index 80b404f..1013891 100644 --- a/drivers/net/tehuti.c +++ b/drivers/net/tehuti.c @@ -1639,13 +1639,8 @@ static netdev_tx_t bdx_tx_transmit(struct sk_buff *skb, unsigned long flags; ENTER; - local_irq_save(flags); - if (!spin_trylock(&priv->tx_lock)) { - local_irq_restore(flags); - DBG("%s[%s]: TX locked, returning NETDEV_TX_LOCKED\n", - BDX_DRV_NAME, ndev->name); - return NETDEV_TX_LOCKED; - } + + spin_lock_irqsave(&priv->tx_lock, flags); /* build tx descriptor */ BDX_ASSERT(f->m.wptr >= f->m.memsz); /* started with valid wptr */ diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 7f82b02..5cdbe0c 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -4410,7 +4410,7 @@ static void tg3_tx(struct tg3_napi *tnapi) if (unlikely(netif_tx_queue_stopped(txq) && (tg3_tx_avail(tnapi) > TG3_TX_WAKEUP_THRESH(tnapi)))) { - __netif_tx_lock(txq, smp_processor_id()); + __netif_tx_lock(txq); if (netif_tx_queue_stopped(txq) && (tg3_tx_avail(tnapi) > TG3_TX_WAKEUP_THRESH(tnapi))) netif_tx_wake_queue(txq); diff --git a/drivers/net/tulip/tulip_core.c b/drivers/net/tulip/tulip_core.c index 20696b5..541a860 100644 --- a/drivers/net/tulip/tulip_core.c +++ b/drivers/net/tulip/tulip_core.c @@ -1837,6 +1837,7 @@ static void __devexit tulip_remove_one (struct pci_dev *pdev) pci_iounmap(pdev, tp->base_addr); free_netdev (dev); pci_release_regions (pdev); + pci_disable_device (pdev); pci_set_drvdata (pdev, NULL); /* pci_power_off (pdev, -1); */ diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index 4f27f02..5f3b9ea 100644 --- a/drivers/net/usb/cdc_ether.c +++ 
b/drivers/net/usb/cdc_ether.c @@ -584,6 +584,11 @@ static const struct usb_device_id products [] = { USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long) &mbm_info, }, { + /* Ericsson C3607w ver 2 */ + USB_DEVICE_AND_INTERFACE_INFO(0x0bdb, 0x190b, USB_CLASS_COMM, + USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE), + .driver_info = (unsigned long) &mbm_info, +}, { /* Toshiba F3507g */ USB_DEVICE_AND_INTERFACE_INFO(0x0930, 0x130b, USB_CLASS_COMM, USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE), diff --git a/drivers/net/via-velocity.c b/drivers/net/via-velocity.c index c93f58f..317aa34 100644 --- a/drivers/net/via-velocity.c +++ b/drivers/net/via-velocity.c @@ -1877,13 +1877,12 @@ static void velocity_error(struct velocity_info *vptr, int status) /** * tx_srv - transmit interrupt service * @vptr; Velocity - * @status: * * Scan the queues looking for transmitted packets that * we can complete and clean up. Update any statistics as * necessary/ */ -static int velocity_tx_srv(struct velocity_info *vptr, u32 status) +static int velocity_tx_srv(struct velocity_info *vptr) { struct tx_desc *td; int qnum; @@ -2090,14 +2089,12 @@ static int velocity_receive_frame(struct velocity_info *vptr, int idx) /** * velocity_rx_srv - service RX interrupt * @vptr: velocity - * @status: adapter status (unused) * * Walk the receive ring of the velocity adapter and remove * any received packets from the receive queue. Hand the ring * slots back to the adapter for reuse. 
*/ -static int velocity_rx_srv(struct velocity_info *vptr, int status, - int budget_left) +static int velocity_rx_srv(struct velocity_info *vptr, int budget_left) { struct net_device_stats *stats = &vptr->dev->stats; int rd_curr = vptr->rx.curr; @@ -2151,32 +2148,24 @@ static int velocity_poll(struct napi_struct *napi, int budget) struct velocity_info *vptr = container_of(napi, struct velocity_info, napi); unsigned int rx_done; - u32 isr_status; - - spin_lock(&vptr->lock); - isr_status = mac_read_isr(vptr->mac_regs); - - /* Ack the interrupt */ - mac_write_isr(vptr->mac_regs, isr_status); - if (isr_status & (~(ISR_PRXI | ISR_PPRXI | ISR_PTXI | ISR_PPTXI))) - velocity_error(vptr, isr_status); + unsigned long flags; + spin_lock_irqsave(&vptr->lock, flags); /* * Do rx and tx twice for performance (taken from the VIA * out-of-tree driver). */ - rx_done = velocity_rx_srv(vptr, isr_status, budget / 2); - velocity_tx_srv(vptr, isr_status); - rx_done += velocity_rx_srv(vptr, isr_status, budget - rx_done); - velocity_tx_srv(vptr, isr_status); - - spin_unlock(&vptr->lock); + rx_done = velocity_rx_srv(vptr, budget / 2); + velocity_tx_srv(vptr); + rx_done += velocity_rx_srv(vptr, budget - rx_done); + velocity_tx_srv(vptr); /* If budget not fully consumed, exit the polling mode */ if (rx_done < budget) { napi_complete(napi); mac_enable_int(vptr->mac_regs); } + spin_unlock_irqrestore(&vptr->lock, flags); return rx_done; } @@ -2206,10 +2195,17 @@ static irqreturn_t velocity_intr(int irq, void *dev_instance) return IRQ_NONE; } + /* Ack the interrupt */ + mac_write_isr(vptr->mac_regs, isr_status); + if (likely(napi_schedule_prep(&vptr->napi))) { mac_disable_int(vptr->mac_regs); __napi_schedule(&vptr->napi); } + + if (isr_status & (~(ISR_PRXI | ISR_PPRXI | ISR_PTXI | ISR_PPTXI))) + velocity_error(vptr, isr_status); + spin_unlock(&vptr->lock); return IRQ_HANDLED; @@ -3100,7 +3096,7 @@ static int velocity_resume(struct pci_dev *pdev) velocity_init_registers(vptr, VELOCITY_INIT_WOL); 
mac_disable_int(vptr->mac_regs); - velocity_tx_srv(vptr, 0); + velocity_tx_srv(vptr); for (i = 0; i < vptr->tx.numq; i++) { if (vptr->tx.used[i]) @@ -3344,6 +3340,7 @@ static int velocity_set_coalesce(struct net_device *dev, { struct velocity_info *vptr = netdev_priv(dev); int max_us = 0x3f * 64; + unsigned long flags; /* 6 bits of */ if (ecmd->tx_coalesce_usecs > max_us) @@ -3365,6 +3362,7 @@ static int velocity_set_coalesce(struct net_device *dev, ecmd->tx_coalesce_usecs); /* Setup the interrupt suppression and queue timers */ + spin_lock_irqsave(&vptr->lock, flags); mac_disable_int(vptr->mac_regs); setup_adaptive_interrupts(vptr); setup_queue_timers(vptr); @@ -3372,6 +3370,7 @@ static int velocity_set_coalesce(struct net_device *dev, mac_write_int_mask(vptr->int_mask, vptr->mac_regs); mac_clear_isr(vptr->mac_regs); mac_enable_int(vptr->mac_regs); + spin_unlock_irqrestore(&vptr->lock, flags); return 0; } diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c index b36bf96..c56ba75 100644 --- a/drivers/net/wan/cosa.c +++ b/drivers/net/wan/cosa.c @@ -575,7 +575,7 @@ static int cosa_probe(int base, int irq, int dma) /* Initialize the chardev data structures */ mutex_init(&chan->rlock); - init_MUTEX(&chan->wsem); + sema_init(&chan->wsem, 1); /* Register the network interface */ if (!(chan->netdev = alloc_hdlcdev(chan))) { diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c index fa12b90..29bf336 100644 --- a/drivers/net/wireless/ath/ath9k/xmit.c +++ b/drivers/net/wireless/ath/ath9k/xmit.c @@ -1615,7 +1615,7 @@ static int ath_tx_setup_buffer(struct ieee80211_hw *hw, struct ath_buf *bf, bf->bf_frmlen -= padsize; } - if (conf_is_ht(&hw->conf) && !is_pae(skb)) + if (conf_is_ht(&hw->conf)) bf->bf_state.bf_type |= BUF_HT; bf->bf_flags = setup_tx_flags(sc, skb, txctl->txq); @@ -1701,7 +1701,7 @@ static void ath_tx_start_dma(struct ath_softc *sc, struct ath_buf *bf, goto tx_done; } - if (tx_info->flags & IEEE80211_TX_CTL_AMPDU) { 
+ if ((tx_info->flags & IEEE80211_TX_CTL_AMPDU) && !is_pae(skb)) { /* * Try aggregation if it's a unicast data frame * and the destination is HT capable. diff --git a/drivers/net/wireless/b43/b43.h b/drivers/net/wireless/b43/b43.h index fe3bf94..c484cc2 100644 --- a/drivers/net/wireless/b43/b43.h +++ b/drivers/net/wireless/b43/b43.h @@ -115,6 +115,7 @@ #define B43_MMIO_TSF_2 0x636 /* core rev < 3 only */ #define B43_MMIO_TSF_3 0x638 /* core rev < 3 only */ #define B43_MMIO_RNG 0x65A +#define B43_MMIO_IFSSLOT 0x684 /* Interframe slot time */ #define B43_MMIO_IFSCTL 0x688 /* Interframe space control */ #define B43_MMIO_IFSCTL_USE_EDCF 0x0004 #define B43_MMIO_POWERUP_DELAY 0x6A8 diff --git a/drivers/net/wireless/b43/main.c b/drivers/net/wireless/b43/main.c index 4c41cfe..490fb45 100644 --- a/drivers/net/wireless/b43/main.c +++ b/drivers/net/wireless/b43/main.c @@ -628,10 +628,17 @@ static void b43_upload_card_macaddress(struct b43_wldev *dev) static void b43_set_slot_time(struct b43_wldev *dev, u16 slot_time) { /* slot_time is in usec. */ - if (dev->phy.type != B43_PHYTYPE_G) + /* This test used to exit for all but a G PHY. */ + if (b43_current_band(dev->wl) == IEEE80211_BAND_5GHZ) return; - b43_write16(dev, 0x684, 510 + slot_time); - b43_shm_write16(dev, B43_SHM_SHARED, 0x0010, slot_time); + b43_write16(dev, B43_MMIO_IFSSLOT, 510 + slot_time); + /* Shared memory location 0x0010 is the slot time and should be + * set to slot_time; however, this register is initially 0 and changing + * the value adversely affects the transmit rate for BCM4311 + * devices. 
Until this behavior is unterstood, delete this step + * + * b43_shm_write16(dev, B43_SHM_SHARED, 0x0010, slot_time); + */ } static void b43_short_slot_timing_enable(struct b43_wldev *dev) diff --git a/drivers/net/wireless/iwlwifi/iwl-4965.c b/drivers/net/wireless/iwlwifi/iwl-4965.c index 9b4b8b5..3146281 100644 --- a/drivers/net/wireless/iwlwifi/iwl-4965.c +++ b/drivers/net/wireless/iwlwifi/iwl-4965.c @@ -2008,7 +2008,7 @@ static void iwl4965_rx_reply_tx(struct iwl_priv *priv, IWL_DEBUG_TX_REPLY(priv, "Retry scheduler reclaim scd_ssn " "%d index %d\n", scd_ssn , index); freed = iwl_tx_queue_reclaim(priv, txq_id, index); - priv->stations[sta_id].tid[tid].tfds_in_queue -= freed; + iwl_free_tfds_in_queue(priv, sta_id, tid, freed); if (priv->mac80211_registered && (iwl_queue_space(&txq->q) > txq->q.low_mark) && diff --git a/drivers/net/wireless/iwlwifi/iwl-5000.c b/drivers/net/wireless/iwlwifi/iwl-5000.c index de45f30..cffaae7 100644 --- a/drivers/net/wireless/iwlwifi/iwl-5000.c +++ b/drivers/net/wireless/iwlwifi/iwl-5000.c @@ -1125,7 +1125,7 @@ static void iwl5000_rx_reply_tx(struct iwl_priv *priv, scd_ssn , index, txq_id, txq->swq_id); freed = iwl_tx_queue_reclaim(priv, txq_id, index); - priv->stations[sta_id].tid[tid].tfds_in_queue -= freed; + iwl_free_tfds_in_queue(priv, sta_id, tid, freed); if (priv->mac80211_registered && (iwl_queue_space(&txq->q) > txq->q.low_mark) && @@ -1153,16 +1153,14 @@ static void iwl5000_rx_reply_tx(struct iwl_priv *priv, tx_resp->failure_frame); freed = iwl_tx_queue_reclaim(priv, txq_id, index); - if (ieee80211_is_data_qos(tx_resp->frame_ctrl)) - priv->stations[sta_id].tid[tid].tfds_in_queue -= freed; + iwl_free_tfds_in_queue(priv, sta_id, tid, freed); if (priv->mac80211_registered && (iwl_queue_space(&txq->q) > txq->q.low_mark)) iwl_wake_queue(priv, txq_id); } - if (ieee80211_is_data_qos(tx_resp->frame_ctrl)) - iwl_txq_check_empty(priv, sta_id, tid, txq_id); + iwl_txq_check_empty(priv, sta_id, tid, txq_id); if (iwl_check_bits(status, 
TX_ABORT_REQUIRED_MSK)) IWL_ERR(priv, "TODO: Implement Tx ABORT REQUIRED!!!\n"); diff --git a/drivers/net/wireless/iwlwifi/iwl-core.c b/drivers/net/wireless/iwlwifi/iwl-core.c index 5461f10..f36f804 100644 --- a/drivers/net/wireless/iwlwifi/iwl-core.c +++ b/drivers/net/wireless/iwlwifi/iwl-core.c @@ -2745,6 +2745,7 @@ int iwl_mac_config(struct ieee80211_hw *hw, u32 changed) priv->staging_rxon.flags = 0; iwl_set_rxon_channel(priv, conf->channel); + iwl_set_rxon_ht(priv, ht_conf); iwl_set_flags_for_band(priv, conf->channel->band); spin_unlock_irqrestore(&priv->lock, flags); diff --git a/drivers/net/wireless/iwlwifi/iwl-core.h b/drivers/net/wireless/iwlwifi/iwl-core.h index 27ca859..b69e972 100644 --- a/drivers/net/wireless/iwlwifi/iwl-core.h +++ b/drivers/net/wireless/iwlwifi/iwl-core.h @@ -446,6 +446,8 @@ void iwl_hw_txq_ctx_free(struct iwl_priv *priv); int iwl_hw_tx_queue_init(struct iwl_priv *priv, struct iwl_tx_queue *txq); int iwl_txq_update_write_ptr(struct iwl_priv *priv, struct iwl_tx_queue *txq); +void iwl_free_tfds_in_queue(struct iwl_priv *priv, + int sta_id, int tid, int freed); int iwl_tx_queue_init(struct iwl_priv *priv, struct iwl_tx_queue *txq, int slots_num, u32 txq_id); void iwl_tx_queue_free(struct iwl_priv *priv, int txq_id); diff --git a/drivers/net/wireless/iwlwifi/iwl-rx.c b/drivers/net/wireless/iwlwifi/iwl-rx.c index 6f36b6e..2dbce85 100644 --- a/drivers/net/wireless/iwlwifi/iwl-rx.c +++ b/drivers/net/wireless/iwlwifi/iwl-rx.c @@ -928,7 +928,10 @@ static void iwl_pass_packet_to_mac80211(struct iwl_priv *priv, if (ieee80211_is_mgmt(fc) || ieee80211_has_protected(fc) || ieee80211_has_morefrags(fc) || - le16_to_cpu(hdr->seq_ctrl) & IEEE80211_SCTL_FRAG) + le16_to_cpu(hdr->seq_ctrl) & IEEE80211_SCTL_FRAG || + (ieee80211_is_data_qos(fc) && + *ieee80211_get_qos_ctl(hdr) & + IEEE80211_QOS_CONTROL_A_MSDU_PRESENT)) ret = skb_linearize(skb); else ret = __pskb_pull_tail(skb, min_t(u16, IWL_LINK_HDR_MAX, len)) ? 
diff --git a/drivers/net/wireless/iwlwifi/iwl-tx.c b/drivers/net/wireless/iwlwifi/iwl-tx.c index 87ce2bd..8f40715 100644 --- a/drivers/net/wireless/iwlwifi/iwl-tx.c +++ b/drivers/net/wireless/iwlwifi/iwl-tx.c @@ -120,6 +120,20 @@ int iwl_txq_update_write_ptr(struct iwl_priv *priv, struct iwl_tx_queue *txq) EXPORT_SYMBOL(iwl_txq_update_write_ptr); +void iwl_free_tfds_in_queue(struct iwl_priv *priv, + int sta_id, int tid, int freed) +{ + if (priv->stations[sta_id].tid[tid].tfds_in_queue >= freed) + priv->stations[sta_id].tid[tid].tfds_in_queue -= freed; + else { + IWL_ERR(priv, "free more than tfds_in_queue (%u:%d)\n", + priv->stations[sta_id].tid[tid].tfds_in_queue, + freed); + priv->stations[sta_id].tid[tid].tfds_in_queue = 0; + } +} +EXPORT_SYMBOL(iwl_free_tfds_in_queue); + /** * iwl_tx_queue_free - Deallocate DMA queue. * @txq: Transmit queue to deallocate. @@ -1131,6 +1145,7 @@ int iwl_tx_queue_reclaim(struct iwl_priv *priv, int txq_id, int index) struct iwl_queue *q = &txq->q; struct iwl_tx_info *tx_info; int nfreed = 0; + struct ieee80211_hdr *hdr; if ((index >= q->n_bd) || (iwl_queue_used(q, index) == 0)) { IWL_ERR(priv, "Read index for DMA queue txq id (%d), index %d, " @@ -1145,13 +1160,16 @@ int iwl_tx_queue_reclaim(struct iwl_priv *priv, int txq_id, int index) tx_info = &txq->txb[txq->q.read_ptr]; iwl_tx_status(priv, tx_info->skb[0]); + + hdr = (struct ieee80211_hdr *)tx_info->skb[0]->data; + if (hdr && ieee80211_is_data_qos(hdr->frame_control)) + nfreed++; tx_info->skb[0] = NULL; if (priv->cfg->ops->lib->txq_inval_byte_cnt_tbl) priv->cfg->ops->lib->txq_inval_byte_cnt_tbl(priv, txq); priv->cfg->ops->lib->txq_free_tfd(priv, txq); - nfreed++; } return nfreed; } @@ -1559,7 +1577,7 @@ void iwl_rx_reply_compressed_ba(struct iwl_priv *priv, if (txq->q.read_ptr != (ba_resp_scd_ssn & 0xff)) { /* calculate mac80211 ampdu sw queue to wake */ int freed = iwl_tx_queue_reclaim(priv, scd_flow, index); - priv->stations[sta_id].tid[tid].tfds_in_queue -= freed; + 
iwl_free_tfds_in_queue(priv, sta_id, tid, freed); if ((iwl_queue_space(&txq->q) > txq->q.low_mark) && priv->mac80211_registered && diff --git a/drivers/net/wireless/iwmc3200wifi/rx.c b/drivers/net/wireless/iwmc3200wifi/rx.c index 6d6ed74..f727b4a 100644 --- a/drivers/net/wireless/iwmc3200wifi/rx.c +++ b/drivers/net/wireless/iwmc3200wifi/rx.c @@ -794,7 +794,7 @@ static int iwm_mlme_update_bss_table(struct iwm_priv *iwm, u8 *buf, } bss->bss = kzalloc(bss_len, GFP_KERNEL); - if (!bss) { + if (!bss->bss) { kfree(bss); IWM_ERR(iwm, "Couldn't allocate bss\n"); return -ENOMEM; diff --git a/drivers/net/wireless/rtl818x/rtl8187_dev.c b/drivers/net/wireless/rtl818x/rtl8187_dev.c index bc5726d..7ba3052 100644 --- a/drivers/net/wireless/rtl818x/rtl8187_dev.c +++ b/drivers/net/wireless/rtl818x/rtl8187_dev.c @@ -65,6 +65,7 @@ static struct usb_device_id rtl8187_table[] __devinitdata = { /* Sitecom */ {USB_DEVICE(0x0df6, 0x000d), .driver_info = DEVICE_RTL8187}, {USB_DEVICE(0x0df6, 0x0028), .driver_info = DEVICE_RTL8187B}, + {USB_DEVICE(0x0df6, 0x0029), .driver_info = DEVICE_RTL8187B}, /* Sphairon Access Systems GmbH */ {USB_DEVICE(0x114B, 0x0150), .driver_info = DEVICE_RTL8187}, /* Dick Smith Electronics */ diff --git a/drivers/of/base.c b/drivers/of/base.c index e6627b2..840a1e4 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -26,7 +26,7 @@ struct device_node *allnodes; /* use when traversing tree through the allnext, child, sibling, * or parent members of struct device_node. 
*/ -DEFINE_RWLOCK(devtree_lock); +DEFINE_RAW_SPINLOCK(devtree_lock); int of_n_addr_cells(struct device_node *np) { @@ -60,7 +60,7 @@ int of_n_size_cells(struct device_node *np) } EXPORT_SYMBOL(of_n_size_cells); -struct property *of_find_property(const struct device_node *np, +static struct property *__of_find_property(const struct device_node *np, const char *name, int *lenp) { @@ -69,7 +69,6 @@ struct property *of_find_property(const struct device_node *np, if (!np) return NULL; - read_lock(&devtree_lock); for (pp = np->properties; pp != 0; pp = pp->next) { if (of_prop_cmp(pp->name, name) == 0) { if (lenp != 0) @@ -77,7 +76,20 @@ struct property *of_find_property(const struct device_node *np, break; } } - read_unlock(&devtree_lock); + + return pp; +} + +struct property *of_find_property(const struct device_node *np, + const char *name, + int *lenp) +{ + struct property *pp; + unsigned long flags; + + raw_spin_lock_irqsave(&devtree_lock, flags); + pp = __of_find_property(np, name, lenp); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return pp; } @@ -95,13 +107,13 @@ struct device_node *of_find_all_nodes(struct device_node *prev) { struct device_node *np; - read_lock(&devtree_lock); + raw_spin_lock(&devtree_lock); np = prev ? prev->allnext : allnodes; for (; np != NULL; np = np->allnext) if (of_node_get(np)) break; of_node_put(prev); - read_unlock(&devtree_lock); + raw_spin_unlock(&devtree_lock); return np; } EXPORT_SYMBOL(of_find_all_nodes); @@ -110,8 +122,20 @@ EXPORT_SYMBOL(of_find_all_nodes); * Find a property with a given name for a given node * and return the value. */ +static const void *__of_get_property(const struct device_node *np, + const char *name, int *lenp) +{ + struct property *pp = __of_find_property(np, name, lenp); + + return pp ? pp->value : NULL; +} + +/* + * Find a property with a given name for a given node + * and return the value. 
+ */ const void *of_get_property(const struct device_node *np, const char *name, - int *lenp) + int *lenp) { struct property *pp = of_find_property(np, name, lenp); @@ -122,13 +146,13 @@ EXPORT_SYMBOL(of_get_property); /** Checks if the given "compat" string matches one of the strings in * the device's "compatible" property */ -int of_device_is_compatible(const struct device_node *device, - const char *compat) +static int __of_device_is_compatible(const struct device_node *device, + const char *compat) { const char* cp; - int cplen, l; + int uninitialized_var(cplen), l; - cp = of_get_property(device, "compatible", &cplen); + cp = __of_get_property(device, "compatible", &cplen); if (cp == NULL) return 0; while (cplen > 0) { @@ -141,6 +165,21 @@ int of_device_is_compatible(const struct device_node *device, return 0; } + +/** Checks if the given "compat" string matches one of the strings in + * the device's "compatible" property + */ +int of_device_is_compatible(const struct device_node *device, + const char *compat) +{ + unsigned long flags; + int res; + + raw_spin_lock_irqsave(&devtree_lock, flags); + res = __of_device_is_compatible(device, compat); + raw_spin_unlock_irqrestore(&devtree_lock, flags); + return res; +} EXPORT_SYMBOL(of_device_is_compatible); /** @@ -179,13 +218,14 @@ EXPORT_SYMBOL(of_device_is_available); struct device_node *of_get_parent(const struct device_node *node) { struct device_node *np; + unsigned long flags; if (!node) return NULL; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = of_node_get(node->parent); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_get_parent); @@ -204,14 +244,15 @@ EXPORT_SYMBOL(of_get_parent); struct device_node *of_get_next_parent(struct device_node *node) { struct device_node *parent; + unsigned long flags; if (!node) return NULL; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); parent = 
of_node_get(node->parent); of_node_put(node); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return parent; } @@ -227,14 +268,15 @@ struct device_node *of_get_next_child(const struct device_node *node, struct device_node *prev) { struct device_node *next; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); next = prev ? prev->sibling : node->child; for (; next; next = next->sibling) if (of_node_get(next)) break; of_node_put(prev); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return next; } EXPORT_SYMBOL(of_get_next_child); @@ -249,14 +291,15 @@ EXPORT_SYMBOL(of_get_next_child); struct device_node *of_find_node_by_path(const char *path) { struct device_node *np = allnodes; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); for (; np; np = np->allnext) { if (np->full_name && (of_node_cmp(np->full_name, path) == 0) && of_node_get(np)) break; } - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_path); @@ -276,15 +319,16 @@ struct device_node *of_find_node_by_name(struct device_node *from, const char *name) { struct device_node *np; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = from ? from->allnext : allnodes; for (; np; np = np->allnext) if (np->name && (of_node_cmp(np->name, name) == 0) && of_node_get(np)) break; of_node_put(from); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_name); @@ -305,15 +349,16 @@ struct device_node *of_find_node_by_type(struct device_node *from, const char *type) { struct device_node *np; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = from ? 
from->allnext : allnodes; for (; np; np = np->allnext) if (np->type && (of_node_cmp(np->type, type) == 0) && of_node_get(np)) break; of_node_put(from); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_type); @@ -336,18 +381,20 @@ struct device_node *of_find_compatible_node(struct device_node *from, const char *type, const char *compatible) { struct device_node *np; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = from ? from->allnext : allnodes; for (; np; np = np->allnext) { if (type && !(np->type && (of_node_cmp(np->type, type) == 0))) continue; - if (of_device_is_compatible(np, compatible) && of_node_get(np)) + if (__of_device_is_compatible(np, compatible) && + of_node_get(np)) break; } of_node_put(from); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_compatible_node); @@ -369,8 +416,9 @@ struct device_node *of_find_node_with_property(struct device_node *from, { struct device_node *np; struct property *pp; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = from ? from->allnext : allnodes; for (; np; np = np->allnext) { for (pp = np->properties; pp != 0; pp = pp->next) { @@ -382,20 +430,14 @@ struct device_node *of_find_node_with_property(struct device_node *from, } out: of_node_put(from); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_with_property); -/** - * of_match_node - Tell if an device_node has a matching of_match structure - * @matches: array of of device match structures to search in - * @node: the of device structure to match against - * - * Low level utility function used by device matching. 
- */ -const struct of_device_id *of_match_node(const struct of_device_id *matches, - const struct device_node *node) +static const struct of_device_id * +__of_match_node(const struct of_device_id *matches, + const struct device_node *node) { while (matches->name[0] || matches->type[0] || matches->compatible[0]) { int match = 1; @@ -406,14 +448,33 @@ const struct of_device_id *of_match_node(const struct of_device_id *matches, match &= node->type && !strcmp(matches->type, node->type); if (matches->compatible[0]) - match &= of_device_is_compatible(node, - matches->compatible); + match &= __of_device_is_compatible(node, + matches->compatible); if (match) return matches; matches++; } return NULL; } + +/** + * of_match_node - Tell if an device_node has a matching of_match structure + * @matches: array of of device match structures to search in + * @node: the of device structure to match against + * + * Low level utility function used by device matching. + */ +const struct of_device_id *of_match_node(const struct of_device_id *matches, + const struct device_node *node) +{ + const struct of_device_id *match; + unsigned long flags; + + raw_spin_lock_irqsave(&devtree_lock, flags); + match = __of_match_node(matches, node); + raw_spin_unlock_irqrestore(&devtree_lock, flags); + return match; +} EXPORT_SYMBOL(of_match_node); /** @@ -432,15 +493,16 @@ struct device_node *of_find_matching_node(struct device_node *from, const struct of_device_id *matches) { struct device_node *np; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = from ? 
from->allnext : allnodes; for (; np; np = np->allnext) { - if (of_match_node(matches, np) && of_node_get(np)) + if (__of_match_node(matches, np) && of_node_get(np)) break; } of_node_put(from); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_matching_node); diff --git a/drivers/oprofile/event_buffer.c b/drivers/oprofile/event_buffer.c index 5df60a6..9a60ccc 100644 --- a/drivers/oprofile/event_buffer.c +++ b/drivers/oprofile/event_buffer.c @@ -82,10 +82,10 @@ int alloc_event_buffer(void) { unsigned long flags; - spin_lock_irqsave(&oprofilefs_lock, flags); + raw_spin_lock_irqsave(&oprofilefs_lock, flags); buffer_size = oprofile_buffer_size; buffer_watershed = oprofile_buffer_watershed; - spin_unlock_irqrestore(&oprofilefs_lock, flags); + raw_spin_unlock_irqrestore(&oprofilefs_lock, flags); if (buffer_watershed >= buffer_size) return -EINVAL; diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index 2766a6d..049ab37 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -21,7 +21,7 @@ #define OPROFILEFS_MAGIC 0x6f70726f -DEFINE_SPINLOCK(oprofilefs_lock); +DEFINE_RAW_SPINLOCK(oprofilefs_lock); static struct inode *oprofilefs_get_inode(struct super_block *sb, int mode) { @@ -75,9 +75,9 @@ int oprofilefs_ulong_from_user(unsigned long *val, char const __user *buf, size_ if (copy_from_user(tmpbuf, buf, count)) return -EFAULT; - spin_lock_irqsave(&oprofilefs_lock, flags); + raw_spin_lock_irqsave(&oprofilefs_lock, flags); *val = simple_strtoul(tmpbuf, NULL, 0); - spin_unlock_irqrestore(&oprofilefs_lock, flags); + raw_spin_unlock_irqrestore(&oprofilefs_lock, flags); return 0; } diff --git a/drivers/parport/share.c b/drivers/parport/share.c index dffa5d4..a2d9d1e 100644 --- a/drivers/parport/share.c +++ b/drivers/parport/share.c @@ -306,7 +306,7 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma, spin_lock_init(&tmp->pardevice_lock); 
tmp->ieee1284.mode = IEEE1284_MODE_COMPAT; tmp->ieee1284.phase = IEEE1284_PH_FWD_IDLE; - init_MUTEX_LOCKED (&tmp->ieee1284.irq); /* actually a semaphore at 0 */ + sema_init(&tmp->ieee1284.irq, 0); tmp->spintime = parport_default_spintime; atomic_set (&tmp->ref_count, 1); INIT_LIST_HEAD(&tmp->full_list); diff --git a/drivers/pci/access.c b/drivers/pci/access.c index db23200..0691dbb 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -12,7 +12,7 @@ * configuration space. */ -static DEFINE_SPINLOCK(pci_lock); +static DEFINE_RAW_SPINLOCK(pci_lock); /* * Wrappers for all PCI configuration access functions. They just check @@ -32,10 +32,10 @@ int pci_bus_read_config_##size \ unsigned long flags; \ u32 data = 0; \ if (PCI_##size##_BAD) return PCIBIOS_BAD_REGISTER_NUMBER; \ - spin_lock_irqsave(&pci_lock, flags); \ + raw_spin_lock_irqsave(&pci_lock, flags); \ res = bus->ops->read(bus, devfn, pos, len, &data); \ *value = (type)data; \ - spin_unlock_irqrestore(&pci_lock, flags); \ + raw_spin_unlock_irqrestore(&pci_lock, flags); \ return res; \ } @@ -46,9 +46,9 @@ int pci_bus_write_config_##size \ int res; \ unsigned long flags; \ if (PCI_##size##_BAD) return PCIBIOS_BAD_REGISTER_NUMBER; \ - spin_lock_irqsave(&pci_lock, flags); \ + raw_spin_lock_irqsave(&pci_lock, flags); \ res = bus->ops->write(bus, devfn, pos, len, value); \ - spin_unlock_irqrestore(&pci_lock, flags); \ + raw_spin_unlock_irqrestore(&pci_lock, flags); \ return res; \ } @@ -78,10 +78,10 @@ struct pci_ops *pci_bus_set_ops(struct pci_bus *bus, struct pci_ops *ops) struct pci_ops *old_ops; unsigned long flags; - spin_lock_irqsave(&pci_lock, flags); + raw_spin_lock_irqsave(&pci_lock, flags); old_ops = bus->ops; bus->ops = ops; - spin_unlock_irqrestore(&pci_lock, flags); + raw_spin_unlock_irqrestore(&pci_lock, flags); return old_ops; } EXPORT_SYMBOL(pci_bus_set_ops); @@ -135,9 +135,9 @@ static noinline void pci_wait_ucfg(struct pci_dev *dev) __add_wait_queue(&pci_ucfg_wait, &wait); do { 
set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&pci_lock); + raw_spin_unlock_irq(&pci_lock); schedule(); - spin_lock_irq(&pci_lock); + raw_spin_lock_irq(&pci_lock); } while (dev->block_ucfg_access); __remove_wait_queue(&pci_ucfg_wait, &wait); } @@ -149,11 +149,11 @@ int pci_user_read_config_##size \ int ret = 0; \ u32 data = -1; \ if (PCI_##size##_BAD) return PCIBIOS_BAD_REGISTER_NUMBER; \ - spin_lock_irq(&pci_lock); \ + raw_spin_lock_irq(&pci_lock); \ if (unlikely(dev->block_ucfg_access)) pci_wait_ucfg(dev); \ ret = dev->bus->ops->read(dev->bus, dev->devfn, \ pos, sizeof(type), &data); \ - spin_unlock_irq(&pci_lock); \ + raw_spin_unlock_irq(&pci_lock); \ *val = (type)data; \ return ret; \ } @@ -164,11 +164,11 @@ int pci_user_write_config_##size \ { \ int ret = -EIO; \ if (PCI_##size##_BAD) return PCIBIOS_BAD_REGISTER_NUMBER; \ - spin_lock_irq(&pci_lock); \ + raw_spin_lock_irq(&pci_lock); \ if (unlikely(dev->block_ucfg_access)) pci_wait_ucfg(dev); \ ret = dev->bus->ops->write(dev->bus, dev->devfn, \ pos, sizeof(type), val); \ - spin_unlock_irq(&pci_lock); \ + raw_spin_unlock_irq(&pci_lock); \ return ret; \ } @@ -395,10 +395,10 @@ void pci_block_user_cfg_access(struct pci_dev *dev) unsigned long flags; int was_blocked; - spin_lock_irqsave(&pci_lock, flags); + raw_spin_lock_irqsave(&pci_lock, flags); was_blocked = dev->block_ucfg_access; dev->block_ucfg_access = 1; - spin_unlock_irqrestore(&pci_lock, flags); + raw_spin_unlock_irqrestore(&pci_lock, flags); /* If we BUG() inside the pci_lock, we're guaranteed to hose * the machine */ @@ -416,7 +416,7 @@ void pci_unblock_user_cfg_access(struct pci_dev *dev) { unsigned long flags; - spin_lock_irqsave(&pci_lock, flags); + raw_spin_lock_irqsave(&pci_lock, flags); /* This indicates a problem in the caller, but we don't need * to kill them, unlike a double-block above. 
*/ @@ -424,6 +424,6 @@ void pci_unblock_user_cfg_access(struct pci_dev *dev) dev->block_ucfg_access = 0; wake_up_all(&pci_ucfg_wait); - spin_unlock_irqrestore(&pci_lock, flags); + raw_spin_unlock_irqrestore(&pci_lock, flags); } EXPORT_SYMBOL_GPL(pci_unblock_user_cfg_access); diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index cef28a7..caa6295 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -240,9 +240,9 @@ void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), next = dev->bus_list.next; /* Run device routines with the device locked */ - down(&dev->dev.sem); + mutex_lock(&dev->dev.mutex); retval = cb(dev, userdata); - up(&dev->dev.sem); + mutex_unlock(&dev->dev.mutex); if (retval) break; } diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index 8e952fd..cb2fd01 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -720,12 +720,6 @@ static int acpiphp_bus_add(struct acpiphp_func *func) -ret_val); goto acpiphp_bus_add_out; } - /* - * try to start anyway. We could have failed to add - * simply because this bus had previously been added - * on another add. Don't bother with the return value - * we just keep going. 
- */ ret_val = acpi_bus_start(device); acpiphp_bus_add_out: diff --git a/drivers/pci/hotplug/ibmphp_hpc.c b/drivers/pci/hotplug/ibmphp_hpc.c index c7084f0..d811661 100644 --- a/drivers/pci/hotplug/ibmphp_hpc.c +++ b/drivers/pci/hotplug/ibmphp_hpc.c @@ -132,8 +132,8 @@ void __init ibmphp_hpc_initvars (void) debug ("%s - Entry\n", __func__); mutex_init(&sem_hpcaccess); - init_MUTEX (&semOperations); - init_MUTEX_LOCKED (&sem_exit); + sema_init(&semOperations, 1); + sema_init(&sem_exit, 0); to_debug = 0; debug ("%s - Exit\n", __func__); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 315fea4..36339e0 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2304,7 +2304,7 @@ static int pci_dev_reset(struct pci_dev *dev, int probe) if (!probe) { pci_block_user_cfg_access(dev); /* block PM suspend, driver probe, etc. */ - down(&dev->dev.sem); + mutex_lock(&dev->dev.mutex); } rc = pci_dev_specific_reset(dev, probe); @@ -2326,7 +2326,7 @@ static int pci_dev_reset(struct pci_dev *dev, int probe) rc = pci_parent_bus_reset(dev, probe); done: if (!probe) { - up(&dev->dev.sem); + mutex_unlock(&dev->dev.mutex); pci_unblock_user_cfg_access(dev); } diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c index 1a4a3c4..263bdc8 100644 --- a/drivers/pcmcia/ds.c +++ b/drivers/pcmcia/ds.c @@ -964,9 +964,9 @@ static int runtime_suspend(struct device *dev) { int rc; - down(&dev->sem); + mutex_lock(&dev->mutex); rc = pcmcia_dev_suspend(dev, PMSG_SUSPEND); - up(&dev->sem); + mutex_unlock(&dev->mutex); return rc; } @@ -974,9 +974,9 @@ static void runtime_resume(struct device *dev) { int rc; - down(&dev->sem); + mutex_lock(&dev->mutex); rc = pcmcia_dev_resume(dev); - up(&dev->sem); + mutex_unlock(&dev->mutex); } /************************ per-device sysfs output ***************************/ diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c index 07d14df..226b3e9 100644 --- a/drivers/platform/x86/acer-wmi.c +++ b/drivers/platform/x86/acer-wmi.c @@ -934,7 
+934,7 @@ static int __devinit acer_backlight_init(struct device *dev) acer_backlight_device = bd; bd->props.power = FB_BLANK_UNBLANK; - bd->props.brightness = max_brightness; + bd->props.brightness = read_brightness(bd); bd->props.max_brightness = max_brightness; backlight_update_status(bd); return 0; diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index e67e4fe..eb603f1 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -5771,7 +5771,7 @@ static void thermal_exit(void) case TPACPI_THERMAL_ACPI_TMP07: case TPACPI_THERMAL_ACPI_UPDT: sysfs_remove_group(&tpacpi_sensors_pdev->dev.kobj, - &thermal_temp_input16_group); + &thermal_temp_input8_group); break; case TPACPI_THERMAL_NONE: default: diff --git a/drivers/s390/cio/crw.c b/drivers/s390/cio/crw.c index d157665..2a932c5 100644 --- a/drivers/s390/cio/crw.c +++ b/drivers/s390/cio/crw.c @@ -14,7 +14,7 @@ #include <linux/init.h> #include <asm/crw.h> -static struct semaphore crw_semaphore; +static DEFINE_SEMAPHORE(crw_semaphore, 0); static DEFINE_MUTEX(crw_handler_mutex); static crw_handler_t crw_handlers[NR_RSCS]; @@ -132,17 +132,6 @@ void crw_handle_channel_report(void) } /* - * Separate initcall needed for semaphore initialization since - * crw_handle_channel_report might be called before crw_machine_check_init. - */ -static int __init crw_init_semaphore(void) -{ - init_MUTEX_LOCKED(&crw_semaphore); - return 0; -} -pure_initcall(crw_init_semaphore); - -/* * Machine checks for the channel subsystem must be enabled * after the channel subsystem is initialized */ diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c index 9c0c911..7be331d 100644 --- a/drivers/scsi/aacraid/commctrl.c +++ b/drivers/scsi/aacraid/commctrl.c @@ -190,7 +190,7 @@ static int open_getadapter_fib(struct aac_dev * dev, void __user *arg) /* * Initialize the mutex used to wait for the next AIF. 
*/ - init_MUTEX_LOCKED(&fibctx->wait_sem); + sema_init(&fibctx->wait_sem, 0); fibctx->wait = 0; /* * Initialize the fibs and set the count of fibs on diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c index 94d2954..a1de995 100644 --- a/drivers/scsi/aacraid/commsup.c +++ b/drivers/scsi/aacraid/commsup.c @@ -124,7 +124,7 @@ int aac_fib_setup(struct aac_dev * dev) fibptr->hw_fib_va = hw_fib; fibptr->data = (void *) fibptr->hw_fib_va->data; fibptr->next = fibptr+1; /* Forward chain the fibs */ - init_MUTEX_LOCKED(&fibptr->event_wait); + sema_init(&fibptr->event_wait, 0); spin_lock_init(&fibptr->event_lock); hw_fib->header.XferState = cpu_to_le32(0xffffffff); hw_fib->header.SenderSize = cpu_to_le16(dev->max_fib_size); diff --git a/drivers/scsi/arm/fas216.c b/drivers/scsi/arm/fas216.c index 4775426..9e71ac6 100644 --- a/drivers/scsi/arm/fas216.c +++ b/drivers/scsi/arm/fas216.c @@ -2516,7 +2516,7 @@ int fas216_eh_device_reset(struct scsi_cmnd *SCpnt) if (info->scsi.phase == PHASE_IDLE) fas216_kick(info); - mod_timer(&info->eh_timer, 30 * HZ); + mod_timer(&info->eh_timer, jiffies + 30 * HZ); spin_unlock_irqrestore(&info->host_lock, flags); /* diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index 10be9f3..2f47ae7 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -2009,6 +2009,8 @@ static int fcoe_destroy(const char *buffer, struct kernel_param *kp) fcoe_interface_cleanup(fcoe); rtnl_unlock(); fcoe_if_destroy(fcoe->ctlr.lp); + module_put(THIS_MODULE); + out_putdev: dev_put(netdev); out_nodev: @@ -2059,6 +2061,11 @@ static int fcoe_create(const char *buffer, struct kernel_param *kp) } #endif + if (!try_module_get(THIS_MODULE)) { + rc = -EINVAL; + goto out_nomod; + } + rtnl_lock(); netdev = fcoe_if_to_netdev(buffer); if (!netdev) { @@ -2099,17 +2106,24 @@ static int fcoe_create(const char *buffer, struct kernel_param *kp) if (!fcoe_link_ok(lport)) fcoe_ctlr_link_up(&fcoe->ctlr); - rc = 0; -out_free: /* * Release 
from init in fcoe_interface_create(), on success lport * should be holding a reference taken in fcoe_if_create(). */ fcoe_interface_put(fcoe); + dev_put(netdev); + rtnl_unlock(); + mutex_unlock(&fcoe_config_mutex); + + return 0; +out_free: + fcoe_interface_put(fcoe); out_putdev: dev_put(netdev); out_nodev: rtnl_unlock(); + module_put(THIS_MODULE); +out_nomod: mutex_unlock(&fcoe_config_mutex); return rc; } diff --git a/drivers/scsi/fcoe/libfcoe.c b/drivers/scsi/fcoe/libfcoe.c index 9823291..511cb6b 100644 --- a/drivers/scsi/fcoe/libfcoe.c +++ b/drivers/scsi/fcoe/libfcoe.c @@ -1187,7 +1187,7 @@ static void fcoe_ctlr_timeout(unsigned long arg) next_timer = fip->ctlr_ka_time; if (time_after_eq(jiffies, fip->port_ka_time)) { - fip->port_ka_time += jiffies + + fip->port_ka_time = jiffies + msecs_to_jiffies(FIP_VN_KA_PERIOD); fip->send_port_ka = 1; } diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c index 19d711c..7f43647 100644 --- a/drivers/scsi/libfc/fc_exch.c +++ b/drivers/scsi/libfc/fc_exch.c @@ -1890,7 +1890,7 @@ static struct fc_seq *fc_exch_seq_send(struct fc_lport *lport, fc_exch_setup_hdr(ep, fp, ep->f_ctl); sp->cnt++; - if (ep->xid <= lport->lro_xid) + if (ep->xid <= lport->lro_xid && fh->fh_r_ctl == FC_RCTL_DD_UNSOL_CMD) fc_fcp_ddp_setup(fr_fsp(fp), ep->xid); if (unlikely(lport->tt.frame_send(lport, fp))) diff --git a/drivers/scsi/libfc/fc_fcp.c b/drivers/scsi/libfc/fc_fcp.c index 881d5df..6fde2fa 100644 --- a/drivers/scsi/libfc/fc_fcp.c +++ b/drivers/scsi/libfc/fc_fcp.c @@ -298,9 +298,6 @@ void fc_fcp_ddp_setup(struct fc_fcp_pkt *fsp, u16 xid) { struct fc_lport *lport; - if (!fsp) - return; - lport = fsp->lp; if ((fsp->req_flags & FC_SRB_READ) && (lport->lro_enabled) && (lport->tt.ddp_setup)) { diff --git a/drivers/scsi/libfc/fc_lport.c b/drivers/scsi/libfc/fc_lport.c index 0b16502..7ec8ce7 100644 --- a/drivers/scsi/libfc/fc_lport.c +++ b/drivers/scsi/libfc/fc_lport.c @@ -1800,7 +1800,8 @@ int fc_lport_bsg_request(struct fc_bsg_job 
*job) u32 did; job->reply->reply_payload_rcv_len = 0; - rsp->resid_len = job->reply_payload.payload_len; + if (rsp) + rsp->resid_len = job->reply_payload.payload_len; mutex_lock(&lport->lp_mutex); diff --git a/drivers/scsi/libfc/fc_rport.c b/drivers/scsi/libfc/fc_rport.c index 0230052..97923bb 100644 --- a/drivers/scsi/libfc/fc_rport.c +++ b/drivers/scsi/libfc/fc_rport.c @@ -623,7 +623,7 @@ static void fc_rport_plogi_resp(struct fc_seq *sp, struct fc_frame *fp, tov = ntohl(plp->fl_csp.sp_e_d_tov); if (ntohs(plp->fl_csp.sp_features) & FC_SP_FT_EDTR) - tov /= 1000; + tov /= 1000000; if (tov > rdata->e_d_tov) rdata->e_d_tov = tov; csp_seq = ntohs(plp->fl_csp.sp_tot_seq); diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c index db6856c..4ad87fd 100644 --- a/drivers/scsi/libiscsi_tcp.c +++ b/drivers/scsi/libiscsi_tcp.c @@ -992,12 +992,10 @@ static struct iscsi_r2t_info *iscsi_tcp_get_curr_r2t(struct iscsi_task *task) if (r2t == NULL) { if (kfifo_out(&tcp_task->r2tqueue, (void *)&tcp_task->r2t, sizeof(void *)) != - sizeof(void *)) { - WARN_ONCE(1, "unexpected fifo state"); + sizeof(void *)) r2t = NULL; - } - - r2t = tcp_task->r2t; + else + r2t = tcp_task->r2t; } spin_unlock_bh(&session->lock); } diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c index 708ea31..d9b8ca5 100644 --- a/drivers/scsi/megaraid/megaraid_sas.c +++ b/drivers/scsi/megaraid/megaraid_sas.c @@ -3781,6 +3781,7 @@ static int megasas_mgmt_compat_ioctl_fw(struct file *file, unsigned long arg) compat_alloc_user_space(sizeof(struct megasas_iocpacket)); int i; int error = 0; + compat_uptr_t ptr; if (clear_user(ioc, sizeof(*ioc))) return -EFAULT; @@ -3793,9 +3794,22 @@ static int megasas_mgmt_compat_ioctl_fw(struct file *file, unsigned long arg) copy_in_user(&ioc->sge_count, &cioc->sge_count, sizeof(u32))) return -EFAULT; - for (i = 0; i < MAX_IOCTL_SGE; i++) { - compat_uptr_t ptr; + /* + * The sense_ptr is used in megasas_mgmt_fw_ioctl only when + * 
sense_len is not null, so prepare the 64bit value under + * the same condition. + */ + if (ioc->sense_len) { + void __user **sense_ioc_ptr = + (void __user **)(ioc->frame.raw + ioc->sense_off); + compat_uptr_t *sense_cioc_ptr = + (compat_uptr_t *)(cioc->frame.raw + cioc->sense_off); + if (get_user(ptr, sense_cioc_ptr) || + put_user(compat_ptr(ptr), sense_ioc_ptr)) + return -EFAULT; + } + for (i = 0; i < MAX_IOCTL_SGE; i++) { if (get_user(ptr, &cioc->sgl[i].iov_base) || put_user(compat_ptr(ptr), &ioc->sgl[i].iov_base) || copy_in_user(&ioc->sgl[i].iov_len, diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c index c3e37c8..ea995f4 100644 --- a/drivers/serial/8250.c +++ b/drivers/serial/8250.c @@ -83,6 +83,9 @@ static unsigned int skip_txen_test; /* force skip of txen test at init time */ #define PASS_LIMIT 256 +#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) + + /* * We default to IRQ0 for the "no irq" hack. Some * machine types want others as well - they're free @@ -1595,7 +1598,12 @@ static irqreturn_t serial8250_interrupt(int irq, void *dev_id) l = l->next; - if (l == i->head && pass_counter++ > PASS_LIMIT) { + /* + * On preempt-rt we can be preempted and run in our + * own thread. + */ + if (!preempt_rt() && l == i->head && + pass_counter++ > PASS_LIMIT) { /* If we hit this, we're dead. */ printk(KERN_ERR "serial8250: too much work for " "irq%d\n", irq); @@ -1792,7 +1800,7 @@ static unsigned int serial8250_tx_empty(struct uart_port *port) up->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS; spin_unlock_irqrestore(&up->port.lock, flags); - return lsr & UART_LSR_TEMT ? TIOCSER_TEMT : 0; + return (lsr & BOTH_EMPTY) == BOTH_EMPTY ? 
TIOCSER_TEMT : 0; } static unsigned int serial8250_get_mctrl(struct uart_port *port) @@ -1850,8 +1858,6 @@ static void serial8250_break_ctl(struct uart_port *port, int break_state) spin_unlock_irqrestore(&up->port.lock, flags); } -#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) - /* * Wait for transmitter & holding register to empty */ @@ -2734,14 +2740,10 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count) touch_nmi_watchdog(); - local_irq_save(flags); - if (up->port.sysrq) { - /* serial8250_handle_port() already took the lock */ - locked = 0; - } else if (oops_in_progress) { - locked = spin_trylock(&up->port.lock); - } else - spin_lock(&up->port.lock); + if (up->port.sysrq || oops_in_progress || preempt_rt()) + locked = spin_trylock_irqsave(&up->port.lock, flags); + else + spin_lock_irqsave(&up->port.lock, flags); /* * First save the IER then disable the interrupts @@ -2773,8 +2775,7 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count) check_modem_status(up); if (locked) - spin_unlock(&up->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&up->port.lock, flags); } static int __init serial8250_console_setup(struct console *co, char *options) diff --git a/drivers/serial/sb1250-duart.c b/drivers/serial/sb1250-duart.c index a2f2b32..499962e 100644 --- a/drivers/serial/sb1250-duart.c +++ b/drivers/serial/sb1250-duart.c @@ -829,7 +829,7 @@ static void __init sbd_probe_duarts(void) #ifdef CONFIG_SERIAL_SB1250_DUART_CONSOLE /* * Serial console stuff. Very basic, polling driver for doing serial - * console output. The console_sem is held by the caller, so we + * console output. The console_mutex is held by the caller, so we * shouldn't be interrupted for more console activity. 
*/ static void sbd_console_putchar(struct uart_port *uport, int ch) diff --git a/drivers/ssb/main.c b/drivers/ssb/main.c index 5681ebe..03dfd27 100644 --- a/drivers/ssb/main.c +++ b/drivers/ssb/main.c @@ -494,8 +494,7 @@ static int ssb_devices_register(struct ssb_bus *bus) #endif break; case SSB_BUSTYPE_SDIO: -#ifdef CONFIG_SSB_SDIO - sdev->irq = bus->host_sdio->dev.irq; +#ifdef CONFIG_SSB_SDIOHOST dev->parent = &bus->host_sdio->dev; #endif break; diff --git a/drivers/staging/comedi/drivers/dt9812.c b/drivers/staging/comedi/drivers/dt9812.c index 312f4f2..3950109 100644 --- a/drivers/staging/comedi/drivers/dt9812.c +++ b/drivers/staging/comedi/drivers/dt9812.c @@ -262,7 +262,7 @@ struct dt9812_usb_cmd { #define DT9812_NUM_SLOTS 16 -static DECLARE_MUTEX(dt9812_mutex); +static DEFINE_SEMAPHORE(dt9812_mutex, 1); static struct usb_device_id dt9812_table[] = { {USB_DEVICE(0x0867, 0x9812)}, @@ -1124,7 +1124,7 @@ static int __init usb_dt9812_init(void) /* Initialize all driver slots */ for (i = 0; i < DT9812_NUM_SLOTS; i++) { - init_MUTEX(&dt9812[i].mutex); + sema_init(&dt9812[i].mutex, 1); dt9812[i].serial = 0; dt9812[i].usb = NULL; dt9812[i].comedi = NULL; diff --git a/drivers/staging/comedi/drivers/usbdux.c b/drivers/staging/comedi/drivers/usbdux.c index 9a1b559..d611bd7 100644 --- a/drivers/staging/comedi/drivers/usbdux.c +++ b/drivers/staging/comedi/drivers/usbdux.c @@ -316,7 +316,7 @@ struct usbduxsub { */ static struct usbduxsub usbduxsub[NUMUSBDUX]; -static DECLARE_MUTEX(start_stop_sem); +static DEFINE_SEMAPHORE(start_stop_sem, 1); /* * Stops the data acquision @@ -2370,7 +2370,7 @@ static int usbduxsub_probe(struct usb_interface *uinterf, dev_dbg(dev, "comedi_: usbdux: " "usbduxsub[%d] is ready to connect to comedi.\n", index); - init_MUTEX(&(usbduxsub[index].sem)); + sema_init(&(usbduxsub[index].sem), 1); /* save a pointer to the usb device */ usbduxsub[index].usbdev = udev; diff --git a/drivers/staging/comedi/drivers/usbduxfast.c
b/drivers/staging/comedi/drivers/usbduxfast.c index 2e675cc..95a94c7 100644 --- a/drivers/staging/comedi/drivers/usbduxfast.c +++ b/drivers/staging/comedi/drivers/usbduxfast.c @@ -200,7 +200,7 @@ struct usbduxfastsub_s { */ static struct usbduxfastsub_s usbduxfastsub[NUMUSBDUXFAST]; -static DECLARE_MUTEX(start_stop_sem); +static DEFINE_SEMAPHORE(start_stop_sem, 1); /* * bulk transfers to usbduxfast @@ -1500,7 +1500,7 @@ static int usbduxfastsub_probe(struct usb_interface *uinterf, "connect to comedi.\n", index); #endif - init_MUTEX(&(usbduxfastsub[index].sem)); + sema_init(&(usbduxfastsub[index].sem), 1); /* save a pointer to the usb device */ usbduxfastsub[index].usbdev = udev; diff --git a/drivers/staging/dream/pmem.c b/drivers/staging/dream/pmem.c index def6468..21f87a4 100644 --- a/drivers/staging/dream/pmem.c +++ b/drivers/staging/dream/pmem.c @@ -127,9 +127,9 @@ struct pmem_info { * this flag */ unsigned allocated; /* for debugging, creates a list of pmem file structs, the - * data_list_sem should be taken before pmem_data->sem if both are + * data_list_mutex should be taken before pmem_data->sem if both are * needed */ - struct semaphore data_list_sem; + struct mutex data_list_mutex; struct list_head data_list; /* pmem_sem protects the bitmap array * a write lock should be held when modifying entries in bitmap @@ -273,7 +273,7 @@ static int pmem_release(struct inode *inode, struct file *file) int id = get_id(file), ret = 0; - down(&pmem[id].data_list_sem); + mutex_lock(&pmem[id].data_list_mutex); /* if this file is a master, revoke all the memory in the connected * files */ if (PMEM_FLAGS_MASTERMAP & data->flags) { @@ -290,7 +290,7 @@ static int pmem_release(struct inode *inode, struct file *file) } } list_del(&data->list); - up(&pmem[id].data_list_sem); + mutex_unlock(&pmem[id].data_list_mutex); down_write(&data->sem); @@ -358,9 +358,9 @@ static int pmem_open(struct inode *inode, struct file *file) file->private_data = data; INIT_LIST_HEAD(&data->list); - 
down(&pmem[id].data_list_sem); + mutex_lock(&pmem[id].data_list_mutex); list_add(&data->list, &pmem[id].data_list); - up(&pmem[id].data_list_sem); + mutex_unlock(&pmem[id].data_list_mutex); return ret; } @@ -1178,7 +1178,7 @@ static ssize_t debug_read(struct file *file, char __user *buf, size_t count, n = scnprintf(buffer, debug_bufmax, "pid #: mapped regions (offset, len) (offset,len)...\n"); - down(&pmem[id].data_list_sem); + mutex_lock(&pmem[id].data_list_mutex); list_for_each(elt, &pmem[id].data_list) { data = list_entry(elt, struct pmem_data, list); down_read(&data->sem); @@ -1195,7 +1195,7 @@ static ssize_t debug_read(struct file *file, char __user *buf, size_t count, n += scnprintf(buffer + n, debug_bufmax - n, "\n"); up_read(&data->sem); } - up(&pmem[id].data_list_sem); + mutex_unlock(&pmem[id].data_list_mutex); n++; buffer[n] = 0; @@ -1232,7 +1232,7 @@ int pmem_setup(struct android_pmem_platform_data *pdata, pmem[id].ioctl = ioctl; pmem[id].release = release; init_rwsem(&pmem[id].bitmap_sem); - init_MUTEX(&pmem[id].data_list_sem); + mutex_init(&pmem[id].data_list_mutex); INIT_LIST_HEAD(&pmem[id].data_list); pmem[id].dev.name = pdata->name; pmem[id].dev.minor = id; diff --git a/drivers/staging/frontier/alphatrack.c b/drivers/staging/frontier/alphatrack.c index 15aed87..641975d 100644 --- a/drivers/staging/frontier/alphatrack.c +++ b/drivers/staging/frontier/alphatrack.c @@ -678,7 +678,7 @@ static int usb_alphatrack_probe(struct usb_interface *intf, dev_err(&intf->dev, "Out of memory\n"); goto exit; } - init_MUTEX(&dev->sem); + sema_init(&dev->sem, 1); dev->intf = intf; init_waitqueue_head(&dev->read_wait); init_waitqueue_head(&dev->write_wait); diff --git a/drivers/staging/frontier/tranzport.c b/drivers/staging/frontier/tranzport.c index ef8fcc8..d51af72 100644 --- a/drivers/staging/frontier/tranzport.c +++ b/drivers/staging/frontier/tranzport.c @@ -800,7 +800,7 @@ static int usb_tranzport_probe(struct usb_interface *intf, dev_err(&intf->dev, "Out of 
memory\n"); goto exit; } - init_MUTEX(&dev->sem); + sema_init(&dev->sem, 1); dev->intf = intf; init_waitqueue_head(&dev->read_wait); init_waitqueue_head(&dev->write_wait); diff --git a/drivers/staging/mimio/mimio.c b/drivers/staging/mimio/mimio.c index 1ba8103..12a6d0e 100644 --- a/drivers/staging/mimio/mimio.c +++ b/drivers/staging/mimio/mimio.c @@ -160,7 +160,7 @@ static struct usb_driver mimio_driver = { .id_table = mimio_table, }; -static DECLARE_MUTEX(disconnect_sem); +static DEFINE_SEMAPHORE(disconnect_sem, 1); static void mimio_close(struct input_dev *idev) { diff --git a/drivers/staging/otus/wwrap.c b/drivers/staging/otus/wwrap.c index 53d2a45..adff5ea 100644 --- a/drivers/staging/otus/wwrap.c +++ b/drivers/staging/otus/wwrap.c @@ -1058,7 +1058,7 @@ u8_t zfLnxCreateThread(zdev_t *dev) /* Create Mutex and keventd */ INIT_WORK(&macp->kevent, kevent); - init_MUTEX(&macp->ioctl_sem); + sema_init(&macp->ioctl_sem, 1); return 0; } diff --git a/drivers/staging/p9auth/p9auth.c b/drivers/staging/p9auth/p9auth.c index db79626..3595153 100644 --- a/drivers/staging/p9auth/p9auth.c +++ b/drivers/staging/p9auth/p9auth.c @@ -391,7 +391,7 @@ static int __init cap_init_module(void) /* Initialize each device. 
*/ for (i = 0; i < cap_nr_devs; i++) { cap_devices[i].node_size = cap_node_size; - init_MUTEX(&cap_devices[i].sem); + sema_init(&cap_devices[i].sem, 1); cap_setup_cdev(&cap_devices[i], i); } diff --git a/drivers/staging/rt2860/common/rtmp_init.c b/drivers/staging/rt2860/common/rtmp_init.c index 21a95ff..9f4ac4d 100644 --- a/drivers/staging/rt2860/common/rtmp_init.c +++ b/drivers/staging/rt2860/common/rtmp_init.c @@ -3520,7 +3520,7 @@ int RtmpRaDevCtrlInit(struct rt_rtmp_adapter *pAd, IN RTMP_INF_TYPE infType) ("STA Driver version-%s\n", STA_DRIVER_VERSION)); #ifdef RTMP_MAC_USB - init_MUTEX(&(pAd->UsbVendorReq_semaphore)); + sema_init(&(pAd->UsbVendorReq_semaphore), 1); os_alloc_mem(pAd, (u8 **) & pAd->UsbVendorReqBuf, MAX_PARAM_BUFFER_SIZE - 1); if (pAd->UsbVendorReqBuf == NULL) { diff --git a/drivers/staging/sm7xx/smtcfb.c b/drivers/staging/sm7xx/smtcfb.c index 161dbc9..dfb1911 100644 --- a/drivers/staging/sm7xx/smtcfb.c +++ b/drivers/staging/sm7xx/smtcfb.c @@ -1151,9 +1151,9 @@ static int __maybe_unused smtcfb_suspend(struct pci_dev *pdev, pm_message_t msg) /* when doing suspend, call fb apis and pci apis */ if (msg.event == PM_EVENT_SUSPEND) { - acquire_console_sem(); + acquire_console_mutex(); fb_set_suspend(&sfb->fb, 1); - release_console_sem(); + release_console_mutex(); retv = pci_save_state(pdev); pci_disable_device(pdev); retv = pci_choose_state(pdev, msg); @@ -1212,9 +1212,9 @@ static int __maybe_unused smtcfb_resume(struct pci_dev *pdev) smtcfb_setmode(sfb); - acquire_console_sem(); + acquire_console_mutex(); fb_set_suspend(&sfb->fb, 0); - release_console_sem(); + release_console_mutex(); return 0; } diff --git a/drivers/staging/vme/devices/vme_user.c b/drivers/staging/vme/devices/vme_user.c index e228942..5c73d50 100644 --- a/drivers/staging/vme/devices/vme_user.c +++ b/drivers/staging/vme/devices/vme_user.c @@ -639,7 +639,7 @@ static int __init vme_user_probe(struct device *dev, int cur_bus, int cur_slot) for (i = 0; i < VME_DEVS; i++) { 
image[i].kern_buf = NULL; image[i].pci_buf = 0; - init_MUTEX(&(image[i].sem)); + sema_init(&(image[i].sem), 1); image[i].device = NULL; image[i].resource = NULL; image[i].users = 0; diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 6e8bcdf..1f07cbf 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -393,8 +393,9 @@ static void async_completed(struct urb *urb) uid_t euid = 0; u32 secid = 0; int signr; + unsigned long flags; - spin_lock(&ps->lock); + spin_lock_irqsave(&ps->lock, flags); list_move_tail(&as->asynclist, &ps->async_completed); as->status = urb->status; signr = as->signr; @@ -414,7 +415,7 @@ static void async_completed(struct urb *urb) if (as->status < 0 && as->bulk_addr && as->status != -ECONNRESET && as->status != -ENOENT) cancel_bulk_urbs(ps, as->bulk_addr); - spin_unlock(&ps->lock); + spin_unlock_irqrestore(&ps->lock, flags); if (signr) kill_pid_info_as_uid(sinfo.si_signo, &sinfo, pid, uid, @@ -1312,9 +1313,9 @@ static int processcompl(struct async *as, void __user * __user *arg) void __user *addr = as->userurb; unsigned int i; - if (as->userbuffer) + if (as->userbuffer && urb->actual_length) if (copy_to_user(as->userbuffer, urb->transfer_buffer, - urb->transfer_buffer_length)) + urb->actual_length)) goto err_out; if (put_user(as->status, &userurb->status)) goto err_out; @@ -1334,14 +1335,11 @@ static int processcompl(struct async *as, void __user * __user *arg) } } - free_async(as); - if (put_user(addr, (void __user * __user *)arg)) return -EFAULT; return 0; err_out: - free_async(as); return -EFAULT; } @@ -1371,8 +1369,11 @@ static struct async *reap_as(struct dev_state *ps) static int proc_reapurb(struct dev_state *ps, void __user *arg) { struct async *as = reap_as(ps); - if (as) - return processcompl(as, (void __user * __user *)arg); + if (as) { + int retval = processcompl(as, (void __user * __user *)arg); + free_async(as); + return retval; + } if (signal_pending(current)) return -EINTR; return -EIO; @@ 
-1380,11 +1381,16 @@ static int proc_reapurb(struct dev_state *ps, void __user *arg) static int proc_reapurbnonblock(struct dev_state *ps, void __user *arg) { + int retval; struct async *as; - if (!(as = async_getcompleted(ps))) - return -EAGAIN; - return processcompl(as, (void __user * __user *)arg); + as = async_getcompleted(ps); + retval = -EAGAIN; + if (as) { + retval = processcompl(as, (void __user * __user *)arg); + free_async(as); + } + return retval; } #ifdef CONFIG_COMPAT @@ -1475,9 +1481,9 @@ static int processcompl_compat(struct async *as, void __user * __user *arg) void __user *addr = as->userurb; unsigned int i; - if (as->userbuffer) + if (as->userbuffer && urb->actual_length) if (copy_to_user(as->userbuffer, urb->transfer_buffer, - urb->transfer_buffer_length)) + urb->actual_length)) return -EFAULT; if (put_user(as->status, &userurb->status)) return -EFAULT; @@ -1497,7 +1503,6 @@ static int processcompl_compat(struct async *as, void __user * __user *arg) } } - free_async(as); if (put_user(ptr_to_compat(addr), (u32 __user *)arg)) return -EFAULT; return 0; @@ -1506,8 +1511,11 @@ static int processcompl_compat(struct async *as, void __user * __user *arg) static int proc_reapurb_compat(struct dev_state *ps, void __user *arg) { struct async *as = reap_as(ps); - if (as) - return processcompl_compat(as, (void __user * __user *)arg); + if (as) { + int retval = processcompl_compat(as, (void __user * __user *)arg); + free_async(as); + return retval; + } if (signal_pending(current)) return -EINTR; return -EIO; @@ -1515,11 +1523,16 @@ static int proc_reapurb_compat(struct dev_state *ps, void __user *arg) static int proc_reapurbnonblock_compat(struct dev_state *ps, void __user *arg) { + int retval; struct async *as; - if (!(as = async_getcompleted(ps))) - return -EAGAIN; - return processcompl_compat(as, (void __user * __user *)arg); + retval = -EAGAIN; + as = async_getcompleted(ps); + if (as) { + retval = processcompl_compat(as, (void __user * __user *)arg); + 
free_async(as); + } + return retval; } diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index 60a45f1..66dc262 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -470,10 +470,10 @@ void usb_driver_release_interface(struct usb_driver *driver, if (device_is_registered(dev)) { device_release_driver(dev); } else { - down(&dev->sem); + mutex_lock(&dev->mutex); usb_unbind_interface(dev); dev->driver = NULL; - up(&dev->sem); + mutex_unlock(&dev->mutex); } } EXPORT_SYMBOL_GPL(usb_driver_release_interface); diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 80995ef..94a217a 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1946,7 +1946,7 @@ irqreturn_t usb_hcd_irq (int irq, void *__hcd) * when the first handler doesn't use it. So let's just * assume it's never used. */ - local_irq_save(flags); + local_irq_save_nort(flags); if (unlikely(hcd->state == HC_STATE_HALT || !test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags))) { @@ -1961,7 +1961,7 @@ irqreturn_t usb_hcd_irq (int irq, void *__hcd) rc = IRQ_HANDLED; } - local_irq_restore(flags); + local_irq_restore_nort(flags); return rc; } diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index 9bc95fe..405f479 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -269,8 +269,9 @@ static void sg_complete(struct urb *urb) { struct usb_sg_request *io = urb->context; int status = urb->status; + unsigned long flags; - spin_lock(&io->lock); + spin_lock_irqsave (&io->lock, flags); /* In 2.5 we require hcds' endpoint queues not to progress after fault * reports, until the completion callback (this!) returns. That lets @@ -304,7 +305,7 @@ static void sg_complete(struct urb *urb) * unlink pending urbs so they won't rx/tx bad data. * careful: unlink can sometimes be synchronous... 
*/ - spin_unlock(&io->lock); + spin_unlock_irqrestore (&io->lock, flags); for (i = 0, found = 0; i < io->entries; i++) { if (!io->urbs [i] || !io->urbs [i]->dev) continue; @@ -319,7 +320,7 @@ static void sg_complete(struct urb *urb) } else if (urb == io->urbs [i]) found = 1; } - spin_lock(&io->lock); + spin_lock_irqsave (&io->lock, flags); } urb->dev = NULL; @@ -329,7 +330,7 @@ static void sg_complete(struct urb *urb) if (!io->count) complete(&io->complete); - spin_unlock(&io->lock); + spin_unlock_irqrestore (&io->lock, flags); } @@ -626,7 +627,7 @@ void usb_sg_cancel(struct usb_sg_request *io) int i; io->status = -ECONNRESET; - spin_unlock(&io->lock); + spin_unlock_irqrestore(&io->lock, flags); for (i = 0; i < io->entries; i++) { int retval; @@ -637,7 +638,7 @@ void usb_sg_cancel(struct usb_sg_request *io) dev_warn(&io->dev->dev, "%s, unlink --> %d\n", __func__, retval); } - spin_lock(&io->lock); + spin_lock_irqsave(&io->lock, flags); } spin_unlock_irqrestore(&io->lock, flags); } diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index 0daff0d..d9ebe7c 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -601,7 +601,7 @@ int usb_lock_device_for_reset(struct usb_device *udev, iface->condition == USB_INTERFACE_UNBOUND)) return -EINTR; - while (usb_trylock_device(udev) != 0) { + while (!usb_trylock_device(udev)) { /* If we can't acquire the lock after waiting one second, * we're probably deadlocked */ diff --git a/drivers/usb/gadget/f_eem.c b/drivers/usb/gadget/f_eem.c index 0a577d5..d4f0db5 100644 --- a/drivers/usb/gadget/f_eem.c +++ b/drivers/usb/gadget/f_eem.c @@ -358,7 +358,7 @@ done: * b15: bmType (0 == data) */ len = skb->len; - put_unaligned_le16((len & 0x3FFF) | BIT(14), skb_push(skb, 2)); + put_unaligned_le16(len & 0x3FFF, skb_push(skb, 2)); /* add a zero-length EEM packet, if needed */ if (padlen) @@ -464,7 +464,6 @@ static int eem_unwrap(struct gether *port, } /* validate CRC */ - crc = get_unaligned_le32(skb->data + len - 
ETH_FCS_LEN); if (header & BIT(14)) { crc = get_unaligned_le32(skb->data + len - ETH_FCS_LEN); diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index bf0f652..de8a838 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -194,7 +194,7 @@ enum ep_state { }; struct ep_data { - struct semaphore lock; + struct mutex lock; enum ep_state state; atomic_t count; struct dev_data *dev; @@ -298,10 +298,10 @@ get_ready_ep (unsigned f_flags, struct ep_data *epdata) int val; if (f_flags & O_NONBLOCK) { - if (down_trylock (&epdata->lock) != 0) + if (!mutex_trylock(&epdata->lock)) goto nonblock; if (epdata->state != STATE_EP_ENABLED) { - up (&epdata->lock); + mutex_unlock(&epdata->lock); nonblock: val = -EAGAIN; } else @@ -309,7 +309,8 @@ nonblock: return val; } - if ((val = down_interruptible (&epdata->lock)) < 0) + val = mutex_lock_interruptible(&epdata->lock); + if (val < 0) return val; switch (epdata->state) { @@ -323,7 +324,7 @@ nonblock: // FALLTHROUGH case STATE_EP_UNBOUND: /* clean disconnect */ val = -ENODEV; - up (&epdata->lock); + mutex_unlock(&epdata->lock); } return val; } @@ -393,7 +394,7 @@ ep_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr) if (likely (data->ep != NULL)) usb_ep_set_halt (data->ep); spin_unlock_irq (&data->dev->lock); - up (&data->lock); + mutex_unlock(&data->lock); return -EBADMSG; } @@ -411,7 +412,7 @@ ep_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr) value = -EFAULT; free1: - up (&data->lock); + mutex_unlock(&data->lock); kfree (kbuf); return value; } @@ -436,7 +437,7 @@ ep_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) if (likely (data->ep != NULL)) usb_ep_set_halt (data->ep); spin_unlock_irq (&data->dev->lock); - up (&data->lock); + mutex_unlock(&data->lock); return -EBADMSG; } @@ -455,7 +456,7 @@ ep_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) VDEBUG (data->dev, "%s write %zu IN, status %d\n", data->name, len, 
(int) value); free1: - up (&data->lock); + mutex_unlock(&data->lock); kfree (kbuf); return value; } @@ -466,7 +467,8 @@ ep_release (struct inode *inode, struct file *fd) struct ep_data *data = fd->private_data; int value; - if ((value = down_interruptible(&data->lock)) < 0) + value = mutex_lock_interruptible(&data->lock); + if (value < 0) return value; /* clean up if this can be reopened */ @@ -476,7 +478,7 @@ ep_release (struct inode *inode, struct file *fd) data->hs_desc.bDescriptorType = 0; usb_ep_disable(data->ep); } - up (&data->lock); + mutex_unlock(&data->lock); put_ep (data); return 0; } @@ -507,7 +509,7 @@ static long ep_ioctl(struct file *fd, unsigned code, unsigned long value) } else status = -ENODEV; spin_unlock_irq (&data->dev->lock); - up (&data->lock); + mutex_unlock(&data->lock); return status; } @@ -673,7 +675,7 @@ fail: value = -ENODEV; spin_unlock_irq(&epdata->dev->lock); - up(&epdata->lock); + mutex_unlock(&epdata->lock); if (unlikely(value)) { kfree(priv); @@ -765,7 +767,8 @@ ep_config (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) u32 tag; int value, length = len; - if ((value = down_interruptible (&data->lock)) < 0) + value = mutex_lock_interruptible(&data->lock); + if (value < 0) return value; if (data->state != STATE_EP_READY) { @@ -854,7 +857,7 @@ fail: data->desc.bDescriptorType = 0; data->hs_desc.bDescriptorType = 0; } - up (&data->lock); + mutex_unlock(&data->lock); return value; fail0: value = -EINVAL; @@ -870,7 +873,7 @@ ep_open (struct inode *inode, struct file *fd) struct ep_data *data = inode->i_private; int value = -EBUSY; - if (down_interruptible (&data->lock) != 0) + if (mutex_lock_interruptible(&data->lock) != 0) return -EINTR; spin_lock_irq (&data->dev->lock); if (data->dev->state == STATE_DEV_UNBOUND) @@ -885,7 +888,7 @@ ep_open (struct inode *inode, struct file *fd) DBG (data->dev, "%s state %d\n", data->name, data->state); spin_unlock_irq (&data->dev->lock); - up (&data->lock); + 
mutex_unlock(&data->lock); return value; } @@ -1631,7 +1634,7 @@ static int activate_ep_files (struct dev_data *dev) if (!data) goto enomem0; data->state = STATE_EP_DISABLED; - init_MUTEX (&data->lock); + mutex_init(&data->lock); init_waitqueue_head (&data->wait); strncpy (data->name, ep->name, sizeof (data->name) - 1); diff --git a/drivers/usb/gadget/multi.c b/drivers/usb/gadget/multi.c index 4295601..76496f5 100644 --- a/drivers/usb/gadget/multi.c +++ b/drivers/usb/gadget/multi.c @@ -29,7 +29,7 @@ #if defined USB_ETH_RNDIS # undef USB_ETH_RNDIS #endif -#ifdef CONFIG_USB_ETH_RNDIS +#ifdef CONFIG_USB_G_MULTI_RNDIS # define USB_ETH_RNDIS y #endif diff --git a/drivers/usb/gadget/r8a66597-udc.c b/drivers/usb/gadget/r8a66597-udc.c index e220fb8..8b45145 100644 --- a/drivers/usb/gadget/r8a66597-udc.c +++ b/drivers/usb/gadget/r8a66597-udc.c @@ -26,6 +26,7 @@ #include <linux/io.h> #include <linux/platform_device.h> #include <linux/clk.h> +#include <linux/err.h> #include <linux/usb/ch9.h> #include <linux/usb/gadget.h> diff --git a/drivers/usb/gadget/s3c-hsotg.c b/drivers/usb/gadget/s3c-hsotg.c index 4b5dbd0..5fc80a1 100644 --- a/drivers/usb/gadget/s3c-hsotg.c +++ b/drivers/usb/gadget/s3c-hsotg.c @@ -2582,6 +2582,7 @@ err: hsotg->gadget.dev.driver = NULL; return ret; } +EXPORT_SYMBOL(usb_gadget_register_driver); int usb_gadget_unregister_driver(struct usb_gadget_driver *driver) { diff --git a/drivers/usb/host/ehci-hub.c b/drivers/usb/host/ehci-hub.c index c75d927..1937267 100644 --- a/drivers/usb/host/ehci-hub.c +++ b/drivers/usb/host/ehci-hub.c @@ -196,7 +196,9 @@ static int ehci_bus_suspend (struct usb_hcd *hcd) if (hostpc_reg) { u32 t3; + spin_unlock_irq(&ehci->lock); msleep(5);/* 5ms for HCD enter low pwr mode */ + spin_lock_irq(&ehci->lock); t3 = ehci_readl(ehci, hostpc_reg); ehci_writel(ehci, t3 | HOSTPC_PHCD, hostpc_reg); t3 = ehci_readl(ehci, hostpc_reg); @@ -904,17 +906,18 @@ static int ehci_hub_control ( if ((temp & PORT_PE) == 0 || (temp & PORT_RESET) != 0) goto 
error; - ehci_writel(ehci, temp | PORT_SUSPEND, status_reg); + /* After above check the port must be connected. * Set appropriate bit thus could put phy into low power * mode if we have hostpc feature */ + temp &= ~PORT_WKCONN_E; + temp |= PORT_WKDISC_E | PORT_WKOC_E; + ehci_writel(ehci, temp | PORT_SUSPEND, status_reg); if (hostpc_reg) { - temp &= ~PORT_WKCONN_E; - temp |= (PORT_WKDISC_E | PORT_WKOC_E); - ehci_writel(ehci, temp | PORT_SUSPEND, - status_reg); + spin_unlock_irqrestore(&ehci->lock, flags); msleep(5);/* 5ms for HCD enter low pwr mode */ + spin_lock_irqsave(&ehci->lock, flags); temp1 = ehci_readl(ehci, hostpc_reg); ehci_writel(ehci, temp1 | HOSTPC_PHCD, hostpc_reg); diff --git a/drivers/usb/host/fhci-tds.c b/drivers/usb/host/fhci-tds.c index d224ab4..e123289 100644 --- a/drivers/usb/host/fhci-tds.c +++ b/drivers/usb/host/fhci-tds.c @@ -105,7 +105,7 @@ void fhci_ep0_free(struct fhci_usb *usb) if (ep->td_base) cpm_muram_free(cpm_muram_offset(ep->td_base)); - if (ep->conf_frame_Q) { + if (kfifo_initialized(&ep->conf_frame_Q)) { size = cq_howmany(&ep->conf_frame_Q); for (; size; size--) { struct packet *pkt = cq_get(&ep->conf_frame_Q); @@ -115,7 +115,7 @@ void fhci_ep0_free(struct fhci_usb *usb) cq_delete(&ep->conf_frame_Q); } - if (ep->empty_frame_Q) { + if (kfifo_initialized(&ep->empty_frame_Q)) { size = cq_howmany(&ep->empty_frame_Q); for (; size; size--) { struct packet *pkt = cq_get(&ep->empty_frame_Q); @@ -125,7 +125,7 @@ void fhci_ep0_free(struct fhci_usb *usb) cq_delete(&ep->empty_frame_Q); } - if (ep->dummy_packets_Q) { + if (kfifo_initialized(&ep->dummy_packets_Q)) { size = cq_howmany(&ep->dummy_packets_Q); for (; size; size--) { u8 *buff = cq_get(&ep->dummy_packets_Q); diff --git a/drivers/usb/misc/ftdi-elan.c b/drivers/usb/misc/ftdi-elan.c index 9d0675e..d5e0bdb 100644 --- a/drivers/usb/misc/ftdi-elan.c +++ b/drivers/usb/misc/ftdi-elan.c @@ -2766,7 +2766,7 @@ static int ftdi_elan_probe(struct usb_interface *interface, ftdi->sequence_num = 
++ftdi_instances; mutex_unlock(&ftdi_module_lock); ftdi_elan_init_kref(ftdi); - init_MUTEX(&ftdi->sw_lock); + sema_init(&ftdi->sw_lock, 1); ftdi->udev = usb_get_dev(interface_to_usbdev(interface)); ftdi->interface = interface; mutex_init(&ftdi->u132_lock); diff --git a/drivers/usb/misc/sisusbvga/sisusb.c b/drivers/usb/misc/sisusbvga/sisusb.c index 0025847..8b37a4b 100644 --- a/drivers/usb/misc/sisusbvga/sisusb.c +++ b/drivers/usb/misc/sisusbvga/sisusb.c @@ -3245,6 +3245,7 @@ static struct usb_device_id sisusb_table [] = { { USB_DEVICE(0x0711, 0x0902) }, { USB_DEVICE(0x0711, 0x0903) }, { USB_DEVICE(0x0711, 0x0918) }, + { USB_DEVICE(0x0711, 0x0920) }, { USB_DEVICE(0x182d, 0x021c) }, { USB_DEVICE(0x182d, 0x0269) }, { } diff --git a/drivers/usb/otg/Kconfig b/drivers/usb/otg/Kconfig index de56b3d..3d2d3e5 100644 --- a/drivers/usb/otg/Kconfig +++ b/drivers/usb/otg/Kconfig @@ -44,6 +44,7 @@ config ISP1301_OMAP config USB_ULPI bool "Generic ULPI Transceiver Driver" depends on ARM + select USB_OTG_UTILS help Enable this to support ULPI connected USB OTG transceivers which are likely found on embedded boards. diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index 216f187..7638828 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -50,7 +50,7 @@ * Version Information */ #define DRIVER_VERSION "v1.5.0" -#define DRIVER_AUTHOR "Greg Kroah-Hartman <greg@kroah.com>, Bill Ryder <bryder@sgi.com>, Kuba Ober <kuba@mareimbrium.org>" +#define DRIVER_AUTHOR "Greg Kroah-Hartman <greg@kroah.com>, Bill Ryder <bryder@sgi.com>, Kuba Ober <kuba@mareimbrium.org>, Andreas Mohr" #define DRIVER_DESC "USB FTDI Serial Converters Driver" static int debug; @@ -145,10 +145,15 @@ static struct ftdi_sio_quirk ftdi_HE_TIRA1_quirk = { +/* + * Device ID not listed? Test via module params product/vendor or + * /sys/bus/usb/ftdi_sio/new_id, then send patch/report! 
+ */ static struct usb_device_id id_table_combined [] = { { USB_DEVICE(FTDI_VID, FTDI_AMC232_PID) }, { USB_DEVICE(FTDI_VID, FTDI_CANUSB_PID) }, { USB_DEVICE(FTDI_VID, FTDI_CANDAPTER_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_NXTCAM_PID) }, { USB_DEVICE(FTDI_VID, FTDI_SCS_DEVICE_0_PID) }, { USB_DEVICE(FTDI_VID, FTDI_SCS_DEVICE_1_PID) }, { USB_DEVICE(FTDI_VID, FTDI_SCS_DEVICE_2_PID) }, @@ -552,9 +557,16 @@ static struct usb_device_id id_table_combined [] = { { USB_DEVICE(FTDI_VID, FTDI_IBS_PEDO_PID) }, { USB_DEVICE(FTDI_VID, FTDI_IBS_PROD_PID) }, /* - * Due to many user requests for multiple ELV devices we enable - * them by default. + * ELV devices: */ + { USB_DEVICE(FTDI_VID, FTDI_ELV_USR_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_MSM1_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_KL100_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_WS550_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_EC3000_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_WS888_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_TWS550_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_FEM_PID) }, { USB_DEVICE(FTDI_VID, FTDI_ELV_CLI7000_PID) }, { USB_DEVICE(FTDI_VID, FTDI_ELV_PPS7330_PID) }, { USB_DEVICE(FTDI_VID, FTDI_ELV_TFM100_PID) }, @@ -571,11 +583,17 @@ static struct usb_device_id id_table_combined [] = { { USB_DEVICE(FTDI_VID, FTDI_ELV_PCK100_PID) }, { USB_DEVICE(FTDI_VID, FTDI_ELV_RFP500_PID) }, { USB_DEVICE(FTDI_VID, FTDI_ELV_FS20SIG_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_UTP8_PID) }, { USB_DEVICE(FTDI_VID, FTDI_ELV_WS300PC_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_WS444PC_PID) }, { USB_DEVICE(FTDI_VID, FTDI_ELV_FHZ1300PC_PID) }, { USB_DEVICE(FTDI_VID, FTDI_ELV_EM1010PC_PID) }, { USB_DEVICE(FTDI_VID, FTDI_ELV_WS500_PID) }, { USB_DEVICE(FTDI_VID, FTDI_ELV_HS485_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_UMS100_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_TFD128_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_FM3RX_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_WS777_PID) }, { USB_DEVICE(FTDI_VID, LINX_SDMUSBQSS_PID) }, { USB_DEVICE(FTDI_VID, 
LINX_MASTERDEVEL2_PID) }, { USB_DEVICE(FTDI_VID, LINX_FUTURE_0_PID) }, @@ -697,6 +715,7 @@ static struct usb_device_id id_table_combined [] = { { USB_DEVICE(RATOC_VENDOR_ID, RATOC_PRODUCT_ID_USB60F) }, { USB_DEVICE(FTDI_VID, FTDI_REU_TINY_PID) }, { USB_DEVICE(PAPOUCH_VID, PAPOUCH_QUIDO4x4_PID) }, + { USB_DEVICE(PAPOUCH_VID, PAPOUCH_AD4USB_PID) }, { USB_DEVICE(FTDI_VID, FTDI_DOMINTELL_DGQG_PID) }, { USB_DEVICE(FTDI_VID, FTDI_DOMINTELL_DUSB_PID) }, { USB_DEVICE(ALTI2_VID, ALTI2_N3_PID) }, diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h index da92b49..c8951ae 100644 --- a/drivers/usb/serial/ftdi_sio_ids.h +++ b/drivers/usb/serial/ftdi_sio_ids.h @@ -38,6 +38,8 @@ /* www.candapter.com Ewert Energy Systems CANdapter device */ #define FTDI_CANDAPTER_PID 0x9F80 /* Product Id */ +#define FTDI_NXTCAM_PID 0xABB8 /* NXTCam for Mindstorms NXT */ + /* OOCDlink by Joern Kaipf <joernk@web.de> * (http://www.joernonline.de/dw/doku.php?id=start&idx=projects:oocdlink) */ #define FTDI_OOCDLINK_PID 0xbaf8 /* Amontec JTAGkey */ @@ -161,22 +163,37 @@ /* * ELV USB devices submitted by Christian Abt of ELV (www.elv.de). * All of these devices use FTDI's vendor ID (0x0403). + * Further IDs taken from ELV Windows .inf file. * * The previously included PID for the UO 100 module was incorrect. * In fact, that PID was for ELV's UR 100 USB-RS232 converter (0xFB58). * * Armin Laeuger originally sent the PID for the UM 100 module. 
*/ +#define FTDI_ELV_USR_PID 0xE000 /* ELV Universal-Sound-Recorder */ +#define FTDI_ELV_MSM1_PID 0xE001 /* ELV Mini-Sound-Modul */ +#define FTDI_ELV_KL100_PID 0xE002 /* ELV Kfz-Leistungsmesser KL 100 */ +#define FTDI_ELV_WS550_PID 0xE004 /* WS 550 */ +#define FTDI_ELV_EC3000_PID 0xE006 /* ENERGY CONTROL 3000 USB */ +#define FTDI_ELV_WS888_PID 0xE008 /* WS 888 */ +#define FTDI_ELV_TWS550_PID 0xE009 /* Technoline WS 550 */ +#define FTDI_ELV_FEM_PID 0xE00A /* Funk Energie Monitor */ #define FTDI_ELV_FHZ1300PC_PID 0xE0E8 /* FHZ 1300 PC */ #define FTDI_ELV_WS500_PID 0xE0E9 /* PC-Wetterstation (WS 500) */ #define FTDI_ELV_HS485_PID 0xE0EA /* USB to RS-485 adapter */ +#define FTDI_ELV_UMS100_PID 0xE0EB /* ELV USB Master-Slave Schaltsteckdose UMS 100 */ +#define FTDI_ELV_TFD128_PID 0xE0EC /* ELV Temperatur-Feuchte-Datenlogger TFD 128 */ +#define FTDI_ELV_FM3RX_PID 0xE0ED /* ELV Messwertuebertragung FM3 RX */ +#define FTDI_ELV_WS777_PID 0xE0EE /* Conrad WS 777 */ #define FTDI_ELV_EM1010PC_PID 0xE0EF /* Engery monitor EM 1010 PC */ #define FTDI_ELV_CSI8_PID 0xE0F0 /* Computer-Schalt-Interface (CSI 8) */ #define FTDI_ELV_EM1000DL_PID 0xE0F1 /* PC-Datenlogger fuer Energiemonitor (EM 1000 DL) */ #define FTDI_ELV_PCK100_PID 0xE0F2 /* PC-Kabeltester (PCK 100) */ #define FTDI_ELV_RFP500_PID 0xE0F3 /* HF-Leistungsmesser (RFP 500) */ #define FTDI_ELV_FS20SIG_PID 0xE0F4 /* Signalgeber (FS 20 SIG) */ +#define FTDI_ELV_UTP8_PID 0xE0F5 /* ELV UTP 8 */ #define FTDI_ELV_WS300PC_PID 0xE0F6 /* PC-Wetterstation (WS 300 PC) */ +#define FTDI_ELV_WS444PC_PID 0xE0F7 /* Conrad WS 444 PC */ #define FTDI_PHI_FISCO_PID 0xE40B /* PHI Fisco USB to Serial cable */ #define FTDI_ELV_UAD8_PID 0xF068 /* USB-AD-Wandler (UAD 8) */ #define FTDI_ELV_UDA7_PID 0xF069 /* USB-DA-Wandler (UDA 7) */ @@ -968,6 +985,7 @@ #define PAPOUCH_VID 0x5050 /* Vendor ID */ #define PAPOUCH_TMU_PID 0x0400 /* TMU USB Thermometer */ #define PAPOUCH_QUIDO4x4_PID 0x0900 /* Quido 4/4 Module */ +#define PAPOUCH_AD4USB_PID 0x8003 /* 
AD4USB Measurement Module */ /* * Marvell SheevaPlug diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c index ac1b644..3eb6143 100644 --- a/drivers/usb/serial/sierra.c +++ b/drivers/usb/serial/sierra.c @@ -298,6 +298,7 @@ static struct usb_device_id id_table [] = { { USB_DEVICE(0x1199, 0x68A3), /* Sierra Wireless Direct IP modems */ .driver_info = (kernel_ulong_t)&direct_ip_interface_blacklist }, + { USB_DEVICE(0x413C, 0x08133) }, /* Dell Computer Corp. Wireless 5720 VZW Mobile Broadband (EVDO Rev-A) Minicard GPS Port */ { } }; diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index c932f90..49575fb 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -941,7 +941,7 @@ UNUSUAL_DEV( 0x07ab, 0xfccd, 0x0000, 0x9999, UNUSUAL_DEV( 0x07af, 0x0004, 0x0100, 0x0133, "Microtech", "USB-SCSI-DB25", - US_SC_SCSI, US_PR_BULK, usb_stor_euscsi_init, + US_SC_DEVICE, US_PR_DEVICE, usb_stor_euscsi_init, US_FL_SCM_MULT_TARG ), UNUSUAL_DEV( 0x07af, 0x0005, 0x0100, 0x0100, diff --git a/drivers/uwb/umc-bus.c b/drivers/uwb/umc-bus.c index cdd6c8e..f063451 100644 --- a/drivers/uwb/umc-bus.c +++ b/drivers/uwb/umc-bus.c @@ -62,12 +62,12 @@ int umc_controller_reset(struct umc_dev *umc) struct device *parent = umc->dev.parent; int ret = 0; - if(down_trylock(&parent->sem)) + if (!mutex_trylock(&parent->mutex)) return -EAGAIN; ret = device_for_each_child(parent, parent, umc_bus_pre_reset_helper); if (ret >= 0) ret = device_for_each_child(parent, parent, umc_bus_post_reset_helper); - up(&parent->sem); + mutex_unlock(&parent->mutex); return ret; } diff --git a/drivers/uwb/uwb-internal.h b/drivers/uwb/uwb-internal.h index d5bcfc1..17b10b9 100644 --- a/drivers/uwb/uwb-internal.h +++ b/drivers/uwb/uwb-internal.h @@ -366,12 +366,12 @@ struct dentry *uwb_dbg_create_pal_dir(struct uwb_pal *pal); static inline void uwb_dev_lock(struct uwb_dev *uwb_dev) { - down(&uwb_dev->dev.sem); + 
mutex_lock(&uwb_dev->dev.mutex); } static inline void uwb_dev_unlock(struct uwb_dev *uwb_dev) { - up(&uwb_dev->dev.sem); + mutex_unlock(&uwb_dev->dev.mutex); } #endif /* #ifndef __UWB_INTERNAL_H__ */ diff --git a/drivers/video/arkfb.c b/drivers/video/arkfb.c index d583bea..de7b771 100644 --- a/drivers/video/arkfb.c +++ b/drivers/video/arkfb.c @@ -23,7 +23,7 @@ #include <linux/svga.h> #include <linux/init.h> #include <linux/pci.h> -#include <linux/console.h> /* Why should fb driver call console functions? because acquire_console_sem() */ +#include <linux/console.h> /* Why should fb driver call console functions? because acquire_console_mutex() */ #include <video/vga.h> #ifdef CONFIG_MTRR @@ -1091,12 +1091,12 @@ static int ark_pci_suspend (struct pci_dev* dev, pm_message_t state) dev_info(info->device, "suspend\n"); - acquire_console_sem(); + acquire_console_mutex(); mutex_lock(&(par->open_lock)); if ((state.event == PM_EVENT_FREEZE) || (par->ref_count == 0)) { mutex_unlock(&(par->open_lock)); - release_console_sem(); + release_console_mutex(); return 0; } @@ -1107,7 +1107,7 @@ static int ark_pci_suspend (struct pci_dev* dev, pm_message_t state) pci_set_power_state(dev, pci_choose_state(dev, state)); mutex_unlock(&(par->open_lock)); - release_console_sem(); + release_console_mutex(); return 0; } @@ -1122,7 +1122,7 @@ static int ark_pci_resume (struct pci_dev* dev) dev_info(info->device, "resume\n"); - acquire_console_sem(); + acquire_console_mutex(); mutex_lock(&(par->open_lock)); if (par->ref_count == 0) @@ -1141,7 +1141,7 @@ static int ark_pci_resume (struct pci_dev* dev) fail: mutex_unlock(&(par->open_lock)); - release_console_sem(); + release_console_mutex(); return 0; } #else diff --git a/drivers/video/aty/aty128fb.c b/drivers/video/aty/aty128fb.c index e4e4d43..02d45ba 100644 --- a/drivers/video/aty/aty128fb.c +++ b/drivers/video/aty/aty128fb.c @@ -1858,11 +1858,11 @@ static void aty128_early_resume(void *data) { struct aty128fb_par *par = data; - if 
(try_acquire_console_sem()) + if (try_acquire_console_mutex()) return; pci_restore_state(par->pdev); aty128_do_resume(par->pdev); - release_console_sem(); + release_console_mutex(); } #endif /* CONFIG_PPC_PMAC */ @@ -2436,7 +2436,7 @@ static int aty128_pci_suspend(struct pci_dev *pdev, pm_message_t state) printk(KERN_DEBUG "aty128fb: suspending...\n"); - acquire_console_sem(); + acquire_console_mutex(); fb_set_suspend(info, 1); @@ -2468,7 +2468,7 @@ static int aty128_pci_suspend(struct pci_dev *pdev, pm_message_t state) if (state.event != PM_EVENT_ON) aty128_set_suspend(par, 1); - release_console_sem(); + release_console_mutex(); pdev->dev.power.power_state = state; @@ -2525,9 +2525,9 @@ static int aty128_pci_resume(struct pci_dev *pdev) { int rc; - acquire_console_sem(); + acquire_console_mutex(); rc = aty128_do_resume(pdev); - release_console_sem(); + release_console_mutex(); return rc; } diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c index 1ddeb4c..a8bbb83 100644 --- a/drivers/video/aty/atyfb_base.c +++ b/drivers/video/aty/atyfb_base.c @@ -2073,7 +2073,7 @@ static int atyfb_pci_suspend(struct pci_dev *pdev, pm_message_t state) if (state.event == pdev->dev.power.power_state.event) return 0; - acquire_console_sem(); + acquire_console_mutex(); fb_set_suspend(info, 1); @@ -2101,14 +2101,14 @@ static int atyfb_pci_suspend(struct pci_dev *pdev, pm_message_t state) par->lock_blank = 0; atyfb_blank(FB_BLANK_UNBLANK, info); fb_set_suspend(info, 0); - release_console_sem(); + release_console_mutex(); return -EIO; } #else pci_set_power_state(pdev, pci_choose_state(pdev, state)); #endif - release_console_sem(); + release_console_mutex(); pdev->dev.power.power_state = state; @@ -2137,7 +2137,7 @@ static int atyfb_pci_resume(struct pci_dev *pdev) if (pdev->dev.power.power_state.event == PM_EVENT_ON) return 0; - acquire_console_sem(); + acquire_console_mutex(); /* * PCI state will have been restored by the core, so @@ -2165,7 +2165,7 @@ static int 
atyfb_pci_resume(struct pci_dev *pdev) par->lock_blank = 0; atyfb_blank(FB_BLANK_UNBLANK, info); - release_console_sem(); + release_console_mutex(); pdev->dev.power.power_state = PMSG_ON; diff --git a/drivers/video/aty/radeon_pm.c b/drivers/video/aty/radeon_pm.c index 515cf19..5490e21 100644 --- a/drivers/video/aty/radeon_pm.c +++ b/drivers/video/aty/radeon_pm.c @@ -2626,7 +2626,7 @@ int radeonfb_pci_suspend(struct pci_dev *pdev, pm_message_t mesg) goto done; } - acquire_console_sem(); + acquire_console_mutex(); fb_set_suspend(info, 1); @@ -2690,7 +2690,7 @@ int radeonfb_pci_suspend(struct pci_dev *pdev, pm_message_t mesg) if (rinfo->pm_mode & radeon_pm_d2) radeon_set_suspend(rinfo, 1); - release_console_sem(); + release_console_mutex(); done: pdev->dev.power.power_state = mesg; @@ -2715,10 +2715,10 @@ int radeonfb_pci_resume(struct pci_dev *pdev) return 0; if (rinfo->no_schedule) { - if (try_acquire_console_sem()) + if (try_acquire_console_mutex()) return 0; } else - acquire_console_sem(); + acquire_console_mutex(); printk(KERN_DEBUG "radeonfb (%s): resuming from state: %d...\n", pci_name(pdev), pdev->dev.power.power_state.event); @@ -2783,7 +2783,7 @@ int radeonfb_pci_resume(struct pci_dev *pdev) pdev->dev.power.power_state = PMSG_ON; bail: - release_console_sem(); + release_console_mutex(); return rc; } diff --git a/drivers/video/chipsfb.c b/drivers/video/chipsfb.c index 57b9d27..fda91e4 100644 --- a/drivers/video/chipsfb.c +++ b/drivers/video/chipsfb.c @@ -461,10 +461,10 @@ static int chipsfb_pci_suspend(struct pci_dev *pdev, pm_message_t state) if (!(state.event & PM_EVENT_SLEEP)) goto done; - acquire_console_sem(); + acquire_console_mutex(); chipsfb_blank(1, p); fb_set_suspend(p, 1); - release_console_sem(); + release_console_mutex(); done: pdev->dev.power.power_state = state; return 0; @@ -474,10 +474,10 @@ static int chipsfb_pci_resume(struct pci_dev *pdev) { struct fb_info *p = pci_get_drvdata(pdev); - acquire_console_sem(); + acquire_console_mutex(); 
fb_set_suspend(p, 0); chipsfb_blank(0, p); - release_console_sem(); + release_console_mutex(); pdev->dev.power.power_state = PMSG_ON; return 0; diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c index 3681c6a..19f6224 100644 --- a/drivers/video/console/fbcon.c +++ b/drivers/video/console/fbcon.c @@ -374,14 +374,14 @@ static void fb_flashcursor(struct work_struct *work) int c; int mode; - acquire_console_sem(); + acquire_console_mutex(); if (ops && ops->currcon != -1) vc = vc_cons[ops->currcon].d; if (!vc || !CON_IS_VISIBLE(vc) || registered_fb[con2fb_map[vc->vc_num]] != info || vc->vc_deccm != 1) { - release_console_sem(); + release_console_mutex(); return; } @@ -391,7 +391,7 @@ static void fb_flashcursor(struct work_struct *work) CM_ERASE : CM_DRAW; ops->cursor(vc, info, mode, softback_lines, get_color(vc, info, c, 1), get_color(vc, info, c, 0)); - release_console_sem(); + release_console_mutex(); } static void cursor_timer_handler(unsigned long dev_addr) @@ -835,7 +835,7 @@ static int set_con2fb_map(int unit, int newidx, int user) found = search_fb_in_map(newidx); - acquire_console_sem(); + acquire_console_mutex(); con2fb_map[unit] = newidx; if (!err && !found) err = con2fb_acquire_newinfo(vc, info, unit, oldidx); @@ -862,7 +862,7 @@ static int set_con2fb_map(int unit, int newidx, int user) if (!search_fb_in_map(info_idx)) info_idx = newidx; - release_console_sem(); + release_console_mutex(); return err; } @@ -3258,6 +3258,7 @@ static const struct consw fb_con = { .con_screen_pos = fbcon_screen_pos, .con_getxy = fbcon_getxy, .con_resize = fbcon_resize, + .con_preemptible = 1, }; static struct notifier_block fbcon_event_notifier = { @@ -3275,7 +3276,7 @@ static ssize_t store_rotate(struct device *device, if (fbcon_has_exited) return count; - acquire_console_sem(); + acquire_console_mutex(); idx = con2fb_map[fg_console]; if (idx == -1 || registered_fb[idx] == NULL) @@ -3285,7 +3286,7 @@ static ssize_t store_rotate(struct device *device, 
rotate = simple_strtoul(buf, last, 0); fbcon_rotate(info, rotate); err: - release_console_sem(); + release_console_mutex(); return count; } @@ -3300,7 +3301,7 @@ static ssize_t store_rotate_all(struct device *device, if (fbcon_has_exited) return count; - acquire_console_sem(); + acquire_console_mutex(); idx = con2fb_map[fg_console]; if (idx == -1 || registered_fb[idx] == NULL) @@ -3310,7 +3311,7 @@ static ssize_t store_rotate_all(struct device *device, rotate = simple_strtoul(buf, last, 0); fbcon_rotate_all(info, rotate); err: - release_console_sem(); + release_console_mutex(); return count; } @@ -3323,7 +3324,7 @@ static ssize_t show_rotate(struct device *device, if (fbcon_has_exited) return 0; - acquire_console_sem(); + acquire_console_mutex(); idx = con2fb_map[fg_console]; if (idx == -1 || registered_fb[idx] == NULL) @@ -3332,7 +3333,7 @@ static ssize_t show_rotate(struct device *device, info = registered_fb[idx]; rotate = fbcon_get_rotate(info); err: - release_console_sem(); + release_console_mutex(); return snprintf(buf, PAGE_SIZE, "%d\n", rotate); } @@ -3346,7 +3347,7 @@ static ssize_t show_cursor_blink(struct device *device, if (fbcon_has_exited) return 0; - acquire_console_sem(); + acquire_console_mutex(); idx = con2fb_map[fg_console]; if (idx == -1 || registered_fb[idx] == NULL) @@ -3360,7 +3361,7 @@ static ssize_t show_cursor_blink(struct device *device, blink = (ops->flags & FBCON_FLAGS_CURSOR_TIMER) ? 
1 : 0; err: - release_console_sem(); + release_console_mutex(); return snprintf(buf, PAGE_SIZE, "%d\n", blink); } @@ -3375,7 +3376,7 @@ static ssize_t store_cursor_blink(struct device *device, if (fbcon_has_exited) return count; - acquire_console_sem(); + acquire_console_mutex(); idx = con2fb_map[fg_console]; if (idx == -1 || registered_fb[idx] == NULL) @@ -3397,7 +3398,7 @@ static ssize_t store_cursor_blink(struct device *device, } err: - release_console_sem(); + release_console_mutex(); return count; } @@ -3436,7 +3437,7 @@ static void fbcon_start(void) if (num_registered_fb) { int i; - acquire_console_sem(); + acquire_console_mutex(); for (i = 0; i < FB_MAX; i++) { if (registered_fb[i] != NULL) { @@ -3445,7 +3446,7 @@ static void fbcon_start(void) } } - release_console_sem(); + release_console_mutex(); fbcon_takeover(0); } } @@ -3505,7 +3506,7 @@ static int __init fb_console_init(void) { int i; - acquire_console_sem(); + acquire_console_mutex(); fb_register_client(&fbcon_event_notifier); fbcon_device = device_create(fb_class, NULL, MKDEV(0, 0), NULL, "fbcon"); @@ -3521,7 +3522,7 @@ static int __init fb_console_init(void) for (i = 0; i < MAX_NR_CONSOLES; i++) con2fb_map[i] = -1; - release_console_sem(); + release_console_mutex(); fbcon_start(); return 0; } @@ -3544,12 +3545,12 @@ static void __exit fbcon_deinit_device(void) static void __exit fb_console_exit(void) { - acquire_console_sem(); + acquire_console_mutex(); fb_unregister_client(&fbcon_event_notifier); fbcon_deinit_device(); device_destroy(fb_class, MKDEV(0, 0)); fbcon_exit(); - release_console_sem(); + release_console_mutex(); unregister_con_driver(&fb_con); } diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c index cc4bbbe..7f29a2e 100644 --- a/drivers/video/console/vgacon.c +++ b/drivers/video/console/vgacon.c @@ -51,7 +51,7 @@ #include <video/vga.h> #include <asm/io.h> -static DEFINE_SPINLOCK(vga_lock); +static DEFINE_RAW_SPINLOCK(vga_lock); static int cursor_size_lastfrom; 
static int cursor_size_lastto; static u32 vgacon_xres; @@ -158,7 +158,7 @@ static inline void write_vga(unsigned char reg, unsigned int val) * ddprintk might set the console position from interrupt * handlers, thus the write has to be IRQ-atomic. */ - spin_lock_irqsave(&vga_lock, flags); + raw_spin_lock_irqsave(&vga_lock, flags); #ifndef SLOW_VGA v1 = reg + (val & 0xff00); @@ -171,7 +171,7 @@ static inline void write_vga(unsigned char reg, unsigned int val) outb_p(reg + 1, vga_video_port_reg); outb_p(val & 0xff, vga_video_port_val); #endif - spin_unlock_irqrestore(&vga_lock, flags); + raw_spin_unlock_irqrestore(&vga_lock, flags); } static inline void vga_set_mem_top(struct vc_data *c) @@ -668,7 +668,7 @@ static void vgacon_set_cursor_size(int xpos, int from, int to) cursor_size_lastfrom = from; cursor_size_lastto = to; - spin_lock_irqsave(&vga_lock, flags); + raw_spin_lock_irqsave(&vga_lock, flags); if (vga_video_type >= VIDEO_TYPE_VGAC) { outb_p(VGA_CRTC_CURSOR_START, vga_video_port_reg); curs = inb_p(vga_video_port_val); @@ -686,7 +686,7 @@ static void vgacon_set_cursor_size(int xpos, int from, int to) outb_p(curs, vga_video_port_val); outb_p(VGA_CRTC_CURSOR_END, vga_video_port_reg); outb_p(cure, vga_video_port_val); - spin_unlock_irqrestore(&vga_lock, flags); + raw_spin_unlock_irqrestore(&vga_lock, flags); } static void vgacon_cursor(struct vc_data *c, int mode) @@ -761,7 +761,7 @@ static int vgacon_doresize(struct vc_data *c, unsigned int scanlines = height * c->vc_font.height; u8 scanlines_lo = 0, r7 = 0, vsync_end = 0, mode, max_scan; - spin_lock_irqsave(&vga_lock, flags); + raw_spin_lock_irqsave(&vga_lock, flags); vgacon_xres = width * VGA_FONTWIDTH; vgacon_yres = height * c->vc_font.height; @@ -812,7 +812,7 @@ static int vgacon_doresize(struct vc_data *c, outb_p(vsync_end, vga_video_port_val); } - spin_unlock_irqrestore(&vga_lock, flags); + raw_spin_unlock_irqrestore(&vga_lock, flags); return 0; } @@ -895,11 +895,11 @@ static void vga_vesa_blank(struct 
vgastate *state, int mode) { /* save original values of VGA controller registers */ if (!vga_vesa_blanked) { - spin_lock_irq(&vga_lock); + raw_spin_lock_irq(&vga_lock); vga_state.SeqCtrlIndex = vga_r(state->vgabase, VGA_SEQ_I); vga_state.CrtCtrlIndex = inb_p(vga_video_port_reg); vga_state.CrtMiscIO = vga_r(state->vgabase, VGA_MIS_R); - spin_unlock_irq(&vga_lock); + raw_spin_unlock_irq(&vga_lock); outb_p(0x00, vga_video_port_reg); /* HorizontalTotal */ vga_state.HorizontalTotal = inb_p(vga_video_port_val); @@ -922,7 +922,7 @@ static void vga_vesa_blank(struct vgastate *state, int mode) /* assure that video is enabled */ /* "0x20" is VIDEO_ENABLE_bit in register 01 of sequencer */ - spin_lock_irq(&vga_lock); + raw_spin_lock_irq(&vga_lock); vga_wseq(state->vgabase, VGA_SEQ_CLOCK_MODE, vga_state.ClockingMode | 0x20); /* test for vertical retrace in process.... */ @@ -958,13 +958,13 @@ static void vga_vesa_blank(struct vgastate *state, int mode) /* restore both index registers */ vga_w(state->vgabase, VGA_SEQ_I, vga_state.SeqCtrlIndex); outb_p(vga_state.CrtCtrlIndex, vga_video_port_reg); - spin_unlock_irq(&vga_lock); + raw_spin_unlock_irq(&vga_lock); } static void vga_vesa_unblank(struct vgastate *state) { /* restore original values of VGA controller registers */ - spin_lock_irq(&vga_lock); + raw_spin_lock_irq(&vga_lock); vga_w(state->vgabase, VGA_MIS_W, vga_state.CrtMiscIO); outb_p(0x00, vga_video_port_reg); /* HorizontalTotal */ @@ -989,7 +989,7 @@ static void vga_vesa_unblank(struct vgastate *state) /* restore index/control registers */ vga_w(state->vgabase, VGA_SEQ_I, vga_state.SeqCtrlIndex); outb_p(vga_state.CrtCtrlIndex, vga_video_port_reg); - spin_unlock_irq(&vga_lock); + raw_spin_unlock_irq(&vga_lock); } static void vga_pal_blank(struct vgastate *state) @@ -1109,7 +1109,7 @@ static int vgacon_do_font_op(struct vgastate *state,char *arg,int set,int ch512) #endif unlock_kernel(); - spin_lock_irq(&vga_lock); + raw_spin_lock_irq(&vga_lock); /* First, the Sequencer 
*/ vga_wseq(state->vgabase, VGA_SEQ_RESET, 0x1); /* CPU writes only to map 2 */ @@ -1125,7 +1125,7 @@ static int vgacon_do_font_op(struct vgastate *state,char *arg,int set,int ch512) vga_wgfx(state->vgabase, VGA_GFX_MODE, 0x00); /* map start at A000:0000 */ vga_wgfx(state->vgabase, VGA_GFX_MISC, 0x00); - spin_unlock_irq(&vga_lock); + raw_spin_unlock_irq(&vga_lock); if (arg) { if (set) @@ -1152,7 +1152,7 @@ static int vgacon_do_font_op(struct vgastate *state,char *arg,int set,int ch512) } } - spin_lock_irq(&vga_lock); + raw_spin_lock_irq(&vga_lock); /* First, the sequencer, Synchronous reset */ vga_wseq(state->vgabase, VGA_SEQ_RESET, 0x01); /* CPU writes to maps 0 and 1 */ @@ -1191,7 +1191,7 @@ static int vgacon_do_font_op(struct vgastate *state,char *arg,int set,int ch512) inb_p(video_port_status); vga_wattr(state->vgabase, VGA_AR_ENABLE_DISPLAY, 0); } - spin_unlock_irq(&vga_lock); + raw_spin_unlock_irq(&vga_lock); lock_kernel(); return 0; } @@ -1217,26 +1217,26 @@ static int vgacon_adjust_height(struct vc_data *vc, unsigned fontheight) registers; they are write-only on EGA, but it appears that they are all don't care bits on EGA, so I guess it doesn't matter. 
*/ - spin_lock_irq(&vga_lock); + raw_spin_lock_irq(&vga_lock); outb_p(0x07, vga_video_port_reg); /* CRTC overflow register */ ovr = inb_p(vga_video_port_val); outb_p(0x09, vga_video_port_reg); /* Font size register */ fsr = inb_p(vga_video_port_val); - spin_unlock_irq(&vga_lock); + raw_spin_unlock_irq(&vga_lock); vde = maxscan & 0xff; /* Vertical display end reg */ ovr = (ovr & 0xbd) + /* Overflow register */ ((maxscan & 0x100) >> 7) + ((maxscan & 0x200) >> 3); fsr = (fsr & 0xe0) + (fontheight - 1); /* Font size register */ - spin_lock_irq(&vga_lock); + raw_spin_lock_irq(&vga_lock); outb_p(0x07, vga_video_port_reg); /* CRTC overflow register */ outb_p(ovr, vga_video_port_val); outb_p(0x09, vga_video_port_reg); /* Font size */ outb_p(fsr, vga_video_port_val); outb_p(0x12, vga_video_port_reg); /* Vertical display limit */ outb_p(vde, vga_video_port_val); - spin_unlock_irq(&vga_lock); + raw_spin_unlock_irq(&vga_lock); vga_video_font_height = fontheight; for (i = 0; i < MAX_NR_CONSOLES; i++) { diff --git a/drivers/video/da8xx-fb.c b/drivers/video/da8xx-fb.c index 369a5b3..d7a4419 100644 --- a/drivers/video/da8xx-fb.c +++ b/drivers/video/da8xx-fb.c @@ -964,14 +964,14 @@ static int fb_suspend(struct platform_device *dev, pm_message_t state) struct fb_info *info = platform_get_drvdata(dev); struct da8xx_fb_par *par = info->par; - acquire_console_sem(); + acquire_console_mutex(); if (par->panel_power_ctrl) par->panel_power_ctrl(0); fb_set_suspend(info, 1); lcd_disable_raster(); clk_disable(par->lcdc_clk); - release_console_sem(); + release_console_mutex(); return 0; } @@ -980,14 +980,14 @@ static int fb_resume(struct platform_device *dev) struct fb_info *info = platform_get_drvdata(dev); struct da8xx_fb_par *par = info->par; - acquire_console_sem(); + acquire_console_mutex(); if (par->panel_power_ctrl) par->panel_power_ctrl(1); clk_enable(par->lcdc_clk); lcd_enable_raster(); fb_set_suspend(info, 0); - release_console_sem(); + release_console_mutex(); return 0; } diff --git 
a/drivers/video/efifb.c b/drivers/video/efifb.c index eb12182..d25df51 100644 --- a/drivers/video/efifb.c +++ b/drivers/video/efifb.c @@ -161,8 +161,17 @@ static int efifb_setcolreg(unsigned regno, unsigned red, unsigned green, return 0; } +static void efifb_destroy(struct fb_info *info) +{ + if (info->screen_base) + iounmap(info->screen_base); + release_mem_region(info->aperture_base, info->aperture_size); + framebuffer_release(info); +} + static struct fb_ops efifb_ops = { .owner = THIS_MODULE, + .fb_destroy = efifb_destroy, .fb_setcolreg = efifb_setcolreg, .fb_fillrect = cfb_fillrect, .fb_copyarea = cfb_copyarea, @@ -281,7 +290,7 @@ static int __init efifb_probe(struct platform_device *dev) info->par = NULL; info->aperture_base = efifb_fix.smem_start; - info->aperture_size = size_total; + info->aperture_size = size_remap; info->screen_base = ioremap(efifb_fix.smem_start, efifb_fix.smem_len); if (!info->screen_base) { diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index 99bbd28..7537ae5 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -1054,11 +1054,11 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, return -EFAULT; if (!lock_fb_info(info)) return -ENODEV; - acquire_console_sem(); + acquire_console_mutex(); info->flags |= FBINFO_MISC_USEREVENT; ret = fb_set_var(info, &var); info->flags &= ~FBINFO_MISC_USEREVENT; - release_console_sem(); + release_console_mutex(); unlock_fb_info(info); if (!ret && copy_to_user(argp, &var, sizeof(var))) ret = -EFAULT; @@ -1090,9 +1090,9 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, return -EFAULT; if (!lock_fb_info(info)) return -ENODEV; - acquire_console_sem(); + acquire_console_mutex(); ret = fb_pan_display(info, &var); - release_console_sem(); + release_console_mutex(); unlock_fb_info(info); if (ret == 0 && copy_to_user(argp, &var, sizeof(var))) return -EFAULT; @@ -1137,11 +1137,11 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, case 
FBIOBLANK: if (!lock_fb_info(info)) return -ENODEV; - acquire_console_sem(); + acquire_console_mutex(); info->flags |= FBINFO_MISC_USEREVENT; ret = fb_blank(info, arg); info->flags &= ~FBINFO_MISC_USEREVENT; - release_console_sem(); + release_console_mutex(); unlock_fb_info(info); break; default: diff --git a/drivers/video/fbsysfs.c b/drivers/video/fbsysfs.c index d4a2c11..daf2d6d 100644 --- a/drivers/video/fbsysfs.c +++ b/drivers/video/fbsysfs.c @@ -88,11 +88,11 @@ static int activate(struct fb_info *fb_info, struct fb_var_screeninfo *var) int err; var->activate |= FB_ACTIVATE_FORCE; - acquire_console_sem(); + acquire_console_mutex(); fb_info->flags |= FBINFO_MISC_USEREVENT; err = fb_set_var(fb_info, var); fb_info->flags &= ~FBINFO_MISC_USEREVENT; - release_console_sem(); + release_console_mutex(); if (err) return err; return 0; @@ -173,7 +173,7 @@ static ssize_t store_modes(struct device *device, if (i * sizeof(struct fb_videomode) != count) return -EINVAL; - acquire_console_sem(); + acquire_console_mutex(); list_splice(&fb_info->modelist, &old_list); fb_videomode_to_modelist((const struct fb_videomode *)buf, i, &fb_info->modelist); @@ -183,7 +183,7 @@ static ssize_t store_modes(struct device *device, } else fb_destroy_modelist(&old_list); - release_console_sem(); + release_console_mutex(); return 0; } @@ -299,11 +299,11 @@ static ssize_t store_blank(struct device *device, char *last = NULL; int err; - acquire_console_sem(); + acquire_console_mutex(); fb_info->flags |= FBINFO_MISC_USEREVENT; err = fb_blank(fb_info, simple_strtoul(buf, &last, 0)); fb_info->flags &= ~FBINFO_MISC_USEREVENT; - release_console_sem(); + release_console_mutex(); if (err < 0) return err; return count; @@ -362,9 +362,9 @@ static ssize_t store_pan(struct device *device, return -EINVAL; var.yoffset = simple_strtoul(last, &last, 0); - acquire_console_sem(); + acquire_console_mutex(); err = fb_pan_display(fb_info, &var); - release_console_sem(); + release_console_mutex(); if (err < 0) return 
err; @@ -397,9 +397,9 @@ static ssize_t store_fbstate(struct device *device, state = simple_strtoul(buf, &last, 0); - acquire_console_sem(); + acquire_console_mutex(); fb_set_suspend(fb_info, (int)state); - release_console_sem(); + release_console_mutex(); return count; } diff --git a/drivers/video/geode/gxfb_core.c b/drivers/video/geode/gxfb_core.c index b3e639d..8fcad99 100644 --- a/drivers/video/geode/gxfb_core.c +++ b/drivers/video/geode/gxfb_core.c @@ -342,10 +342,10 @@ static int gxfb_suspend(struct pci_dev *pdev, pm_message_t state) struct fb_info *info = pci_get_drvdata(pdev); if (state.event == PM_EVENT_SUSPEND) { - acquire_console_sem(); + acquire_console_mutex(); gx_powerdown(info); fb_set_suspend(info, 1); - release_console_sem(); + release_console_mutex(); } /* there's no point in setting PCI states; we emulate PCI, so @@ -359,7 +359,7 @@ static int gxfb_resume(struct pci_dev *pdev) struct fb_info *info = pci_get_drvdata(pdev); int ret; - acquire_console_sem(); + acquire_console_mutex(); ret = gx_powerup(info); if (ret) { printk(KERN_ERR "gxfb: power up failed!\n"); @@ -367,7 +367,7 @@ static int gxfb_resume(struct pci_dev *pdev) } fb_set_suspend(info, 0); - release_console_sem(); + release_console_mutex(); return 0; } #endif diff --git a/drivers/video/geode/lxfb_core.c b/drivers/video/geode/lxfb_core.c index 889cbe3..63fe664 100644 --- a/drivers/video/geode/lxfb_core.c +++ b/drivers/video/geode/lxfb_core.c @@ -464,10 +464,10 @@ static int lxfb_suspend(struct pci_dev *pdev, pm_message_t state) struct fb_info *info = pci_get_drvdata(pdev); if (state.event == PM_EVENT_SUSPEND) { - acquire_console_sem(); + acquire_console_mutex(); lx_powerdown(info); fb_set_suspend(info, 1); - release_console_sem(); + release_console_mutex(); } /* there's no point in setting PCI states; we emulate PCI, so @@ -481,7 +481,7 @@ static int lxfb_resume(struct pci_dev *pdev) struct fb_info *info = pci_get_drvdata(pdev); int ret; - acquire_console_sem(); + 
acquire_console_mutex(); ret = lx_powerup(info); if (ret) { printk(KERN_ERR "lxfb: power up failed!\n"); @@ -489,7 +489,7 @@ static int lxfb_resume(struct pci_dev *pdev) } fb_set_suspend(info, 0); - release_console_sem(); + release_console_mutex(); return 0; } #else diff --git a/drivers/video/i810/i810_main.c b/drivers/video/i810/i810_main.c index 5743ea2..7105d0a 100644 --- a/drivers/video/i810/i810_main.c +++ b/drivers/video/i810/i810_main.c @@ -1574,7 +1574,7 @@ static int i810fb_suspend(struct pci_dev *dev, pm_message_t mesg) return 0; } - acquire_console_sem(); + acquire_console_mutex(); fb_set_suspend(info, 1); if (info->fbops->fb_sync) @@ -1587,7 +1587,7 @@ static int i810fb_suspend(struct pci_dev *dev, pm_message_t mesg) pci_save_state(dev); pci_disable_device(dev); pci_set_power_state(dev, pci_choose_state(dev, mesg)); - release_console_sem(); + release_console_mutex(); return 0; } @@ -1605,7 +1605,7 @@ static int i810fb_resume(struct pci_dev *dev) return 0; } - acquire_console_sem(); + acquire_console_mutex(); pci_set_power_state(dev, PCI_D0); pci_restore_state(dev); @@ -1621,7 +1621,7 @@ static int i810fb_resume(struct pci_dev *dev) fb_set_suspend (info, 0); info->fbops->fb_blank(VESA_NO_BLANKING, info); fail: - release_console_sem(); + release_console_mutex(); return 0; } /*********************************************************************** diff --git a/drivers/video/mx3fb.c b/drivers/video/mx3fb.c index 772ba3f..9271c37 100644 --- a/drivers/video/mx3fb.c +++ b/drivers/video/mx3fb.c @@ -1175,9 +1175,9 @@ static int mx3fb_suspend(struct platform_device *pdev, pm_message_t state) struct mx3fb_data *mx3fb = platform_get_drvdata(pdev); struct mx3fb_info *mx3_fbi = mx3fb->fbi->par; - acquire_console_sem(); + acquire_console_mutex(); fb_set_suspend(mx3fb->fbi, 1); - release_console_sem(); + release_console_mutex(); if (mx3_fbi->blank == FB_BLANK_UNBLANK) { sdc_disable_channel(mx3_fbi); @@ -1200,9 +1200,9 @@ static int mx3fb_resume(struct platform_device 
*pdev) sdc_set_brightness(mx3fb, mx3fb->backlight_level); } - acquire_console_sem(); + acquire_console_mutex(); fb_set_suspend(mx3fb->fbi, 0); - release_console_sem(); + release_console_mutex(); return 0; } diff --git a/drivers/video/nvidia/nvidia.c b/drivers/video/nvidia/nvidia.c index efe10ff..faaaaf2 100644 --- a/drivers/video/nvidia/nvidia.c +++ b/drivers/video/nvidia/nvidia.c @@ -1057,7 +1057,7 @@ static int nvidiafb_suspend(struct pci_dev *dev, pm_message_t mesg) if (mesg.event == PM_EVENT_PRETHAW) mesg.event = PM_EVENT_FREEZE; - acquire_console_sem(); + acquire_console_mutex(); par->pm_state = mesg.event; if (mesg.event & PM_EVENT_SLEEP) { @@ -1070,7 +1070,7 @@ static int nvidiafb_suspend(struct pci_dev *dev, pm_message_t mesg) } dev->dev.power.power_state = mesg; - release_console_sem(); + release_console_mutex(); return 0; } @@ -1079,7 +1079,7 @@ static int nvidiafb_resume(struct pci_dev *dev) struct fb_info *info = pci_get_drvdata(dev); struct nvidia_par *par = info->par; - acquire_console_sem(); + acquire_console_mutex(); pci_set_power_state(dev, PCI_D0); if (par->pm_state != PM_EVENT_FREEZE) { @@ -1097,7 +1097,7 @@ static int nvidiafb_resume(struct pci_dev *dev) nvidiafb_blank(FB_BLANK_UNBLANK, info); fail: - release_console_sem(); + release_console_mutex(); return 0; } #else diff --git a/drivers/video/ps3fb.c b/drivers/video/ps3fb.c index 9c0144e..3023ebd 100644 --- a/drivers/video/ps3fb.c +++ b/drivers/video/ps3fb.c @@ -513,9 +513,9 @@ static int ps3fb_release(struct fb_info *info, int user) if (atomic_dec_and_test(&ps3fb.f_count)) { if (atomic_read(&ps3fb.ext_flip)) { atomic_set(&ps3fb.ext_flip, 0); - if (!try_acquire_console_sem()) { + if (!try_acquire_console_mutex()) { ps3fb_sync(info, 0); /* single buffer */ - release_console_sem(); + release_console_mutex(); } } } @@ -830,14 +830,14 @@ static int ps3fb_ioctl(struct fb_info *info, unsigned int cmd, if (vmode) { var = info->var; fb_videomode_to_var(&var, vmode); - acquire_console_sem(); + 
acquire_console_mutex(); info->flags |= FBINFO_MISC_USEREVENT; /* Force, in case only special bits changed */ var.activate |= FB_ACTIVATE_FORCE; par->new_mode_id = val; retval = fb_set_var(info, &var); info->flags &= ~FBINFO_MISC_USEREVENT; - release_console_sem(); + release_console_mutex(); } break; } @@ -881,9 +881,9 @@ static int ps3fb_ioctl(struct fb_info *info, unsigned int cmd, break; dev_dbg(info->device, "PS3FB_IOCTL_FSEL:%d\n", val); - acquire_console_sem(); + acquire_console_mutex(); retval = ps3fb_sync(info, val); - release_console_sem(); + release_console_mutex(); break; default: @@ -903,9 +903,9 @@ static int ps3fbd(void *arg) set_current_state(TASK_INTERRUPTIBLE); if (ps3fb.is_kicked) { ps3fb.is_kicked = 0; - acquire_console_sem(); + acquire_console_mutex(); ps3fb_sync(info, 0); /* single buffer */ - release_console_sem(); + release_console_mutex(); } schedule(); } diff --git a/drivers/video/s3fb.c b/drivers/video/s3fb.c index c3fad34..fcd948a 100644 --- a/drivers/video/s3fb.c +++ b/drivers/video/s3fb.c @@ -23,7 +23,7 @@ #include <linux/svga.h> #include <linux/init.h> #include <linux/pci.h> -#include <linux/console.h> /* Why should fb driver call console functions? because acquire_console_sem() */ +#include <linux/console.h> /* Why should fb driver call console functions? 
because acquire_console_mutex() */ #include <video/vga.h> #ifdef CONFIG_MTRR @@ -1054,12 +1054,12 @@ static int s3_pci_suspend(struct pci_dev* dev, pm_message_t state) dev_info(info->device, "suspend\n"); - acquire_console_sem(); + acquire_console_mutex(); mutex_lock(&(par->open_lock)); if ((state.event == PM_EVENT_FREEZE) || (par->ref_count == 0)) { mutex_unlock(&(par->open_lock)); - release_console_sem(); + release_console_mutex(); return 0; } @@ -1070,7 +1070,7 @@ static int s3_pci_suspend(struct pci_dev* dev, pm_message_t state) pci_set_power_state(dev, pci_choose_state(dev, state)); mutex_unlock(&(par->open_lock)); - release_console_sem(); + release_console_mutex(); return 0; } @@ -1086,12 +1086,12 @@ static int s3_pci_resume(struct pci_dev* dev) dev_info(info->device, "resume\n"); - acquire_console_sem(); + acquire_console_mutex(); mutex_lock(&(par->open_lock)); if (par->ref_count == 0) { mutex_unlock(&(par->open_lock)); - release_console_sem(); + release_console_mutex(); return 0; } @@ -1100,7 +1100,7 @@ static int s3_pci_resume(struct pci_dev* dev) err = pci_enable_device(dev); if (err) { mutex_unlock(&(par->open_lock)); - release_console_sem(); + release_console_mutex(); dev_err(info->device, "error %d enabling device for resume\n", err); return err; } @@ -1110,7 +1110,7 @@ static int s3_pci_resume(struct pci_dev* dev) fb_set_suspend(info, 0); mutex_unlock(&(par->open_lock)); - release_console_sem(); + release_console_mutex(); return 0; } diff --git a/drivers/video/savage/savagefb_driver.c b/drivers/video/savage/savagefb_driver.c index 842d157..d4dc4eb 100644 --- a/drivers/video/savage/savagefb_driver.c +++ b/drivers/video/savage/savagefb_driver.c @@ -2373,7 +2373,7 @@ static int savagefb_suspend(struct pci_dev *dev, pm_message_t mesg) if (mesg.event == PM_EVENT_FREEZE) return 0; - acquire_console_sem(); + acquire_console_mutex(); fb_set_suspend(info, 1); if (info->fbops->fb_sync) @@ -2385,7 +2385,7 @@ static int savagefb_suspend(struct pci_dev *dev, 
pm_message_t mesg) pci_save_state(dev); pci_disable_device(dev); pci_set_power_state(dev, pci_choose_state(dev, mesg)); - release_console_sem(); + release_console_mutex(); return 0; } @@ -2409,7 +2409,7 @@ static int savagefb_resume(struct pci_dev* dev) return 0; } - acquire_console_sem(); + acquire_console_mutex(); pci_set_power_state(dev, PCI_D0); pci_restore_state(dev); @@ -2423,7 +2423,7 @@ static int savagefb_resume(struct pci_dev* dev) savagefb_set_par(info); fb_set_suspend(info, 0); savagefb_blank(FB_BLANK_UNBLANK, info); - release_console_sem(); + release_console_mutex(); return 0; } diff --git a/drivers/video/sm501fb.c b/drivers/video/sm501fb.c index 35370d0..be001e3 100644 --- a/drivers/video/sm501fb.c +++ b/drivers/video/sm501fb.c @@ -2010,9 +2010,9 @@ static int sm501fb_suspend_fb(struct sm501fb_info *info, /* tell console/fb driver we are suspending */ - acquire_console_sem(); + acquire_console_mutex(); fb_set_suspend(fbi, 1); - release_console_sem(); + release_console_mutex(); /* backup copies in case chip is powered down over suspend */ @@ -2069,9 +2069,9 @@ static void sm501fb_resume_fb(struct sm501fb_info *info, memcpy_toio(par->cursor.k_addr, par->store_cursor, par->cursor.size); - acquire_console_sem(); + acquire_console_mutex(); fb_set_suspend(fbi, 0); - release_console_sem(); + release_console_mutex(); vfree(par->store_fb); vfree(par->store_cursor); diff --git a/drivers/video/tmiofb.c b/drivers/video/tmiofb.c index 6913fe1..77843ed 100644 --- a/drivers/video/tmiofb.c +++ b/drivers/video/tmiofb.c @@ -25,7 +25,7 @@ #include <linux/fb.h> #include <linux/interrupt.h> #include <linux/delay.h> -/* Why should fb driver call console functions? because acquire_console_sem() */ +/* Why should fb driver call console functions? 
because acquire_console_mutex() */ #include <linux/console.h> #include <linux/mfd/core.h> #include <linux/mfd/tmio.h> @@ -944,7 +944,7 @@ static int tmiofb_suspend(struct platform_device *dev, pm_message_t state) struct mfd_cell *cell = dev->dev.platform_data; int retval = 0; - acquire_console_sem(); + acquire_console_mutex(); fb_set_suspend(info, 1); @@ -965,7 +965,7 @@ static int tmiofb_suspend(struct platform_device *dev, pm_message_t state) if (cell->suspend) retval = cell->suspend(dev); - release_console_sem(); + release_console_mutex(); return retval; } @@ -976,7 +976,7 @@ static int tmiofb_resume(struct platform_device *dev) struct mfd_cell *cell = dev->dev.platform_data; int retval = 0; - acquire_console_sem(); + acquire_console_mutex(); if (cell->resume) { retval = cell->resume(dev); @@ -992,7 +992,7 @@ static int tmiofb_resume(struct platform_device *dev) fb_set_suspend(info, 0); out: - release_console_sem(); + release_console_mutex(); return retval; } #else diff --git a/drivers/video/vt8623fb.c b/drivers/video/vt8623fb.c index 65ccd21..ba4ae70 100644 --- a/drivers/video/vt8623fb.c +++ b/drivers/video/vt8623fb.c @@ -24,7 +24,7 @@ #include <linux/svga.h> #include <linux/init.h> #include <linux/pci.h> -#include <linux/console.h> /* Why should fb driver call console functions? because acquire_console_sem() */ +#include <linux/console.h> /* Why should fb driver call console functions? 
because acquire_console_mutex() */ #include <video/vga.h> #ifdef CONFIG_MTRR @@ -818,12 +818,12 @@ static int vt8623_pci_suspend(struct pci_dev* dev, pm_message_t state) dev_info(info->device, "suspend\n"); - acquire_console_sem(); + acquire_console_mutex(); mutex_lock(&(par->open_lock)); if ((state.event == PM_EVENT_FREEZE) || (par->ref_count == 0)) { mutex_unlock(&(par->open_lock)); - release_console_sem(); + release_console_mutex(); return 0; } @@ -834,7 +834,7 @@ static int vt8623_pci_suspend(struct pci_dev* dev, pm_message_t state) pci_set_power_state(dev, pci_choose_state(dev, state)); mutex_unlock(&(par->open_lock)); - release_console_sem(); + release_console_mutex(); return 0; } @@ -849,7 +849,7 @@ static int vt8623_pci_resume(struct pci_dev* dev) dev_info(info->device, "resume\n"); - acquire_console_sem(); + acquire_console_mutex(); mutex_lock(&(par->open_lock)); if (par->ref_count == 0) @@ -868,7 +868,7 @@ static int vt8623_pci_resume(struct pci_dev* dev) fail: mutex_unlock(&(par->open_lock)); - release_console_sem(); + release_console_mutex(); return 0; } diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c index 603598f..dd22e0c 100644 --- a/drivers/video/xen-fbfront.c +++ b/drivers/video/xen-fbfront.c @@ -490,12 +490,12 @@ xenfb_make_preferred_console(void) if (console_set_on_cmdline) return; - acquire_console_sem(); + acquire_console_mutex(); for (c = console_drivers; c; c = c->next) { if (!strcmp(c->name, "tty") && c->index == 0) break; } - release_console_sem(); + release_console_mutex(); if (c) { unregister_console(c); c->flags |= CON_CONSDEV; diff --git a/drivers/watchdog/bfin_wdt.c b/drivers/watchdog/bfin_wdt.c index c7b3f9d..2159e66 100644 --- a/drivers/watchdog/bfin_wdt.c +++ b/drivers/watchdog/bfin_wdt.c @@ -1,9 +1,8 @@ /* * Blackfin On-Chip Watchdog Driver - * Supports BF53[123]/BF53[467]/BF54[2489]/BF561 * * Originally based on softdog.c - * Copyright 2006-2007 Analog Devices Inc. + * Copyright 2006-2010 Analog Devices Inc. 
* Copyright 2006-2007 Michele d'Amico * Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk> * @@ -137,13 +136,15 @@ static int bfin_wdt_running(void) */ static int bfin_wdt_set_timeout(unsigned long t) { - u32 cnt; + u32 cnt, max_t, sclk; unsigned long flags; - stampit(); + sclk = get_sclk(); + max_t = -1 / sclk; + cnt = t * sclk; + stamp("maxtimeout=%us newtimeout=%lus (cnt=%#x)", max_t, t, cnt); - cnt = t * get_sclk(); - if (cnt < get_sclk()) { + if (t > max_t) { printk(KERN_WARNING PFX "timeout value is too large\n"); return -EINVAL; } diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 0e40caa..e76128a 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -50,9 +50,8 @@ struct affs_ext_key { */ struct affs_inode_info { atomic_t i_opencnt; - struct semaphore i_link_lock; /* Protects internal inode access. */ - struct semaphore i_ext_lock; /* Protects internal inode access. */ -#define i_hash_lock i_ext_lock + struct mutex i_link_lock; /* Protects internal inode access. */ + struct mutex i_ext_lock; /* Protects internal inode access. 
*/ u32 i_blkcnt; /* block count */ u32 i_extcnt; /* extended block count */ u32 *i_lc; /* linear cache of extended blocks */ @@ -275,30 +274,23 @@ affs_adjust_bitmapchecksum(struct buffer_head *bh, u32 val) static inline void affs_lock_link(struct inode *inode) { - down(&AFFS_I(inode)->i_link_lock); + mutex_lock(&AFFS_I(inode)->i_link_lock); } static inline void affs_unlock_link(struct inode *inode) { - up(&AFFS_I(inode)->i_link_lock); -} -static inline void -affs_lock_dir(struct inode *inode) -{ - down(&AFFS_I(inode)->i_hash_lock); -} -static inline void -affs_unlock_dir(struct inode *inode) -{ - up(&AFFS_I(inode)->i_hash_lock); + mutex_unlock(&AFFS_I(inode)->i_link_lock); } static inline void affs_lock_ext(struct inode *inode) { - down(&AFFS_I(inode)->i_ext_lock); + mutex_lock(&AFFS_I(inode)->i_ext_lock); } static inline void affs_unlock_ext(struct inode *inode) { - up(&AFFS_I(inode)->i_ext_lock); + mutex_unlock(&AFFS_I(inode)->i_ext_lock); } + +#define affs_lock_dir(i) affs_lock_ext(i) +#define affs_unlock_dir(i) affs_unlock_ext(i) diff --git a/fs/affs/super.c b/fs/affs/super.c index d41e967..f010759 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -113,8 +113,8 @@ static void init_once(void *foo) { struct affs_inode_info *ei = (struct affs_inode_info *) foo; - init_MUTEX(&ei->i_link_lock); - init_MUTEX(&ei->i_ext_lock); + mutex_init(&ei->i_link_lock); + mutex_init(&ei->i_ext_lock); inode_init_once(&ei->vfs_inode); } diff --git a/fs/attr.c b/fs/attr.c index 96d394b..b3f0527 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -206,7 +206,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr) return error; if (ia_valid & ATTR_SIZE) - down_write(&dentry->d_inode->i_alloc_sem); + anon_down_write(&dentry->d_inode->i_alloc_sem); if (inode->i_op && inode->i_op->setattr) { error = inode->i_op->setattr(dentry, attr); @@ -223,7 +223,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr) } if (ia_valid & ATTR_SIZE) - 
up_write(&dentry->d_inode->i_alloc_sem); + anon_up_write(&dentry->d_inode->i_alloc_sem); if (!error) fsnotify_change(dentry, ia_valid); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9d08096..6ed434a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -720,13 +720,15 @@ again: inode->i_ino, orig_offset); BUG_ON(ret); } - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); if (del_nr == 0) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); btrfs_mark_buffer_dirty(leaf); } else { + fi = btrfs_item_ptr(leaf, del_slot - 1, + struct btrfs_file_extent_item); btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); btrfs_set_file_extent_num_bytes(leaf, fi, diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 1c36e5c..4b3d18a 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -94,6 +94,7 @@ static int btrfs_spin_on_block(struct extent_buffer *eb) */ int btrfs_try_spin_lock(struct extent_buffer *eb) { +#ifndef CONFIG_PREEMPT_RT int i; if (btrfs_spin_on_block(eb)) { @@ -113,6 +114,7 @@ int btrfs_try_spin_lock(struct extent_buffer *eb) return 1; spin_unlock(&eb->lock); } +#endif return 0; } diff --git a/fs/buffer.c b/fs/buffer.c index 6fa5302..b34323c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -40,7 +40,6 @@ #include <linux/cpu.h> #include <linux/bitops.h> #include <linux/mpage.h> -#include <linux/bit_spinlock.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -330,8 +329,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) * decide that the page is now completely done. 
*/ first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; @@ -344,8 +342,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } while (tmp != bh); - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); /* * If none of the buffers had errors and they are all @@ -357,8 +354,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); return; } @@ -393,8 +389,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) } first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_write(bh); unlock_buffer(bh); @@ -406,14 +401,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); end_page_writeback(page); return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); return; } EXPORT_SYMBOL(end_buffer_async_write); @@ -3268,6 +3261,8 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); if (ret) { INIT_LIST_HEAD(&ret->b_assoc_buffers); + spin_lock_init(&ret->b_uptodate_lock); + spin_lock_init(&ret->b_state_lock); get_cpu_var(bh_accounting).nr++; recalc_bh_state(); put_cpu_var(bh_accounting); @@ -3279,6 +3274,8 @@ 
EXPORT_SYMBOL(alloc_buffer_head); void free_buffer_head(struct buffer_head *bh) { BUG_ON(!list_empty(&bh->b_assoc_buffers)); + BUG_ON(spin_is_locked(&bh->b_uptodate_lock)); + BUG_ON(spin_is_locked(&bh->b_state_lock)); kmem_cache_free(bh_cachep, bh); get_cpu_var(bh_accounting).nr--; recalc_bh_state(); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 14ac480..eeb4986 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -348,7 +348,17 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, dir = dget_parent(object->dentry); mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); - ret = cachefiles_bury_object(cache, dir, object->dentry); + + /* we need to check that our parent is _still_ our parent - it may have + * been renamed */ + if (dir == object->dentry->d_parent) { + ret = cachefiles_bury_object(cache, dir, object->dentry); + } else { + /* it got moved, presumably by cachefilesd culling it, so it's + * no longer in the key path and we can ignore it */ + mutex_unlock(&dir->d_inode->i_mutex); + ret = 0; + } dput(dir); _leave(" = %d", ret); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ed751bb..a1c817e 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -205,7 +205,7 @@ struct cifsUidInfo { struct cifsSesInfo { struct list_head smb_ses_list; struct list_head tcon_list; - struct semaphore sesSem; + struct mutex session_mutex; #if 0 struct cifsUidInfo *uidInfo; /* pointer to user info */ #endif diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 941441d..63de4d6 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -170,19 +170,19 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command) * need to prevent multiple threads trying to simultaneously * reconnect the same SMB session */ - down(&ses->sesSem); + mutex_lock(&ses->session_mutex); if (ses->need_reconnect) rc = cifs_setup_session(0, ses, nls_codepage); /* do we need to reconnect tcon? 
*/ if (rc || !tcon->need_reconnect) { - up(&ses->sesSem); + mutex_unlock(&ses->session_mutex); goto out; } mark_open_files_invalid(tcon); rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); - up(&ses->sesSem); + mutex_unlock(&ses->session_mutex); cFYI(1, ("reconnect tcon rc = %d", rc)); if (rc) @@ -700,13 +700,13 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses) if (!ses || !ses->server) return -EIO; - down(&ses->sesSem); + mutex_lock(&ses->session_mutex); if (ses->need_reconnect) goto session_already_dead; /* no need to send SMBlogoff if uid already closed due to reconnect */ rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB); if (rc) { - up(&ses->sesSem); + mutex_unlock(&ses->session_mutex); return rc; } @@ -721,7 +721,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses) pSMB->AndXCommand = 0xFF; rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0); session_already_dead: - up(&ses->sesSem); + mutex_unlock(&ses->session_mutex); /* if session dead then we do not need to do ulogoff, since server closed smb session, no sense reporting diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2e9e09c..45eb6cb 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2388,13 +2388,13 @@ try_mount_again: */ cifs_put_tcp_session(srvTcp); - down(&pSesInfo->sesSem); + mutex_lock(&pSesInfo->session_mutex); if (pSesInfo->need_reconnect) { cFYI(1, ("Session needs reconnect")); rc = cifs_setup_session(xid, pSesInfo, cifs_sb->local_nls); } - up(&pSesInfo->sesSem); + mutex_unlock(&pSesInfo->session_mutex); } else if (!rc) { cFYI(1, ("Existing smb sess not found")); pSesInfo = sesInfoAlloc(); @@ -2437,12 +2437,12 @@ try_mount_again: } pSesInfo->linux_uid = volume_info->linux_uid; pSesInfo->overrideSecFlg = volume_info->secFlg; - down(&pSesInfo->sesSem); + mutex_lock(&pSesInfo->session_mutex); /* BB FIXME need to pass vol->secFlgs BB */ rc = cifs_setup_session(xid, pSesInfo, cifs_sb->local_nls); - up(&pSesInfo->sesSem); + 
mutex_unlock(&pSesInfo->session_mutex); } /* search for existing tcon to this server share */ diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index d27d4ec..d147499 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -79,7 +79,7 @@ sesInfoAlloc(void) ++ret_buf->ses_count; INIT_LIST_HEAD(&ret_buf->smb_ses_list); INIT_LIST_HEAD(&ret_buf->tcon_list); - init_MUTEX(&ret_buf->sesSem); + mutex_init(&ret_buf->session_mutex); } return ret_buf; } diff --git a/fs/dcache.c b/fs/dcache.c index 953173a..116fd33 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -727,8 +727,9 @@ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; - if (down_read_trylock(&sb->s_umount)) - BUG(); +// -rt: the trylock below might succeed on -rt, so this sanity check is disabled: +// if (down_read_trylock(&sb->s_umount)) +// BUG(); dentry = sb->s_root; sb->s_root = NULL; diff --git a/fs/direct-io.c b/fs/direct-io.c index e82adc2..45ee3f6 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -242,7 +242,7 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) if (dio->flags & DIO_LOCKING) /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); + anon_up_read_non_owner(&dio->inode->i_alloc_sem); if (ret == 0) ret = dio->page_errors; @@ -1184,7 +1184,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, * Will be released at I/O completion, possibly in a * different thread. 
*/ - down_read_non_owner(&inode->i_alloc_sem); + anon_down_read_non_owner(&inode->i_alloc_sem); } /* diff --git a/fs/exec.c b/fs/exec.c index e95c692..2bcb327 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -55,6 +55,7 @@ #include <linux/fsnotify.h> #include <linux/fs_struct.h> #include <linux/pipe_fs_i.h> +#include <linux/delay.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -502,7 +503,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) unsigned long length = old_end - old_start; unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; - struct mmu_gather *tlb; + struct mmu_gather tlb; BUG_ON(new_start > new_end); @@ -527,12 +528,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) return -ENOMEM; lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); + tlb_gather_mmu(&tlb, mm, 0); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. */ - free_pgd_range(tlb, new_end, old_end, new_end, + free_pgd_range(&tlb, new_end, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : 0); } else { /* @@ -541,10 +542,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * have constraints on va-space that make this illegal (IA64) - * for the others its just a little faster. */ - free_pgd_range(tlb, old_start, old_end, new_end, + free_pgd_range(&tlb, old_start, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : 0); } - tlb_finish_mmu(tlb, new_end, old_end); + tlb_finish_mmu(&tlb, new_end, old_end); /* * shrink the vma to just the new range. @@ -637,7 +638,6 @@ int setup_arg_pages(struct linux_binprm *bprm, * will align it up. 
*/ rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK; - rlim_stack = min(rlim_stack, stack_size); #ifdef CONFIG_STACK_GROWSUP if (stack_size + stack_expand > rlim_stack) stack_base = vma->vm_start + rlim_stack; @@ -735,10 +735,12 @@ static int exec_mmap(struct mm_struct *mm) } } task_lock(tsk); + local_irq_disable(); active_mm = tsk->active_mm; + activate_mm(active_mm, mm); tsk->mm = mm; tsk->active_mm = mm; - activate_mm(active_mm, mm); + local_irq_enable(); task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e119524..89dbb38 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5768,7 +5768,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) * Get i_alloc_sem to stop truncates messing with the inode. We cannot * get i_mutex because we are already holding mmap_sem. */ - down_read(&inode->i_alloc_sem); + anon_down_read(&inode->i_alloc_sem); size = i_size_read(inode); if (page->mapping != mapping || size <= page_offset(page) || !PageUptodate(page)) { @@ -5818,6 +5818,6 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) out_unlock: if (ret) ret = VM_FAULT_SIGBUS; - up_read(&inode->i_alloc_sem); + anon_up_read(&inode->i_alloc_sem); return ret; } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 14da530..5c5e1cd 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -202,9 +202,9 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block) sector_t blocknr; /* fat_get_cluster() assumes the requested blocknr isn't truncated. 
*/ - down_read(&mapping->host->i_alloc_sem); + anon_down_read(&mapping->host->i_alloc_sem); blocknr = generic_block_bmap(mapping, block, fat_get_block); - up_read(&mapping->host->i_alloc_sem); + anon_up_read(&mapping->host->i_alloc_sem); return blocknr; } diff --git a/fs/file.c b/fs/file.c index 87e1290..b08281f 100644 --- a/fs/file.c +++ b/fs/file.c @@ -103,14 +103,15 @@ void free_fdtable_rcu(struct rcu_head *rcu) kfree(fdt->open_fds); kfree(fdt); } else { - fddef = &get_cpu_var(fdtable_defer_list); + + fddef = &per_cpu(fdtable_defer_list, raw_smp_processor_id()); + spin_lock(&fddef->lock); fdt->next = fddef->next; fddef->next = fdt; /* vmallocs are handled from the workqueue context */ schedule_work(&fddef->wq); spin_unlock(&fddef->lock); - put_cpu_var(fdtable_defer_list); } } diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c index 4129cdb..571abe9 100644 --- a/fs/hfs/bfind.c +++ b/fs/hfs/bfind.c @@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); - down(&tree->tree_lock); + mutex_lock(&tree->tree_lock); return 0; } @@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd) hfs_bnode_put(fd->bnode); kfree(fd->search_key); dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); - up(&fd->tree->tree_lock); + mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 052f214..edf44d7 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -26,7 +26,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke if (!tree) return NULL; - init_MUTEX(&tree->tree_lock); + mutex_init(&tree->tree_lock); spin_lock_init(&tree->hash_lock); /* Set the correct compare function */ tree->sb = sb; diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h index cc51905..2a1d712 100644 --- a/fs/hfs/btree.h +++ 
b/fs/hfs/btree.h @@ -33,7 +33,7 @@ struct hfs_btree { unsigned int depth; //unsigned int map1_size, map_size; - struct semaphore tree_lock; + struct mutex tree_lock; unsigned int pages_per_bnode; spinlock_t hash_lock; diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index 5007a41..68c7983 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); - down(&tree->tree_lock); + mutex_lock(&tree->tree_lock); return 0; } @@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd) hfs_bnode_put(fd->bnode); kfree(fd->search_key); dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); - up(&fd->tree->tree_lock); + mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index e49fcee..aa5fcb3 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) if (!tree) return NULL; - init_MUTEX(&tree->tree_lock); + mutex_init(&tree->tree_lock); spin_lock_init(&tree->hash_lock); tree->sb = sb; tree->cnid = id; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 5c10d80..d15f35e 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -62,7 +62,7 @@ struct hfs_btree { unsigned int depth; //unsigned int map1_size, map_size; - struct semaphore tree_lock; + struct mutex tree_lock; unsigned int pages_per_bnode; spinlock_t hash_lock; diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index b6fca54..8f4141c 100644 --- a/fs/hpfs/buffer.c +++ b/fs/hpfs/buffer.c @@ -13,7 +13,7 @@ void hpfs_lock_creation(struct super_block *s) #ifdef DEBUG_LOCKS printk("lock creation\n"); #endif - down(&hpfs_sb(s)->hpfs_creation_de); + mutex_lock(&hpfs_sb(s)->hpfs_creation_de); } 
void hpfs_unlock_creation(struct super_block *s) @@ -21,7 +21,7 @@ void hpfs_unlock_creation(struct super_block *s) #ifdef DEBUG_LOCKS printk("unlock creation\n"); #endif - up(&hpfs_sb(s)->hpfs_creation_de); + mutex_unlock(&hpfs_sb(s)->hpfs_creation_de); } /* Map a sector into a buffer and return pointers to it and to the buffer. */ diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 701ca54..ed222f5 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -87,7 +87,7 @@ struct hpfs_sb_info { unsigned *sb_bmp_dir; /* main bitmap directory */ unsigned sb_c_bitmap; /* current bitmap */ unsigned sb_max_fwd_alloc; /* max forwad allocation */ - struct semaphore hpfs_creation_de; /* when creating dirents, nobody else + struct mutex hpfs_creation_de; /* when creating dirents, nobody else can alloc blocks */ /*unsigned sb_mounting : 1;*/ int sb_timeshift; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index cadc4ce..e13f103 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -486,7 +486,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) sbi->sb_bmp_dir = NULL; sbi->sb_cp_table = NULL; - init_MUTEX(&sbi->hpfs_creation_de); + mutex_init(&sbi->hpfs_creation_de); uid = current_uid(); gid = current_gid(); diff --git a/fs/inode.c b/fs/inode.c index 03dfeb2..ce10948 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -162,7 +162,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mutex_init(&inode->i_mutex); lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); - init_rwsem(&inode->i_alloc_sem); + init_anon_rwsem(&inode->i_alloc_sem); lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); mapping->a_ops = &empty_aops; diff --git a/fs/ioprio.c b/fs/ioprio.c index c7c0b28..82c40a2 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -230,6 +230,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) if (!user) break; + rcu_read_lock(); do_each_thread(g, p) { if (__task_cred(p)->uid != user->uid) continue; @@ -241,6 
+242,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) else ret = ioprio_best(ret, tmpio); } while_each_thread(g, p); + rcu_read_unlock(); if (who) free_uid(user); diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 006f9ad..873f8e5 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1577,7 +1577,7 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh) transaction_t *transaction; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); transaction = jh->b_transaction; if (transaction) assert_spin_locked(&transaction->t_journal->j_list_lock); @@ -2029,7 +2029,7 @@ void __journal_file_buffer(struct journal_head *jh, int was_dirty = 0; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); assert_spin_locked(&transaction->t_journal->j_list_lock); J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); @@ -2123,7 +2123,7 @@ void __journal_refile_buffer(struct journal_head *jh) int was_dirty; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); if (jh->b_transaction) assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index a051270..1de640d 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1452,7 +1452,7 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) transaction_t *transaction; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); transaction = jh->b_transaction; if (transaction) assert_spin_locked(&transaction->t_journal->j_list_lock); @@ -1884,7 +1884,7 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, int was_dirty = 0; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + 
J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); assert_spin_locked(&transaction->t_journal->j_list_lock); J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); @@ -1972,7 +1972,7 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) int was_dirty; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); if (jh->b_transaction) assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); diff --git a/fs/namei.c b/fs/namei.c index d62fdc8..a4855af 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -823,6 +823,17 @@ fail: } /* + * This is a temporary kludge to deal with "automount" symlinks; proper + * solution is to trigger them on follow_mount(), so that do_lookup() + * would DTRT. To be killed before 2.6.34-final. + */ +static inline int follow_on_final(struct inode *inode, unsigned lookup_flags) +{ + return inode && unlikely(inode->i_op->follow_link) && + ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode)); +} + +/* * Name resolution. * This is the basic name resolution function, turning a pathname into * the final dentry. We expect 'base' to be positive and a directory. @@ -942,8 +953,7 @@ last_component: if (err) break; inode = next.dentry->d_inode; - if ((lookup_flags & LOOKUP_FOLLOW) - && inode && inode->i_op->follow_link) { + if (follow_on_final(inode, lookup_flags)) { err = do_follow_link(&next, nd); if (err) goto return_err; diff --git a/fs/namespace.c b/fs/namespace.c index c768f73..962fd96 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -264,8 +264,16 @@ int mnt_want_write(struct vfsmount *mnt) * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); - while (mnt->mnt_flags & MNT_WRITE_HOLD) - cpu_relax(); + preempt_enable(); + /* + * HACK ALERT: on RT we cannot spin here with cpu_relax() and + * preemption disabled, so we block on the vfsmount lock, which is + * held by mnt_make_readonly(). Works on !RT as well. 
+ */ + while (mnt->mnt_flags & MNT_WRITE_HOLD) { + spin_lock(&vfsmount_lock); + spin_unlock(&vfsmount_lock); + } /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will * be set to match its requirements. So we must not load that until @@ -273,12 +281,11 @@ int mnt_want_write(struct vfsmount *mnt) */ smp_rmb(); if (__mnt_is_readonly(mnt)) { + preempt_disable(); dec_mnt_writers(mnt); + preempt_enable(); ret = -EROFS; - goto out; } -out: - preempt_enable(); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e1d415e..0d28982 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -342,6 +342,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->res.fattr = &data->fattr; data->res.eof = 0; data->res.count = bytes; + nfs_fattr_init(&data->fattr); msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; @@ -575,6 +576,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); NFS_PROTO(data->inode)->commit_setup(data, &msg); @@ -766,6 +768,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->res.fattr = &data->fattr; data->res.count = bytes; data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); task_setup_data.task = &data->task; task_setup_data.callback_data = data; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 97d79ef..8715d19 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -752,7 +752,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, flags, current_cred()); if (IS_ERR(*filp)) host_err = PTR_ERR(*filp); - host_err = ima_file_check(*filp, access); + else + host_err = ima_file_check(*filp, access); out_nfserr: err = nfserrno(host_err); out: diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 06713ff..e7ed4cd 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -509,7 +509,7 @@ 
nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, spin_lock_init(&inode->i_lock); mutex_init(&inode->i_mutex); - init_rwsem(&inode->i_alloc_sem); + init_anon_rwsem(&inode->i_alloc_sem); mapping->host = NULL; /* instead of inode */ mapping->flags = 0; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index cfce53c..2d38d1d 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -29,6 +29,7 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/bit_spinlock.h> +#include <linux/interrupt.h> #include "aops.h" #include "attrib.h" @@ -107,8 +108,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) "0x%llx.", (unsigned long long)bh->b_blocknr); } first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; @@ -123,8 +123,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } while (tmp != bh); - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); /* * If none of the buffers had errors then we can set the page uptodate, * but we first have to perform the post read mst fixups, if the @@ -145,13 +144,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) recs = PAGE_CACHE_SIZE / rec_size; /* Should have been verified before we got here... 
*/ BUG_ON(!recs); - local_irq_save(flags); + local_irq_save_nort(flags); kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ); for (i = 0; i < recs; i++) post_read_mst_fixup((NTFS_RECORD*)(kaddr + i * rec_size), rec_size); kunmap_atomic(kaddr, KM_BIO_SRC_IRQ); - local_irq_restore(flags); + local_irq_restore_nort(flags); flush_dcache_page(page); if (likely(page_uptodate && !PageError(page))) SetPageUptodate(page); @@ -159,8 +158,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) unlock_page(page); return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); return; } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 43179dd..b3d9d74 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1845,9 +1845,9 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, * fails again. */ if (unlikely(NInoTruncateFailed(ni))) { - down_write(&vi->i_alloc_sem); + anon_down_write(&vi->i_alloc_sem); err = ntfs_truncate(vi); - up_write(&vi->i_alloc_sem); + anon_up_write(&vi->i_alloc_sem); if (err || NInoTruncateFailed(ni)) { if (!err) err = -EIO; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 7e9df11..5d55344 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -620,7 +620,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, level = ocfs2_iocb_rw_locked_level(iocb); if (!level) - up_read(&inode->i_alloc_sem); + anon_up_read(&inode->i_alloc_sem); ocfs2_rw_unlock(inode, level); } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 558ce03..4394eec 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1934,7 +1934,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, relock: /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ if (direct_io) { - down_read(&inode->i_alloc_sem); + anon_down_read(&inode->i_alloc_sem); have_alloc_sem = 1; } @@ -1961,7 +1961,7 @@ relock: */ if (direct_io && !can_do_direct) { ocfs2_rw_unlock(inode, 
rw_level); - up_read(&inode->i_alloc_sem); + anon_up_read(&inode->i_alloc_sem); have_alloc_sem = 0; rw_level = -1; @@ -2054,7 +2054,7 @@ out: out_sems: if (have_alloc_sem) - up_read(&inode->i_alloc_sem); + anon_up_read(&inode->i_alloc_sem); mutex_unlock(&inode->i_mutex); @@ -2203,7 +2203,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, * need locks to protect pending reads from racing with truncate. */ if (filp->f_flags & O_DIRECT) { - down_read(&inode->i_alloc_sem); + anon_down_read(&inode->i_alloc_sem); have_alloc_sem = 1; ret = ocfs2_rw_lock(inode, 0); @@ -2247,7 +2247,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, bail: if (have_alloc_sem) - up_read(&inode->i_alloc_sem); + anon_up_read(&inode->i_alloc_sem); if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); mlog_exit(ret); diff --git a/fs/pipe.c b/fs/pipe.c index 37ba29f..a9dcf21 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -434,8 +434,14 @@ redo: wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } + /* + * Hack: we turn off atime updates for -RT kernels. + * Who uses them on pipes anyway? + */ +#ifndef CONFIG_PREEMPT_RT if (ret > 0) file_accessed(filp); +#endif return ret; } @@ -607,8 +613,14 @@ out: wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } + /* + * Hack: we turn off mtime/ctime updates for -RT kernels. + * Who uses them on pipes anyway? + */ +#ifndef CONFIG_PREEMPT_RT if (ret > 0) file_update_time(filp); +#endif return ret; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 13b5d07..768d3e2 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -129,21 +129,23 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) /* * The task state array is a strange "bitmap" of - * reasons to sleep. Thus "running" is zero, and - * you can test for combinations of others with + * reasons to sleep. 
Thus, the first element is zero, + * and you can test for combinations of others with * simple bit tests. */ +#define TASK_STATE_X(num) TASK_STATE_##num " (" DESCR_TASK_STATE_##num ")" static const char *task_state_array[] = { - "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "t (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)", /* 32 */ - "x (dead)", /* 64 */ - "K (wakekill)", /* 128 */ - "W (waking)", /* 256 */ + TASK_STATE_X(0), + TASK_STATE_X(1), + TASK_STATE_X(2), + TASK_STATE_X(4), + TASK_STATE_X(8), + TASK_STATE_X(16), + TASK_STATE_X(32), + TASK_STATE_X(64), + TASK_STATE_X(128), + TASK_STATE_X(256), + TASK_STATE_X(512) }; static inline const char *get_task_state(struct task_struct *tsk) @@ -270,7 +272,9 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) blocked = p->blocked; collect_sigign_sigcatch(p, &ignored, &caught); num_threads = atomic_read(&p->signal->count); + rcu_read_lock(); qsize = atomic_read(&__task_cred(p)->user->sigpending); + rcu_read_unlock(); qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; unlock_task_sighand(p, &flags); } @@ -337,6 +341,18 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) seq_printf(m, "\n"); } +#define get_blocked_on(t) (-1) + +static inline void show_blocked_on(struct seq_file *m, struct task_struct *p) +{ + pid_t pid = get_blocked_on(p); + + if (pid < 0) + return; + + seq_printf(m, "BlckOn: %d\n", pid); +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -357,6 +373,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_show_regs(m, task); #endif task_context_switch_counts(m, task); + show_blocked_on(m, task); return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index e42bbd8..58324c2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2369,16 +2369,30 @@ static void *proc_self_follow_link(struct dentry 
*dentry, struct nameidata *nd) { struct pid_namespace *ns = dentry->d_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); - char tmp[PROC_NUMBUF]; - if (!tgid) - return ERR_PTR(-ENOENT); - sprintf(tmp, "%d", task_tgid_nr_ns(current, ns)); - return ERR_PTR(vfs_follow_link(nd,tmp)); + char *name = ERR_PTR(-ENOENT); + if (tgid) { + name = __getname(); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d", tgid); + } + nd_set_link(nd, name); + return NULL; +} + +static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + __putname(s); } static const struct inode_operations proc_self_inode_operations = { .readlink = proc_self_readlink, .follow_link = proc_self_follow_link, + .put_link = proc_self_put_link, }; /* diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f277c4a..da4409e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -138,8 +138,10 @@ static void *m_start(struct seq_file *m, loff_t *pos) vma = NULL; if ((unsigned long)l < mm->map_count) { vma = mm->mmap; - while (l-- && vma) + while (l-- && vma) { vma = vma->vm_next; + cond_resched(); + } goto out; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 9087b10..2df0f5c 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1497,9 +1497,11 @@ struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key) args.objectid = key->on_disk_key.k_objectid; args.dirid = key->on_disk_key.k_dir_id; + reiserfs_write_unlock(s); inode = iget5_locked(s, key->on_disk_key.k_objectid, reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); + reiserfs_write_lock(s); if (!inode) return ERR_PTR(-ENOMEM); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 81f09fa..75f4c0b 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -563,11 +563,11 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, reiserfs_write_unlock(inode->i_sb); 
mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); - down_write(&dentry->d_inode->i_alloc_sem); + anon_down_write(&dentry->d_inode->i_alloc_sem); reiserfs_write_lock(inode->i_sb); err = reiserfs_setattr(dentry, &newattrs); - up_write(&dentry->d_inode->i_alloc_sem); + anon_up_write(&dentry->d_inode->i_alloc_sem); mutex_unlock(&dentry->d_inode->i_mutex); } else update_ctime(inode); diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 1c4c8f0..6d68729 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -530,7 +530,7 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) server->mnt = NULL; server->sock_file = NULL; init_waitqueue_head(&server->conn_wq); - init_MUTEX(&server->sem); + mutex_init(&server->mutex); INIT_LIST_HEAD(&server->entry); INIT_LIST_HEAD(&server->xmitq); INIT_LIST_HEAD(&server->recvq); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 220b758..6a06a1d 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -81,24 +81,23 @@ int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr) if (!sd_attrs) return -ENOMEM; sd->s_iattr = sd_attrs; - } else { - /* attributes were changed at least once in past */ - iattrs = &sd_attrs->ia_iattr; - - if (ia_valid & ATTR_UID) - iattrs->ia_uid = iattr->ia_uid; - if (ia_valid & ATTR_GID) - iattrs->ia_gid = iattr->ia_gid; - if (ia_valid & ATTR_ATIME) - iattrs->ia_atime = iattr->ia_atime; - if (ia_valid & ATTR_MTIME) - iattrs->ia_mtime = iattr->ia_mtime; - if (ia_valid & ATTR_CTIME) - iattrs->ia_ctime = iattr->ia_ctime; - if (ia_valid & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - iattrs->ia_mode = sd->s_mode = mode; - } + } + /* attributes were changed at least once in past */ + iattrs = &sd_attrs->ia_iattr; + + if (ia_valid & ATTR_UID) + iattrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + iattrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + iattrs->ia_atime = iattr->ia_atime; + if (ia_valid & ATTR_MTIME) + iattrs->ia_mtime = iattr->ia_mtime; 
+ if (ia_valid & ATTR_CTIME) + iattrs->ia_ctime = iattr->ia_ctime; + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + iattrs->ia_mode = sd->s_mode = mode; } return 0; } diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h index ff6a198..77de2c2 100644 --- a/fs/xfs/linux-2.6/mrlock.h +++ b/fs/xfs/linux-2.6/mrlock.h @@ -21,7 +21,7 @@ #include <linux/rwsem.h> typedef struct { - struct rw_semaphore mr_lock; + struct rw_anon_semaphore mr_lock; #ifdef DEBUG int mr_writer; #endif @@ -29,10 +29,10 @@ typedef struct { #ifdef DEBUG #define mrinit(mrp, name) \ - do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) + do { (mrp)->mr_writer = 0; init_anon_rwsem(&(mrp)->mr_lock); } while (0) #else #define mrinit(mrp, name) \ - do { init_rwsem(&(mrp)->mr_lock); } while (0) + do { init_anon_rwsem(&(mrp)->mr_lock); } while (0) #endif #define mrlock_init(mrp, t,n,s) mrinit(mrp, n) @@ -40,12 +40,12 @@ typedef struct { static inline void mraccess_nested(mrlock_t *mrp, int subclass) { - down_read_nested(&mrp->mr_lock, subclass); + anon_down_read_nested(&mrp->mr_lock, subclass); } static inline void mrupdate_nested(mrlock_t *mrp, int subclass) { - down_write_nested(&mrp->mr_lock, subclass); + anon_down_write_nested(&mrp->mr_lock, subclass); #ifdef DEBUG mrp->mr_writer = 1; #endif @@ -53,12 +53,12 @@ static inline void mrupdate_nested(mrlock_t *mrp, int subclass) static inline int mrtryaccess(mrlock_t *mrp) { - return down_read_trylock(&mrp->mr_lock); + return anon_down_read_trylock(&mrp->mr_lock); } static inline int mrtryupdate(mrlock_t *mrp) { - if (!down_write_trylock(&mrp->mr_lock)) + if (!anon_down_write_trylock(&mrp->mr_lock)) return 0; #ifdef DEBUG mrp->mr_writer = 1; @@ -71,12 +71,12 @@ static inline void mrunlock_excl(mrlock_t *mrp) #ifdef DEBUG mrp->mr_writer = 0; #endif - up_write(&mrp->mr_lock); + anon_up_write(&mrp->mr_lock); } static inline void mrunlock_shared(mrlock_t *mrp) { - up_read(&mrp->mr_lock); + 
anon_up_read(&mrp->mr_lock); } static inline void mrdemote(mrlock_t *mrp) @@ -84,7 +84,7 @@ static inline void mrdemote(mrlock_t *mrp) #ifdef DEBUG mrp->mr_writer = 0; #endif - downgrade_write(&mrp->mr_lock); + anon_downgrade_write(&mrp->mr_lock); } #endif /* __XFS_SUPPORT_MRLOCK_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 77b8be8..ddd2fc2 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -236,7 +236,7 @@ _xfs_buf_initialize( init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_list); INIT_LIST_HEAD(&bp->b_hash_list); - init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ + sema_init(&bp->b_sema, 0); /* held, no waiters */ XB_SET_OWNER(bp); bp->b_target = target; bp->b_file_offset = range_base; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 77414db..5eddd6f 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1110,7 +1110,7 @@ xfs_fs_clear_inode( * (and basically indicate what we are doing), we explicitly * re-init the iolock here. */ - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + ASSERT(!anon_rwsem_is_locked(&ip->i_iolock.mr_lock)); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); xfs_inactive(ip); diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 275b1f4..8c5422a 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2270,7 +2270,7 @@ xfs_alloc_vextent( * These three force us into a single a.g. 
*/ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); args->pag = &mp->m_perag[args->agno]; args->minleft = 0; error = xfs_alloc_fix_freelist(args, 0); @@ -2280,14 +2280,14 @@ xfs_alloc_vextent( goto error0; } if (!args->agbp) { - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); trace_xfs_alloc_vextent_noagbp(args); break; } args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); if ((error = xfs_alloc_ag_vextent(args))) goto error0; - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); break; case XFS_ALLOCTYPE_START_BNO: /* @@ -2339,7 +2339,7 @@ xfs_alloc_vextent( * Loop over allocation groups twice; first time with * trylock set, second time without. */ - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); for (;;) { args->pag = &mp->m_perag[args->agno]; if (no_min) args->minleft = 0; @@ -2401,7 +2401,7 @@ xfs_alloc_vextent( } } } - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) { if (args->agno == sagno) mp->m_agfrotor = (mp->m_agfrotor + 1) % @@ -2429,7 +2429,7 @@ xfs_alloc_vextent( } return 0; error0: - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); return error; } @@ -2454,7 +2454,7 @@ xfs_free_extent( args.agno = XFS_FSB_TO_AGNO(args.mp, bno); ASSERT(args.agno < args.mp->m_sb.sb_agcount); args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); - down_read(&args.mp->m_peraglock); + anon_down_read(&args.mp->m_peraglock); args.pag = &args.mp->m_perag[args.agno]; if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) goto error0; @@ -2465,7 +2465,7 @@ xfs_free_extent( #endif error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); error0: - up_read(&args.mp->m_peraglock); + anon_up_read(&args.mp->m_peraglock); return error; } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 98251cd..3326ac8 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2629,13 +2629,13 @@ 
xfs_bmap_btalloc( if (startag == NULLAGNUMBER) startag = ag = 0; notinit = 0; - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); while (blen < ap->alen) { pag = &mp->m_perag[ag]; if (!pag->pagf_init && (error = xfs_alloc_pagf_init(mp, args.tp, ag, XFS_ALLOC_FLAG_TRYLOCK))) { - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); return error; } /* @@ -2668,7 +2668,7 @@ xfs_bmap_btalloc( error = xfs_filestream_new_ag(ap, &ag); if (error) { - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); return error; } @@ -2682,7 +2682,7 @@ xfs_bmap_btalloc( if (ag == startag) break; } - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); /* * Since the above loop did a BUF_TRYLOCK, it is * possible that there is space for this request. diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index a631e14..1dbe404 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -526,7 +526,7 @@ xfs_filestream_associate( mp = pip->i_mount; cache = mp->m_filestream; - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); /* * We have a problem, Houston. 
@@ -544,7 +544,7 @@ xfs_filestream_associate( * So, if we can't get the iolock without sleeping then just give up */ if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) { - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); return 1; } @@ -601,7 +601,7 @@ exit_did_pick: exit: xfs_iunlock(pip, XFS_IOLOCK_EXCL); - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); return -err; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a13919a..ce3baff 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -177,14 +177,14 @@ xfs_growfs_data_private( if (!new_perag) return XFS_ERROR(ENOMEM); - down_write(&mp->m_peraglock); + anon_down_write(&mp->m_peraglock); memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount); old_perag = mp->m_perag; mp->m_perag = new_perag; mp->m_flags |= XFS_MOUNT_32BITINODES; nagimax = xfs_initialize_perag(mp, nagcount); - up_write(&mp->m_peraglock); + anon_up_write(&mp->m_peraglock); kmem_free(old_perag); } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index cb907ba..303a3fe 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -382,9 +382,9 @@ xfs_ialloc_ag_alloc( newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); - down_read(&args.mp->m_peraglock); + anon_down_read(&args.mp->m_peraglock); args.mp->m_perag[agno].pagi_freecount += newlen; - up_read(&args.mp->m_peraglock); + anon_up_read(&args.mp->m_peraglock); agi->agi_newino = cpu_to_be32(newino); /* @@ -486,7 +486,7 @@ xfs_ialloc_ag_select( */ agno = pagno; flags = XFS_ALLOC_FLAG_TRYLOCK; - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); for (;;) { pag = &mp->m_perag[agno]; if (!pag->pagi_init) { @@ -527,7 +527,7 @@ xfs_ialloc_ag_select( agbp = NULL; goto nextag; } - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); return agbp; } } @@ -540,7 +540,7 @@ nextag: * down. 
*/ if (XFS_FORCED_SHUTDOWN(mp)) { - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); return NULL; } agno++; @@ -548,7 +548,7 @@ nextag: agno = 0; if (agno == pagno) { if (flags == 0) { - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); return NULL; } flags = 0; @@ -771,13 +771,13 @@ nextag: *inop = NULLFSINO; return noroom ? ENOSPC : 0; } - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); if (mp->m_perag[tagno].pagi_inodeok == 0) { - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); goto nextag; } error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); if (error) goto nextag; agi = XFS_BUF_TO_AGI(agbp); @@ -1006,9 +1006,9 @@ alloc_inode: goto error0; be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); mp->m_perag[tagno].pagi_freecount--; - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); error = xfs_check_agi_freecount(cur, agi); if (error) @@ -1088,9 +1088,9 @@ xfs_difree( /* * Get the allocation group header. */ - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); if (error) { cmn_err(CE_WARN, "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. 
Returning error.", @@ -1157,9 +1157,9 @@ xfs_difree( be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); mp->m_perag[agno].pagi_freecount -= ilen - 1; - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); @@ -1188,9 +1188,9 @@ xfs_difree( */ be32_add_cpu(&agi->agi_freecount, 1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); mp->m_perag[agno].pagi_freecount++; - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } @@ -1312,9 +1312,9 @@ xfs_imap( xfs_buf_t *agbp; /* agi buffer */ int i; /* temp state */ - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); if (error) { xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " "xfs_ialloc_read_agi() returned " diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 62efab2..521bc20 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -420,9 +420,9 @@ xfs_bulkstat( while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { cond_resched(); bp = NULL; - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); if (error) { /* * Skip this allocation group and go to the next one. 
@@ -849,9 +849,9 @@ xfs_inumbers( agbp = NULL; while (left > 0 && agno < mp->m_sb.sb_agcount) { if (agbp == NULL) { - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); if (error) { /* * If we can't read the AGI of this ag, diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index eb403b4..95997a6 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1152,7 +1152,7 @@ xfs_mountfs( /* * Allocate and initialize the per-ag data. */ - init_rwsem(&mp->m_peraglock); + init_anon_rwsem(&mp->m_peraglock); mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_MAYFAIL); if (!mp->m_perag) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1df7e45..0088311 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -208,7 +208,7 @@ typedef struct xfs_mount { uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ uint m_in_maxlevels; /* max inobt btree levels. 
*/ struct xfs_perag *m_perag; /* per-ag accounting info */ - struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ + struct rw_anon_semaphore m_peraglock; /* lock for m_perag (pointer) */ struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ diff --git a/include/acpi/acpiosxf.h b/include/acpi/acpiosxf.h index eb0e718..2b83d67 100644 --- a/include/acpi/acpiosxf.h +++ b/include/acpi/acpiosxf.h @@ -61,7 +61,7 @@ typedef enum { OSL_EC_BURST_HANDLER } acpi_execute_type; -#define ACPI_NO_UNIT_LIMIT ((u32) -1) +#define ACPI_NO_UNIT_LIMIT (INT_MAX/2) #define ACPI_MUTEX_SEM 1 /* Functions for acpi_os_signal */ diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 18c435d..a4141a5 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -3,6 +3,10 @@ #include <linux/compiler.h> +#ifndef __ASSEMBLY__ +extern void __WARN_ON(const char *func, const char *file, const int line); +#endif /* __ASSEMBLY__ */ + #ifdef CONFIG_BUG #ifdef CONFIG_GENERIC_BUG @@ -141,4 +145,18 @@ extern void warn_slowpath_null(const char *file, const int line); # define WARN_ON_SMP(x) do { } while (0) #endif +#ifdef CONFIG_PREEMPT_RT +# define BUG_ON_RT(c) BUG_ON(c) +# define BUG_ON_NONRT(c) do { } while (0) +# define WARN_ON_RT(condition) WARN_ON(condition) +# define WARN_ON_NONRT(condition) do { } while (0) +# define WARN_ON_ONCE_NONRT(condition) do { } while (0) +#else +# define BUG_ON_RT(c) do { } while (0) +# define BUG_ON_NONRT(c) BUG_ON(c) +# define WARN_ON_RT(condition) do { } while (0) +# define WARN_ON_NONRT(condition) WARN_ON(condition) +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition) +#endif + #endif diff --git a/include/asm-generic/cmpxchg-local.h b/include/asm-generic/cmpxchg-local.h index b2ba2fc..9793123 100644 --- a/include/asm-generic/cmpxchg-local.h +++ b/include/asm-generic/cmpxchg-local.h @@ -20,7 +20,7 @@ static inline unsigned long 
__cmpxchg_local_generic(volatile void *ptr, if (size == 8 && sizeof(unsigned long) != 8) wrong_size_cmpxchg(ptr); - local_irq_save(flags); + raw_local_irq_save(flags); switch (size) { case 1: prev = *(u8 *)ptr; if (prev == old) @@ -41,7 +41,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr, default: wrong_size_cmpxchg(ptr); } - local_irq_restore(flags); + raw_local_irq_restore(flags); return prev; } @@ -54,11 +54,11 @@ static inline u64 __cmpxchg64_local_generic(volatile void *ptr, u64 prev; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); prev = *(u64 *)ptr; if (prev == old) *(u64 *)ptr = new; - local_irq_restore(flags); + raw_local_irq_restore(flags); return prev; } diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 8087b90..12589bb 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -5,6 +5,9 @@ #include <linux/threads.h> #include <linux/percpu-defs.h> +#define __per_cpu_var_lock(var) per_cpu__lock_##var##_locked +#define __per_cpu_var_lock_var(var) per_cpu__##var##_locked + #ifdef CONFIG_SMP /* @@ -59,6 +62,14 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; #define this_cpu_ptr(ptr) SHIFT_PERCPU_PTR(ptr, my_cpu_offset) #define __this_cpu_ptr(ptr) SHIFT_PERCPU_PTR(ptr, __my_cpu_offset) +#define per_cpu_lock(var, cpu) \ + (*SHIFT_PERCPU_PTR(&__per_cpu_var_lock(var), per_cpu_offset(cpu))) +#define per_cpu_var_locked(var, cpu) \ + (*SHIFT_PERCPU_PTR(&__per_cpu_var_lock_var(var), per_cpu_offset(cpu))) +#define __get_cpu_lock(var, cpu) \ + per_cpu_lock(var, cpu) +#define __get_cpu_var_locked(var, cpu) \ + per_cpu_var_locked(var, cpu) #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA extern void setup_per_cpu_areas(void); @@ -67,6 +78,12 @@ extern void setup_per_cpu_areas(void); #else /* ! 
SMP */ #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu_var(var))) +#define per_cpu_var_locked(var, cpu) \ + (*((void)(cpu), &__per_cpu_var_lock_var(var))) +#define __get_cpu_var(var) per_cpu_var(var) +#define __raw_get_cpu_var(var) per_cpu_var(var) +#define __get_cpu_lock(var, cpu) __per_cpu_var_lock(var) +#define __get_cpu_var_locked(var, cpu) __per_cpu_var_lock_var(var) #define __get_cpu_var(var) per_cpu_var(var) #define __raw_get_cpu_var(var) per_cpu_var(var) #define this_cpu_ptr(ptr) per_cpu_ptr(ptr, 0) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index e43f976..30f998d 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -22,14 +22,8 @@ * and page free order so much.. */ #ifdef CONFIG_SMP - #ifdef ARCH_FREE_PTR_NR - #define FREE_PTR_NR ARCH_FREE_PTR_NR - #else - #define FREE_PTE_NR 506 - #endif #define tlb_fast_mode(tlb) ((tlb)->nr == ~0U) #else - #define FREE_PTE_NR 1 #define tlb_fast_mode(tlb) 1 #endif @@ -39,30 +33,48 @@ struct mmu_gather { struct mm_struct *mm; unsigned int nr; /* set to ~0U means fast mode */ + unsigned int max; /* nr < max */ unsigned int need_flush;/* Really unmapped some ptes? */ unsigned int fullmm; /* non-zero means full mm flush */ - struct page * pages[FREE_PTE_NR]; +#ifdef HAVE_ARCH_MMU_GATHER + struct arch_mmu_gather arch; +#endif + struct page ** pages; + struct page * local[8]; }; -/* Users of the generic TLB shootdown code must declare this storage space. */ -DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); +static inline void __tlb_alloc_pages(struct mmu_gather *tlb) +{ + unsigned long addr = __get_free_pages(GFP_ATOMIC, 0); + + if (addr) { + tlb->pages = (void *)addr; + tlb->max = PAGE_SIZE / sizeof(struct page *); + } +} /* tlb_gather_mmu * Return a pointer to an initialized struct mmu_gather. 
*/ -static inline struct mmu_gather * -tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) +static inline void +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush) { - struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); - tlb->mm = mm; - /* Use fast mode if only one CPU is online */ - tlb->nr = num_online_cpus() > 1 ? 0U : ~0U; + tlb->max = ARRAY_SIZE(tlb->local); + tlb->pages = tlb->local; + + if (num_online_cpus() > 1) { + tlb->nr = 0; + __tlb_alloc_pages(tlb); + } else /* Use fast mode if only one CPU is online */ + tlb->nr = ~0U; tlb->fullmm = full_mm_flush; - return tlb; +#ifdef HAVE_ARCH_MMU_GATHER + tlb->arch = ARCH_MMU_GATHER_INIT; +#endif } static inline void @@ -75,6 +87,8 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) if (!tlb_fast_mode(tlb)) { free_pages_and_swap_cache(tlb->pages, tlb->nr); tlb->nr = 0; + if (tlb->pages == tlb->local) + __tlb_alloc_pages(tlb); } } @@ -90,7 +104,8 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) /* keep the page table cache within bounds */ check_pgt_cache(); - put_cpu_var(mmu_gathers); + if (tlb->pages != tlb->local) + free_pages((unsigned long)tlb->pages, 0); } /* tlb_remove_page @@ -106,7 +121,7 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) return; } tlb->pages[tlb->nr++] = page; - if (tlb->nr >= FREE_PTE_NR) + if (tlb->nr >= tlb->max) tlb_flush_mmu(tlb, 0, 0); } diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index ab94335..6816be6 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -1,5 +1,9 @@ /* - * linux/include/asm-arm/hardware/amba.h + * linux/include/amba/bus.h + * + * This device type deals with ARM PrimeCells and anything else that + * presents a proper CID (0xB105F00D) at the end of the I/O register + * region or that is derived from a PrimeCell. * * Copyright (C) 2003 Deep Blue Solutions Ltd, All Rights Reserved. 
* diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index 27b1bcf..37dc356 100644 --- a/include/linux/bottom_half.h +++ b/include/linux/bottom_half.h @@ -1,9 +1,17 @@ #ifndef _LINUX_BH_H #define _LINUX_BH_H +#ifdef CONFIG_PREEMPT_HARDIRQS +# define local_bh_disable() do { } while (0) +# define __local_bh_disable(ip) do { } while (0) +# define _local_bh_enable() do { } while (0) +# define local_bh_enable() do { } while (0) +# define local_bh_enable_ip(ip) do { } while (0) +#else extern void local_bh_disable(void); extern void _local_bh_enable(void); extern void local_bh_enable(void); extern void local_bh_enable_ip(unsigned long ip); +#endif #endif /* _LINUX_BH_H */ diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 16ed028..a7a7491 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -21,10 +21,6 @@ enum bh_state_bits { BH_Dirty, /* Is dirty */ BH_Lock, /* Is locked */ BH_Req, /* Has been submitted for I/O */ - BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise - * IO completion of other buffers in the page - */ - BH_Mapped, /* Has a disk mapping */ BH_New, /* Disk mapping was newly created by get_block */ BH_Async_Read, /* Is under end_buffer_async_read I/O */ @@ -74,6 +70,8 @@ struct buffer_head { struct address_space *b_assoc_map; /* mapping this buffer is associated with */ atomic_t b_count; /* users using this buffer_head */ + spinlock_t b_uptodate_lock; + spinlock_t b_state_lock; }; /* diff --git a/include/linux/console.h b/include/linux/console.h index dcca533..cc87d8f 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -55,6 +55,7 @@ struct consw { void (*con_invert_region)(struct vc_data *, u16 *, int); u16 *(*con_screen_pos)(struct vc_data *, int); unsigned long (*con_getxy)(struct vc_data *, unsigned long, int *, int *); + int con_preemptible; // can it reschedule from within printk? 
}; extern const struct consw *conswitchp; @@ -92,6 +93,17 @@ void give_up_console(const struct consw *sw); #define CON_BOOT (8) #define CON_ANYTIME (16) /* Safe to call when cpu is offline */ #define CON_BRL (32) /* Used for a braille device */ +#define CON_ATOMIC (64) /* Safe to call in PREEMPT_RT atomic */ + +#ifdef CONFIG_PREEMPT_RT +# define console_atomic_safe(con) \ + (((con)->flags & CON_ATOMIC) || \ + (!in_atomic() && !irqs_disabled()) || \ + (system_state != SYSTEM_RUNNING) || \ + oops_in_progress) +#else +# define console_atomic_safe(con) (1) +#endif struct console { char name[16]; @@ -115,9 +127,9 @@ extern int update_console_cmdline(char *name, int idx, char *name_new, int idx_n extern void register_console(struct console *); extern int unregister_console(struct console *); extern struct console *console_drivers; -extern void acquire_console_sem(void); -extern int try_acquire_console_sem(void); -extern void release_console_sem(void); +extern void acquire_console_mutex(void); +extern int try_acquire_console_mutex(void); +extern void release_console_mutex(void); extern void console_conditional_schedule(void); extern void console_unblank(void); extern struct tty_driver *console_device(int *); diff --git a/include/linux/device.h b/include/linux/device.h index a62799f..37095cc 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -22,7 +22,6 @@ #include <linux/types.h> #include <linux/module.h> #include <linux/pm.h> -#include <linux/semaphore.h> #include <asm/atomic.h> #include <asm/device.h> @@ -106,7 +105,7 @@ extern int bus_unregister_notifier(struct bus_type *bus, /* All 4 notifers below get called with the target struct device * * as an argument. Note that those functions are likely to be called - * with the device semaphore held in the core, so be careful. + * with the device mutex held in the core, so be careful. 
*/ #define BUS_NOTIFY_ADD_DEVICE 0x00000001 /* device added */ #define BUS_NOTIFY_DEL_DEVICE 0x00000002 /* device removed */ @@ -385,7 +384,7 @@ struct device { const char *init_name; /* initial name of the device */ struct device_type *type; - struct semaphore sem; /* semaphore to synchronize calls to + struct mutex mutex; /* mutex to synchronize calls to * its driver. */ diff --git a/include/linux/fs.h b/include/linux/fs.h index b1bcb27..590c104 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -729,6 +729,7 @@ struct inode { uid_t i_uid; gid_t i_gid; dev_t i_rdev; + unsigned int i_blkbits; u64 i_version; loff_t i_size; #ifdef __NEED_I_SIZE_ORDERED @@ -738,12 +739,11 @@ struct inode { struct timespec i_mtime; struct timespec i_ctime; blkcnt_t i_blocks; - unsigned int i_blkbits; unsigned short i_bytes; umode_t i_mode; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ struct mutex i_mutex; - struct rw_semaphore i_alloc_sem; + struct rw_anon_semaphore i_alloc_sem; const struct inode_operations *i_op; const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct super_block *i_sb; diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index d5b3876..b5dec50 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -83,22 +83,16 @@ * Are we doing bottom half or hardware interrupt processing? * Are we in a softirq context? Interrupt context? */ -#define in_irq() (hardirq_count()) -#define in_softirq() (softirq_count()) -#define in_interrupt() (irq_count()) +#define in_irq() (hardirq_count()) +#define in_softirq() (softirq_count()) +#define in_interrupt() (irq_count()) /* * Are we in NMI context? */ #define in_nmi() (preempt_count() & NMI_MASK) -#if defined(CONFIG_PREEMPT) -# define PREEMPT_INATOMIC_BASE kernel_locked() -# define PREEMPT_CHECK_OFFSET 1 -#else -# define PREEMPT_INATOMIC_BASE 0 -# define PREEMPT_CHECK_OFFSET 0 -#endif +#define PREEMPT_INATOMIC_BASE 0 /* * Are we running in atomic context? 
WARNING: this macro cannot @@ -107,14 +101,21 @@ * used in the general case to determine whether sleeping is possible. * Do not use in_atomic() in driver code. */ -#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_INATOMIC_BASE) +#define in_atomic() \ + ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_INATOMIC_BASE) + +#ifdef CONFIG_PREEMPT +# define PREEMPT_CHECK_OFFSET 1 +#else +# define PREEMPT_CHECK_OFFSET 0 +#endif /* * Check whether we were atomic before we did preempt_disable(): - * (used by the scheduler, *after* releasing the kernel lock) + * (used by the scheduler) */ #define in_atomic_preempt_off() \ - ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) + ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) #ifdef CONFIG_PREEMPT # define preemptible() (preempt_count() == 0 && !irqs_disabled()) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 5d86fb2..1cc4577 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -107,6 +107,8 @@ struct hrtimer { enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; unsigned long state; + struct list_head cb_entry; + int irqsafe; #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; @@ -142,6 +144,7 @@ struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; clockid_t index; struct rb_root active; + struct list_head expired; struct rb_node *first; ktime_t resolution; ktime_t (*get_time)(void); @@ -180,6 +183,9 @@ struct hrtimer_cpu_base { unsigned long nr_hangs; ktime_t max_hang_time; #endif +#ifdef CONFIG_PREEMPT_SOFTIRQS + wait_queue_head_t wait; +#endif }; static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) @@ -367,6 +373,13 @@ static inline int hrtimer_restart(struct hrtimer *timer) return hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } +/* Softirq preemption could deadlock timer removal */ +#ifdef CONFIG_PREEMPT_SOFTIRQS + extern void hrtimer_wait_for_timer(const struct hrtimer 
*timer); +#else +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0) +#endif + /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 070ba06..5977b72 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -44,7 +44,7 @@ static inline int hw_breakpoint_type(struct perf_event *bp) return bp->attr.bp_type; } -static inline int hw_breakpoint_len(struct perf_event *bp) +static inline unsigned long hw_breakpoint_len(struct perf_event *bp) { return bp->attr.bp_len; } diff --git a/include/linux/init_task.h b/include/linux/init_task.h index abec69b..cae32ed 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -27,7 +27,7 @@ extern struct fs_struct init_fs; .cputimer = { \ .cputime = INIT_CPUTIME, \ .running = 0, \ - .lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ }, \ } @@ -167,6 +167,7 @@ extern struct cred init_cred; .fs_excl = ATOMIC_INIT(0), \ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .timer_slack_ns = 50000, /* 50 usec default slack */ \ + .posix_timer_list = NULL, \ .pids = { \ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ diff --git a/include/linux/input.h b/include/linux/input.h index 735ceaf..663208a 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -376,6 +376,7 @@ struct input_absinfo { #define KEY_DISPLAY_OFF 245 /* display device to off state */ #define KEY_WIMAX 246 +#define KEY_RFKILL 247 /* Key that controls all radios */ /* Range 248 - 255 is reserved for special needs of AT keyboard driver */ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 75f3f00..9f6580a 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -52,16 +52,20 @@ * 
IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished. * Used by threaded interrupts which need to keep the * irq line disabled until the threaded handler has been run. + * IRQF_NODELAY - Interrupt is not force threaded on -rt */ #define IRQF_DISABLED 0x00000020 #define IRQF_SAMPLE_RANDOM 0x00000040 #define IRQF_SHARED 0x00000080 #define IRQF_PROBE_SHARED 0x00000100 -#define IRQF_TIMER 0x00000200 +#define __IRQF_TIMER 0x00000200 #define IRQF_PERCPU 0x00000400 #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 #define IRQF_ONESHOT 0x00002000 +#define IRQF_NODELAY 0x00004000 + +#define IRQF_TIMER (__IRQF_TIMER | IRQF_NODELAY) /* * Bits used by threaded handlers: @@ -91,6 +95,7 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); * @thread_fn: interupt handler function for threaded interrupts * @thread: thread pointer for threaded interrupts * @thread_flags: flags related to @thread + * @thread_mask: bit mask to account for forced threads */ struct irqaction { irq_handler_t handler; @@ -103,6 +108,7 @@ struct irqaction { irq_handler_t thread_fn; struct task_struct *thread; unsigned long thread_flags; + unsigned long thread_mask; }; extern irqreturn_t no_action(int cpl, void *dev_id); @@ -179,7 +185,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); #ifdef CONFIG_LOCKDEP # define local_irq_enable_in_hardirq() do { } while (0) #else -# define local_irq_enable_in_hardirq() local_irq_enable() +# define local_irq_enable_in_hardirq() local_irq_enable_nort() #endif extern void disable_irq_nosync(unsigned int irq); @@ -319,6 +325,7 @@ static inline int disable_irq_wake(unsigned int irq) #ifndef __ARCH_SET_SOFTIRQ_PENDING #define set_softirq_pending(x) (local_softirq_pending() = (x)) +// FIXME: PREEMPT_RT: set_bit()? 
#define or_softirq_pending(x) (local_softirq_pending() |= (x)) #endif @@ -350,7 +357,6 @@ enum SCHED_SOFTIRQ, HRTIMER_SOFTIRQ, RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ - NR_SOFTIRQS }; @@ -368,14 +374,23 @@ struct softirq_action void (*action)(struct softirq_action *); }; +#ifdef CONFIG_PREEMPT_HARDIRQS +# define __raise_softirq_irqoff(nr) raise_softirq_irqoff(nr) +# define __do_raise_softirq_irqoff(nr) \ + do { or_softirq_pending(1UL << (nr)); } while (0) +#else +# define __raise_softirq_irqoff(nr) \ + do { or_softirq_pending(1UL << (nr)); } while (0) +# define __do_raise_softirq_irqoff(nr) __raise_softirq_irqoff(nr) +#endif + asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); -#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); -extern void wakeup_softirqd(void); +extern void softirq_check_pending_idle(void); /* This is the worklist that queues up per-cpu softirq work. * @@ -410,8 +425,9 @@ extern void __send_remote_softirq(struct call_single_data *cp, int cpu, to be executed on some cpu at least once after this. * If the tasklet is already scheduled, but its excecution is still not started, it will be executed only once. - * If this tasklet is already running on another CPU (or schedule is called - from tasklet itself), it is rescheduled for later. + * If this tasklet is already running on another CPU, it is rescheduled + for later. + * Schedule must not be called from the tasklet itself (a lockup occurs) * Tasklet is strictly serialized wrt itself, but not wrt another tasklets. If client needs some intertask synchronization, he makes it with spinlocks. 
@@ -436,27 +452,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data } enum { TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ - TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ + TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */ + TASKLET_STATE_PENDING /* Tasklet is pending */ }; -#ifdef CONFIG_SMP +#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED) +#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN) +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING) + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) static inline int tasklet_trylock(struct tasklet_struct *t) { return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); } +static inline int tasklet_tryunlock(struct tasklet_struct *t) +{ + return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN; +} + static inline void tasklet_unlock(struct tasklet_struct *t) { smp_mb__before_clear_bit(); clear_bit(TASKLET_STATE_RUN, &(t)->state); } -static inline void tasklet_unlock_wait(struct tasklet_struct *t) -{ - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } -} +extern void tasklet_unlock_wait(struct tasklet_struct *t); + #else #define tasklet_trylock(t) 1 +#define tasklet_tryunlock(t) 1 #define tasklet_unlock_wait(t) do { } while (0) #define tasklet_unlock(t) do { } while (0) #endif @@ -505,22 +530,14 @@ static inline void tasklet_disable(struct tasklet_struct *t) smp_mb(); } -static inline void tasklet_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic_dec(); - atomic_dec(&t->count); -} - -static inline void tasklet_hi_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic_dec(); - atomic_dec(&t->count); -} +extern void tasklet_enable(struct tasklet_struct *t); +extern void tasklet_hi_enable(struct tasklet_struct *t); extern void tasklet_kill(struct tasklet_struct *t); extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu); extern void tasklet_init(struct tasklet_struct *t, void 
(*func)(unsigned long), unsigned long data); +extern void takeover_tasklets(unsigned int cpu); struct tasklet_hrtimer { struct hrtimer timer; @@ -613,4 +630,19 @@ extern int arch_probe_nr_irqs(void); extern int arch_early_irq_init(void); extern int arch_init_chip_data(struct irq_desc *desc, int node); +/* + * local_irq* variants depending on RT/!RT + */ +#ifdef CONFIG_PREEMPT_RT +# define local_irq_disable_nort() do { } while (0) +# define local_irq_enable_nort() do { } while (0) +# define local_irq_save_nort(flags) do { local_save_flags(flags); } while (0) +# define local_irq_restore_nort(flags) do { (void)(flags); } while (0) +#else +# define local_irq_disable_nort() local_irq_disable() +# define local_irq_enable_nort() local_irq_enable() +# define local_irq_save_nort(flags) local_irq_save(flags) +# define local_irq_restore_nort(flags) local_irq_restore(flags) +#endif + #endif diff --git a/include/linux/irq.h b/include/linux/irq.h index 451481c..239c7b3 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -201,6 +201,7 @@ struct irq_desc { #endif #endif atomic_t threads_active; + unsigned long forced_threads_active; wait_queue_head_t wait_for_threads; #ifdef CONFIG_PROC_FS struct proc_dir_entry *dir; diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 331530c..19153c0 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -260,6 +260,15 @@ void buffer_assertion_failure(struct buffer_head *bh); #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) #endif +/* + * For assertions that are only valid on SMP (e.g. spin_is_locked()): + */ +#ifdef CONFIG_SMP +# define J_ASSERT_JH_SMP(jh, expr) J_ASSERT_JH(jh, expr) +#else +# define J_ASSERT_JH_SMP(jh, assert) do { } while (0) +#endif + #if defined(JBD_PARANOID_IOFAIL) #define J_EXPECT(expr, why...) J_ASSERT(expr) #define J_EXPECT_BH(bh, expr, why...) 
J_ASSERT_BH(bh, expr) @@ -315,32 +324,32 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh) static inline void jbd_lock_bh_state(struct buffer_head *bh) { - bit_spin_lock(BH_State, &bh->b_state); + spin_lock(&bh->b_state_lock); } static inline int jbd_trylock_bh_state(struct buffer_head *bh) { - return bit_spin_trylock(BH_State, &bh->b_state); + return spin_trylock(&bh->b_state_lock); } static inline int jbd_is_locked_bh_state(struct buffer_head *bh) { - return bit_spin_is_locked(BH_State, &bh->b_state); + return spin_is_locked(&bh->b_state_lock); } static inline void jbd_unlock_bh_state(struct buffer_head *bh) { - bit_spin_unlock(BH_State, &bh->b_state); + spin_unlock(&bh->b_state_lock); } static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) { - bit_spin_lock(BH_JournalHead, &bh->b_state); + spin_lock_irq(&bh->b_uptodate_lock); } static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) { - bit_spin_unlock(BH_JournalHead, &bh->b_state); + spin_unlock_irq(&bh->b_uptodate_lock); } struct jbd_revoke_table_s; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 638ce45..9a46328 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -319,6 +319,15 @@ void buffer_assertion_failure(struct buffer_head *bh); #define J_EXPECT_JH(jh, expr, why...) __journal_expect(expr, ## why) #endif +/* + * For assertions that are only valid on SMP (e.g. 
spin_is_locked()): + */ +#ifdef CONFIG_SMP +# define J_ASSERT_JH_SMP(jh, expr) J_ASSERT_JH(jh, expr) +#else +# define J_ASSERT_JH_SMP(jh, assert) do { } while (0) +#endif + enum jbd_state_bits { BH_JBD /* Has an attached ext3 journal_head */ = BH_PrivateStart, @@ -355,32 +364,32 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh) static inline void jbd_lock_bh_state(struct buffer_head *bh) { - bit_spin_lock(BH_State, &bh->b_state); + spin_lock(&bh->b_state_lock); } static inline int jbd_trylock_bh_state(struct buffer_head *bh) { - return bit_spin_trylock(BH_State, &bh->b_state); + return spin_trylock(&bh->b_state_lock); } static inline int jbd_is_locked_bh_state(struct buffer_head *bh) { - return bit_spin_is_locked(BH_State, &bh->b_state); + return spin_is_locked(&bh->b_state_lock); } static inline void jbd_unlock_bh_state(struct buffer_head *bh) { - bit_spin_unlock(BH_State, &bh->b_state); + spin_unlock(&bh->b_state_lock); } static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) { - bit_spin_lock(BH_JournalHead, &bh->b_state); + spin_lock(&bh->b_uptodate_lock); } static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) { - bit_spin_unlock(BH_JournalHead, &bh->b_state); + spin_unlock(&bh->b_uptodate_lock); } /* Flags in jbd_inode->i_flags */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 328bca6..45894a4 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -123,7 +123,7 @@ extern int _cond_resched(void); # define might_resched() do { } while (0) #endif -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) void __might_sleep(char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep @@ -286,6 +286,12 @@ extern void printk_tick(void); extern void asmlinkage __attribute__((format(printf, 1, 2))) early_printk(const char *fmt, ...); +#ifdef CONFIG_PREEMPT_RT +extern void zap_rt_locks(void); 
+#else +# define zap_rt_locks() do { } while (0) +#endif + unsigned long int_sqrt(unsigned long); static inline void console_silent(void) @@ -315,6 +321,7 @@ extern int root_mountflags; /* Values used for system_state */ extern enum system_states { SYSTEM_BOOTING, + SYSTEM_BOOTING_SCHEDULER_OK, SYSTEM_RUNNING, SYSTEM_HALT, SYSTEM_POWER_OFF, diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 6f6c5f3..bc0fc79 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -124,7 +124,7 @@ extern __must_check unsigned int kfifo_out_peek(struct kfifo *fifo, */ static inline bool kfifo_initialized(struct kfifo *fifo) { - return fifo->buffer != 0; + return fifo->buffer != NULL; } /** diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 1b672f7..620df87 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -170,7 +170,7 @@ struct kretprobe { int nmissed; size_t data_size; struct hlist_head free_instances; - spinlock_t lock; + raw_spinlock_t lock; }; struct kretprobe_instance { diff --git a/include/linux/list.h b/include/linux/list.h index 969f6e9..d62a35b 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -345,6 +345,9 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. 
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 9ccf0e2..528620e 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -40,6 +40,8 @@ struct lock_class_key { struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; }; +extern struct lock_class_key __lockdep_no_validate__; + #define LOCKSTAT_POINTS 4 /* @@ -266,6 +268,9 @@ extern void lockdep_init_map(struct lockdep_map *lock, const char *name, #define lockdep_set_subclass(lock, sub) \ lockdep_init_map(&(lock)->dep_map, #lock, \ (lock)->dep_map.key, sub) + +#define lockdep_set_novalidate_class(lock) \ + lockdep_set_class(lock, &__lockdep_no_validate__) /* * Compare locking classes */ @@ -350,6 +355,9 @@ static inline void lockdep_on(void) #define lockdep_set_class_and_subclass(lock, key, sub) \ do { (void)(key); } while (0) #define lockdep_set_subclass(lock, sub) do { } while (0) + +#define lockdep_set_novalidate_class(lock) do { } while (0) + /* * We don't define lockdep_match_class() and lockdep_match_key() for !LOCKDEP * case since the result is not well defined and the caller should rather diff --git a/include/linux/mm.h b/include/linux/mm.h index 60c467b..9fb57fa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -597,23 +597,39 @@ static __always_inline void *lowmem_page_address(struct page *page) #endif #if defined(WANT_PAGE_VIRTUAL) -#define page_address(page) ((page)->virtual) -#define set_page_address(page, address) \ - do { \ - (page)->virtual = (address); \ - } while(0) -#define page_address_init() do { } while(0) +/* + * wrap page->virtual so it is safe to set/read locklessly + */ +#define page_address(page) \ + ({ typeof((page)->virtual) v = (page)->virtual; \ + smp_read_barrier_depends(); \ + v; }) + +static inline int set_page_address(struct page *page, void *address) +{ + if (address) + return cmpxchg(&page->virtual, NULL, address) == NULL; + else { + /* + * cmpxchg is a bit abused because it is not guaranteed + * safe wrt direct assignment on all 
platforms. + */ + void *virt = page->virtual; + return cmpxchg(&page->virtual, virt, NULL) == virt; + } +} +void page_address_init(void); #endif #if defined(HASHED_PAGE_VIRTUAL) void *page_address(struct page *page); -void set_page_address(struct page *page, void *virtual); +int set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) -#define set_page_address(page, address) do { } while(0) +#define set_page_address(page, address) (0) #define page_address_init() do { } while(0) #endif @@ -758,7 +774,7 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *); -unsigned long unmap_vmas(struct mmu_gather **tlb, +unsigned long unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *); @@ -947,27 +963,85 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a * overflow into the next struct page (as it might with DEBUG_SPINLOCK). * When freeing, reset page->mapping so free_pages_check won't complain. */ +#ifndef CONFIG_PREEMPT_RT + #define __pte_lockptr(page) &((page)->ptl) -#define pte_lock_init(_page) do { \ - spin_lock_init(__pte_lockptr(_page)); \ -} while (0) + +static inline struct page *pte_lock_init(struct page *page) +{ + spin_lock_init(__pte_lockptr(page)); + return page; +} + #define pte_lock_deinit(page) ((page)->mapping = NULL) + +#else /* PREEMPT_RT */ + +/* + * On PREEMPT_RT the spinlock_t's are too large to embed in the + * page frame, hence it only has a pointer and we need to dynamically + * allocate the lock when we allocate PTE-pages.
+ * + * This is an overall win, since only a small fraction of the pages + * will be PTE pages under normal circumstances. + */ + +#define __pte_lockptr(page) ((page)->ptl) + +/* + * Heinous hack, relies on the caller doing something like: + * + * pte = alloc_pages(PGALLOC_GFP, 0); + * if (pte) + * pgtable_page_ctor(pte); + * return pte; + * + * This ensures we release the page and return NULL when the + * lock allocation fails. + */ +static inline struct page *pte_lock_init(struct page *page) +{ + page->ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + if (page->ptl) { + spin_lock_init(__pte_lockptr(page)); + } else { + __free_page(page); + page = NULL; + } + return page; +} + +static inline void pte_lock_deinit(struct page *page) +{ + kfree(page->ptl); + page->mapping = NULL; +} + +#endif /* PREEMPT_RT */ + #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) #else /* !USE_SPLIT_PTLOCKS */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ -#define pte_lock_init(page) do {} while (0) +static inline struct page *pte_lock_init(struct page *page) { return page; } #define pte_lock_deinit(page) do {} while (0) #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) #endif /* USE_SPLIT_PTLOCKS */ -static inline void pgtable_page_ctor(struct page *page) +static inline struct page *__pgtable_page_ctor(struct page *page) { - pte_lock_init(page); - inc_zone_page_state(page, NR_PAGETABLE); + page = pte_lock_init(page); + if (page) + inc_zone_page_state(page, NR_PAGETABLE); + return page; } +#define pgtable_page_ctor(page) \ +do { \ + page = __pgtable_page_ctor(page); \ +} while (0) + static inline void pgtable_page_dtor(struct page *page) { pte_lock_deinit(page); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 36f9627..bd17761 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -69,7 +69,11 @@ struct page { */ }; #if USE_SPLIT_PTLOCKS +#ifndef CONFIG_PREEMPT_RT 
spinlock_t ptl; +#else + spinlock_t *ptl; +#endif #endif struct kmem_cache *slab; /* SLUB: Pointer to slab */ struct page *first_page; /* Compound tail pages */ @@ -251,6 +255,9 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; + /* realtime bits */ + struct list_head delayed_drop; + /* Swap token stuff */ /* * Last value of global fault stamp as seen by this process. diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 878cab4..f98509b 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -12,11 +12,85 @@ #include <linux/list.h> #include <linux/spinlock_types.h> +#include <linux/rt_lock.h> #include <linux/linkage.h> #include <linux/lockdep.h> #include <asm/atomic.h> +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ + , .dep_map = { .name = #lockname } +#else +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) +#endif + +#ifdef CONFIG_PREEMPT_RT + +#include <linux/rtmutex.h> + +struct mutex { + struct rt_mutex lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + + +#define __MUTEX_INITIALIZER(mutexname) \ + { \ + .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \ + __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ + } + +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) + +extern void +__mutex_init(struct mutex *lock, char *name, struct lock_class_key *key); + +extern void __lockfunc _mutex_lock(struct mutex *lock); +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock); +extern int __lockfunc _mutex_lock_killable(struct mutex *lock); +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass); +extern int __lockfunc +_mutex_lock_interruptible_nested(struct mutex *lock, int subclass); +extern int __lockfunc +_mutex_lock_killable_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_trylock(struct mutex *lock); +extern void __lockfunc _mutex_unlock(struct mutex *lock); + 
+#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock) +#define mutex_lock(l) _mutex_lock(l) +#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l) +#define mutex_lock_killable(l) _mutex_lock_killable(l) +#define mutex_trylock(l) _mutex_trylock(l) +#define mutex_unlock(l) _mutex_unlock(l) +#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible_nested(l, s) +# define mutex_lock_killable_nested(l, s) \ + _mutex_lock_killable_nested(l, s) +#else +# define mutex_lock_nested(l, s) _mutex_lock(l) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible(l) +# define mutex_lock_killable_nested(l, s) \ + _mutex_lock_killable(l) +#endif + +# define mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + \ + __mutex_init((mutex), #mutex, &__key); \ +} while (0) + +#else /* PREEMPT_RT */ + /* * Simple, straightforward mutexes with strict semantics: * @@ -87,13 +161,6 @@ do { \ # define mutex_destroy(mutex) do { } while (0) #endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ - , .dep_map = { .name = #lockname } -#else -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) -#endif - #define __MUTEX_INITIALIZER(lockname) \ { .count = ATOMIC_INIT(1) \ , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ @@ -150,6 +217,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); */ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); +#endif /* !PREEMPT_RT */ + extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); #endif diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a3fccc8..a3cbe54 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -508,7 +508,7 @@ struct netdev_queue { * write mostly part */ spinlock_t _xmit_lock 
____cacheline_aligned_in_smp; - int xmit_lock_owner; + void *xmit_lock_owner; /* * please use this field instead of dev->trans_start */ @@ -1745,41 +1745,49 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) return (1 << debug_value) - 1; } -static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) +static inline void __netif_tx_lock(struct netdev_queue *txq) { spin_lock(&txq->_xmit_lock); - txq->xmit_lock_owner = cpu; + txq->xmit_lock_owner = (void *)current; +} + +/* + * Do we hold the xmit_lock already? + */ +static inline int netif_tx_lock_recursion(struct netdev_queue *txq) +{ + return txq->xmit_lock_owner == (void *)current; } static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); - txq->xmit_lock_owner = smp_processor_id(); + txq->xmit_lock_owner = (void *)current; } static inline int __netif_tx_trylock(struct netdev_queue *txq) { int ok = spin_trylock(&txq->_xmit_lock); if (likely(ok)) - txq->xmit_lock_owner = smp_processor_id(); + txq->xmit_lock_owner = (void *)current; return ok; } static inline void __netif_tx_unlock(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + txq->xmit_lock_owner = (void *)-1; spin_unlock(&txq->_xmit_lock); } static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + txq->xmit_lock_owner = (void *)-1; spin_unlock_bh(&txq->_xmit_lock); } static inline void txq_trans_update(struct netdev_queue *txq) { - if (txq->xmit_lock_owner != -1) + if (txq->xmit_lock_owner != (void *)-1) txq->trans_start = jiffies; } @@ -1792,10 +1800,8 @@ static inline void txq_trans_update(struct netdev_queue *txq) static inline void netif_tx_lock(struct net_device *dev) { unsigned int i; - int cpu; spin_lock(&dev->tx_global_lock); - cpu = smp_processor_id(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); @@ -1805,7 +1811,7 @@ static inline void netif_tx_lock(struct 
net_device *dev) * the ->hard_start_xmit() handler and already * checked the frozen bit. */ - __netif_tx_lock(txq, cpu); + __netif_tx_lock(txq); set_bit(__QUEUE_STATE_FROZEN, &txq->state); __netif_tx_unlock(txq); } @@ -1840,9 +1846,9 @@ static inline void netif_tx_unlock_bh(struct net_device *dev) local_bh_enable(); } -#define HARD_TX_LOCK(dev, txq, cpu) { \ +#define HARD_TX_LOCK(dev, txq) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ - __netif_tx_lock(txq, cpu); \ + __netif_tx_lock(txq); \ } \ } @@ -1855,14 +1861,12 @@ static inline void netif_tx_unlock_bh(struct net_device *dev) static inline void netif_tx_disable(struct net_device *dev) { unsigned int i; - int cpu; local_bh_disable(); - cpu = smp_processor_id(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - __netif_tx_lock(txq, cpu); + __netif_tx_lock(txq); netif_tx_stop_queue(txq); __netif_tx_unlock(txq); } diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 378f27a..e173ef5 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -457,22 +457,35 @@ DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks); * _Only_ that special combination of being per-cpu and never getting * re-entered asynchronously means that the count is safe. 
*/ -static inline void xt_info_rdlock_bh(void) +static inline int xt_info_rdlock_bh(void) { struct xt_info_lock *lock; + int cpu; local_bh_disable(); - lock = &__get_cpu_var(xt_info_locks); - if (likely(!lock->readers++)) + preempt_disable_rt(); + cpu = smp_processor_id(); + lock = &per_cpu(xt_info_locks, cpu); + if (likely(!lock->readers++)) { + preempt_enable_rt(); spin_lock(&lock->lock); + } else + preempt_enable_rt(); + return cpu; } -static inline void xt_info_rdunlock_bh(void) +static inline void xt_info_rdunlock_bh(int cpu) { - struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks); + struct xt_info_lock *lock = &per_cpu(xt_info_locks, cpu); - if (likely(!--lock->readers)) + preempt_disable_rt(); + + if (likely(!--lock->readers)) { + preempt_enable_rt(); spin_unlock(&lock->lock); + } else + preempt_enable_rt(); + local_bh_enable(); } diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 2524267..838405c 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -84,7 +84,7 @@ static inline void *netpoll_poll_lock(struct napi_struct *napi) rcu_read_lock(); /* deal with race on ->npinfo */ if (dev && dev->npinfo) { spin_lock(&napi->poll_lock); - napi->poll_owner = smp_processor_id(); + napi->poll_owner = raw_smp_processor_id(); return napi; } return NULL; diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index 5171639..7d4ed4d 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -156,7 +156,7 @@ ssize_t oprofilefs_ulong_to_user(unsigned long val, char __user * buf, size_t co int oprofilefs_ulong_from_user(unsigned long * val, char const __user * buf, size_t count); /** lock for read/write safety */ -extern spinlock_t oprofilefs_lock; +extern raw_spinlock_t oprofilefs_lock; /** * Add the contents of a circular buffer to the event buffer. 
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index b0e4eb1..491e776 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -12,6 +12,7 @@ */ struct page_cgroup { unsigned long flags; + spinlock_t lock; struct mem_cgroup *mem_cgroup; struct page *page; struct list_head lru; /* per cgroup LRU list */ @@ -85,12 +86,12 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) static inline void lock_page_cgroup(struct page_cgroup *pc) { - bit_spin_lock(PCG_LOCK, &pc->flags); + spin_lock(&pc->lock); } static inline void unlock_page_cgroup(struct page_cgroup *pc) { - bit_spin_unlock(PCG_LOCK, &pc->flags); + spin_unlock(&pc->lock); } #else /* CONFIG_CGROUP_MEM_RES_CTLR */ diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index bab82f4..0af5218 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -9,7 +9,7 @@ #define _LINUX_PAGEVEC_H /* 14 pointers + two long's align the pagevec structure to a power of two */ -#define PAGEVEC_SIZE 14 +#define PAGEVEC_SIZE 8 struct page; struct address_space; diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 5a5d6ce..3278432 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -87,6 +87,22 @@ DEFINE_PER_CPU_SECTION(type, name, "") /* + * next two added for RT patch + * (wonder if we need corresponding DECLARE_*'s?) (clrkwllms) + */ +#define DEFINE_PER_CPU_SPINLOCK(name, sec) \ + __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES \ + __DEFINE_SPINLOCK(per_cpu__lock_##name##_locked) + +#define DECLARE_PER_CPU_LOCKED(type, name) \ + extern PER_CPU_ATTRIBUTES spinlock_t __per_cpu_var_lock(name); \ + extern PER_CPU_ATTRIBUTES __typeof__(type) __per_cpu_var_lock_var(name) + +#define DEFINE_PER_CPU_LOCKED(type, name) \ + DEFINE_PER_CPU_SPINLOCK(name, ""); \ + DEFINE_PER_CPU_SECTION(type, name##_locked, "") + +/* * Declaration/definition used for per-CPU variables that must come first in * the set of variables. 
*/ @@ -138,7 +154,9 @@ * Intermodule exports for per-CPU variables. */ #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) +#define EXPORT_PER_CPU_LOCKED_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var##_locked) #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) +#define EXPORT_PER_CPU_LOCKED_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var##_locked) #endif /* _LINUX_PERCPU_DEFS_H */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index cf5efbc..ea3dfff 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -32,6 +32,51 @@ &__get_cpu_var(var); })) #define put_cpu_var(var) preempt_enable() +/* + * Per-CPU data structures with an additional lock - useful for + * PREEMPT_RT code that wants to reschedule but also wants per-CPU + * data structures. + * + * 'cpu' gets updated with the CPU the task is currently executing on. + * + * NOTE: on normal !PREEMPT_RT kernels these per-CPU variables are the + * same as the normal per-CPU variables, so there is no runtime + * overhead. 
+ */ +#ifdef CONFIG_PREEMPT_RT +#define get_cpu_var_locked(var, cpuptr) \ +(*({ \ + spinlock_t *__lock; \ + int __cpu; \ + \ +again: \ + __cpu = raw_smp_processor_id(); \ + __lock = &__get_cpu_lock(var, __cpu); \ + spin_lock(__lock); \ + if (!cpu_online(__cpu)) { \ + spin_unlock(__lock); \ + goto again; \ + } \ + *(cpuptr) = __cpu; \ + &__get_cpu_var_locked(var, __cpu); \ +})) +#else +#define get_cpu_var_locked(var, cpuptr) \ +(*({ \ + int __cpu; \ + \ + preempt_disable(); \ + __cpu = smp_processor_id(); \ + spin_lock(&__get_cpu_lock(var, __cpu)); \ + preempt_enable(); \ + *(cpuptr) = __cpu; \ + &__get_cpu_var_locked(var, __cpu); \ +})) +#endif + +#define put_cpu_var_locked(var, cpu) \ + do { (void)cpu; spin_unlock(&__get_cpu_lock(var, cpu)); } while (0) + #ifdef CONFIG_SMP /* minimum unit size, also is the maximum supported allocation size */ diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index a7684a5..7823c33 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -16,7 +16,7 @@ #ifdef CONFIG_SMP struct percpu_counter { - spinlock_t lock; + raw_spinlock_t lock; s64 count; #ifdef CONFIG_HOTPLUG_CPU struct list_head list; /* All percpu_counters are on a list */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8fa7187..f57b3ab 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -211,11 +211,9 @@ struct perf_event_attr { __u32 wakeup_watermark; /* bytes before wakeup */ }; - __u32 __reserved_2; - - __u64 bp_addr; __u32 bp_type; - __u32 bp_len; + __u64 bp_addr; + __u64 bp_len; }; /* @@ -647,6 +645,9 @@ struct perf_event { int pending_kill; int pending_disable; struct perf_pending_entry pending; +#ifdef CONFIG_PREEMPT_RT + struct perf_pending_entry pending_softirq; +#endif atomic_t event_limit; @@ -755,6 +756,7 @@ extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void 
set_perf_event_pending(void); extern void perf_event_do_pending(void); +extern void perf_event_do_pending_softirq(void); extern void perf_event_print_debug(void); extern void __perf_disable(void); extern bool __perf_enable(void); @@ -885,6 +887,7 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_do_pending(void) { } +static inline void perf_event_do_pending_softirq(void) { } static inline void perf_event_print_debug(void) { } static inline void perf_disable(void) { } static inline void perf_enable(void) { } diff --git a/include/linux/plist.h b/include/linux/plist.h index 8227f71..1559d4e 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -75,14 +75,16 @@ #include <linux/kernel.h> #include <linux/list.h> -#include <linux/spinlock_types.h> + +struct spinlock; +struct raw_spinlock; struct plist_head { struct list_head prio_list; struct list_head node_list; #ifdef CONFIG_DEBUG_PI_LIST - raw_spinlock_t *rawlock; - spinlock_t *spinlock; + struct raw_spinlock *rawlock; + struct spinlock *spinlock; #endif }; @@ -142,7 +144,7 @@ struct plist_node { * @lock: spinlock protecting the list (debugging) */ static inline void -plist_head_init(struct plist_head *head, spinlock_t *lock) +plist_head_init(struct plist_head *head, struct spinlock *lock) { INIT_LIST_HEAD(&head->prio_list); INIT_LIST_HEAD(&head->node_list); @@ -158,7 +160,7 @@ plist_head_init(struct plist_head *head, spinlock_t *lock) * @lock: raw_spinlock protecting the list (debugging) */ static inline void -plist_head_init_raw(struct plist_head *head, raw_spinlock_t *lock) +plist_head_init_raw(struct plist_head *head, struct raw_spinlock *lock) { INIT_LIST_HEAD(&head->prio_list); INIT_LIST_HEAD(&head->node_list); diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 2e681d9..9dfe0eb 
100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -33,12 +33,24 @@ do { \ barrier(); \ } while (0) -#define preempt_enable_no_resched() \ +#define __preempt_enable_no_resched() \ do { \ barrier(); \ dec_preempt_count(); \ } while (0) +#ifdef CONFIG_DEBUG_PREEMPT +extern void notrace preempt_enable_no_resched(void); +#else +# define preempt_enable_no_resched() __preempt_enable_no_resched() +#endif + +#define preempt_enable_and_schedule() \ +do { \ + __preempt_enable_no_resched(); \ + schedule(); \ +} while (0) + #define preempt_check_resched() \ do { \ if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ @@ -47,7 +59,7 @@ do { \ #define preempt_enable() \ do { \ - preempt_enable_no_resched(); \ + __preempt_enable_no_resched(); \ barrier(); \ preempt_check_resched(); \ } while (0) @@ -84,6 +96,8 @@ do { \ #define preempt_disable() do { } while (0) #define preempt_enable_no_resched() do { } while (0) +#define __preempt_enable_no_resched() do { } while (0) +#define preempt_enable_and_schedule() schedule() #define preempt_enable() do { } while (0) #define preempt_check_resched() do { } while (0) @@ -93,6 +107,18 @@ do { \ #endif +#ifdef CONFIG_PREEMPT_RT +# define preempt_disable_rt() preempt_disable() +# define preempt_enable_rt() preempt_enable() +# define preempt_disable_nort() do { } while (0) +# define preempt_enable_nort() do { } while (0) +#else +# define preempt_disable_rt() do { } while (0) +# define preempt_enable_rt() do { } while (0) +# define preempt_disable_nort() preempt_disable() +# define preempt_enable_nort() preempt_enable() +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; diff --git a/include/linux/profile.h b/include/linux/profile.h index a0fc322..5b72082 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -8,10 +8,11 @@ #include <asm/errno.h> -#define CPU_PROFILING 1 -#define SCHED_PROFILING 2 -#define SLEEP_PROFILING 3 -#define KVM_PROFILING 4 +#define CPU_PROFILING 1 +#define SCHED_PROFILING 
2 +#define SLEEP_PROFILING 3 +#define KVM_PROFILING 4 +#define PREEMPT_PROFILING 5 struct proc_dir_entry; struct pt_regs; @@ -36,6 +37,8 @@ enum profile_type { PROFILE_MUNMAP }; +extern int prof_pid; + #ifdef CONFIG_PROFILING extern int prof_on __read_mostly; diff --git a/include/linux/proportions.h b/include/linux/proportions.h index cf793bb..ef35bb7 100644 --- a/include/linux/proportions.h +++ b/include/linux/proportions.h @@ -58,7 +58,7 @@ struct prop_local_percpu { */ int shift; unsigned long period; - spinlock_t lock; /* protect the snapshot state */ + raw_spinlock_t lock; /* protect the snapshot state */ }; int prop_local_init_percpu(struct prop_local_percpu *pl); @@ -106,11 +106,11 @@ struct prop_local_single { */ unsigned long period; int shift; - spinlock_t lock; /* protect the snapshot state */ + raw_spinlock_t lock; /* protect the snapshot state */ }; #define INIT_PROP_LOCAL_SINGLE(name) \ -{ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ +{ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ } int prop_local_init_single(struct prop_local_single *pl); diff --git a/include/linux/quicklist.h b/include/linux/quicklist.h index bd46643..1bc3d46 100644 --- a/include/linux/quicklist.h +++ b/include/linux/quicklist.h @@ -18,7 +18,7 @@ struct quicklist { int nr_pages; }; -DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; +DECLARE_PER_CPU_LOCKED(struct quicklist, quicklist)[CONFIG_NR_QUICK]; /* * The two key functions quicklist_alloc and quicklist_free are inline so @@ -30,19 +30,27 @@ DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; * The fast patch in quicklist_alloc touched only a per cpu cacheline and * the first cacheline of the page itself. There is minmal overhead involved. 
*/ -static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *)) +static inline void *__quicklist_alloc(struct quicklist *q) { - struct quicklist *q; - void **p = NULL; + void **p = q->page; - q =&get_cpu_var(quicklist)[nr]; - p = q->page; if (likely(p)) { q->page = p[0]; p[0] = NULL; q->nr_pages--; } - put_cpu_var(quicklist); + return p; +} + +static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *)) +{ + struct quicklist *q; + void **p; + int cpu; + + q = &get_cpu_var_locked(quicklist, &cpu)[nr]; + p = __quicklist_alloc(q); + put_cpu_var_locked(quicklist, cpu); if (likely(p)) return p; @@ -56,12 +64,13 @@ static inline void __quicklist_free(int nr, void (*dtor)(void *), void *p, struct page *page) { struct quicklist *q; + int cpu; - q = &get_cpu_var(quicklist)[nr]; + q = &get_cpu_var_locked(quicklist, &cpu)[nr]; *(void **)p = q->page; q->page = p; q->nr_pages++; - put_cpu_var(quicklist); + put_cpu_var_locked(quicklist, cpu); } static inline void quicklist_free(int nr, void (*dtor)(void *), void *pp) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index c5da749..9eb17f9 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -169,7 +169,18 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); unsigned long radix_tree_prev_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); +/* + * On a mutex based kernel we can freely schedule within the radix code: + */ +#ifdef CONFIG_PREEMPT_RT +static inline int radix_tree_preload(gfp_t gfp_mask) +{ + return 0; +} +#else int radix_tree_preload(gfp_t gfp_mask); +#endif + void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag); @@ -189,7 +200,9 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); static inline void radix_tree_preload_end(void) { +#ifndef CONFIG_PREEMPT_RT 
preempt_enable(); +#endif } #endif /* _LINUX_RADIX_TREE_H */ diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h index 668cf1b..7596e38 100644 --- a/include/linux/ratelimit.h +++ b/include/linux/ratelimit.h @@ -8,7 +8,7 @@ #define DEFAULT_RATELIMIT_BURST 10 struct ratelimit_state { - spinlock_t lock; /* protect the state */ + raw_spinlock_t lock; /* protect the state */ int interval; int burst; @@ -20,7 +20,7 @@ struct ratelimit_state { #define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) \ \ struct ratelimit_state name = { \ - .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ .interval = interval_init, \ .burst = burst_init, \ } diff --git a/include/linux/rt_lock.h b/include/linux/rt_lock.h new file mode 100644 index 0000000..7dff59e --- /dev/null +++ b/include/linux/rt_lock.h @@ -0,0 +1,184 @@ +#ifndef __LINUX_RT_LOCK_H +#define __LINUX_RT_LOCK_H + +/* + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004, 2005 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * This file contains the main data structure definitions. 
+ */ +#include <linux/rtmutex.h> +#include <asm/atomic.h> +#include <linux/spinlock_types.h> + +#ifdef CONFIG_PREEMPT_RT + +static inline int preempt_rt(void) { return 1; } + +/* + * spinlocks - an RT mutex plus lock-break field: + */ +typedef struct spinlock { + struct rt_mutex lock; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} spinlock_t; + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# define __RT_SPIN_INITIALIZER(name) \ + { \ + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name), \ + .save_state = 1, \ + .file = __FILE__, \ + .line = __LINE__ , \ + } +#else +# define __RT_SPIN_INITIALIZER(name) \ + { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name) } +#endif + +#define __SPIN_LOCK_UNLOCKED(name) \ + { .lock = __RT_SPIN_INITIALIZER(name), \ + SPIN_DEP_MAP_INIT(name) } + +#define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(spin_old_style) + +#define __DEFINE_SPINLOCK(name) \ + spinlock_t name = __SPIN_LOCK_UNLOCKED(name) + +#define DEFINE_SPINLOCK(name) \ + spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name) + +extern void +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key); + +#define spin_lock_init(lock) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_spin_lock_init(lock, #lock, &__key); \ +} while (0) + +extern void __lockfunc rt_spin_lock(spinlock_t *lock); +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass); +extern void __lockfunc rt_spin_unlock(spinlock_t *lock); +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock); +extern int __lockfunc +rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags); +extern int __lockfunc rt_spin_trylock(spinlock_t *lock); +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock); + +/* + * lockdep-less calls, for derived types like rwlock: + * (for trylock they can use rt_mutex_trylock() directly. 
+ */ +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock); +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); + +/* + * rwlocks - an RW semaphore plus lock-break field: + */ +typedef struct { + struct rt_mutex lock; + int read_depth; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} rwlock_t; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } +#else +# define RW_DEP_MAP_INIT(lockname) +#endif + +#define __RW_LOCK_UNLOCKED(name) \ + { .lock = __RT_SPIN_INITIALIZER(name), \ + RW_DEP_MAP_INIT(name) } + +#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(rw_old_style) + +#define DEFINE_RWLOCK(name) \ + rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name) + +extern void __lockfunc rt_write_lock(rwlock_t *rwlock); +extern void __lockfunc rt_read_lock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, + unsigned long *flags); +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock); +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock); +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock); +extern void +__rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key); + +#define rwlock_init(rwl) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_rwlock_init(rwl, #rwl, &__key); \ +} while (0) + +/* + * RW-semaphores are a spinlock plus a reader-depth count. + * + * Note that the semantics are different from the usual + * Linux rw-sems, in PREEMPT_RT mode we do not allow + * multiple readers to hold the lock at once, we only allow + * a read-lock owner to read-lock recursively. 
This is + * better for latency, makes the implementation inherently + * fair and makes it simpler as well: + */ +struct rw_semaphore { + struct rt_mutex lock; + int read_depth; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#define __RWSEM_INITIALIZER(name) \ + { .lock = __RT_MUTEX_INITIALIZER(name.lock), \ + RW_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(lockname) \ + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) + +extern void __rt_rwsem_init(struct rw_semaphore *rwsem, char *name, + struct lock_class_key *key); + +# define rt_init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_rwsem_init((sem), #sem, &__key); \ +} while (0) + +extern void rt_down_write(struct rw_semaphore *rwsem); +extern void +rt_down_read_nested(struct rw_semaphore *rwsem, int subclass); +extern void +rt_down_write_nested(struct rw_semaphore *rwsem, int subclass); +extern void rt_down_read(struct rw_semaphore *rwsem); +extern int rt_down_write_trylock(struct rw_semaphore *rwsem); +extern int rt_down_read_trylock(struct rw_semaphore *rwsem); +extern void rt_up_read(struct rw_semaphore *rwsem); +extern void rt_up_write(struct rw_semaphore *rwsem); +extern void rt_downgrade_write(struct rw_semaphore *rwsem); + +#else + +static inline int preempt_rt(void) { return 0; } + +#endif /* CONFIG_PREEMPT_RT */ + +#endif + diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 281d8fd..076359d 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -88,6 +88,8 @@ extern void rt_mutex_destroy(struct rt_mutex *lock); extern void rt_mutex_lock(struct rt_mutex *lock); extern int rt_mutex_lock_interruptible(struct rt_mutex *lock, int detect_deadlock); +extern int rt_mutex_lock_killable(struct rt_mutex *lock, + int detect_deadlock); extern int rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, int detect_deadlock); diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 
71e0b00..688b04f 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -5,6 +5,60 @@ # error "please don't include this file directly" #endif +#ifdef CONFIG_PREEMPT_RT + +#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock)) +#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock)) + +#define write_trylock_irqsave(lock, flags) \ + __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags)) + +#define write_lock(lock) rt_write_lock(lock) +#define read_lock(lock) rt_read_lock(lock) + +#define read_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + flags = rt_read_lock_irqsave(lock); \ + } while (0) + +#define write_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + flags = rt_write_lock_irqsave(lock); \ + } while (0) + +#define read_lock_irq(lock) rt_read_lock(lock) +#define read_lock_bh(lock) rt_read_lock(lock) + +#define write_lock_irq(lock) rt_write_lock(lock) +#define write_lock_bh(lock) rt_write_lock(lock) + +#define read_unlock(lock) rt_read_unlock(lock) +#define write_unlock(lock) rt_write_unlock(lock) +#define read_unlock_irq(lock) rt_read_unlock(lock) +#define write_unlock_irq(lock) rt_write_unlock(lock) + +#define read_unlock_irqrestore(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + rt_read_unlock(lock); \ + } while (0) + +#define read_unlock_bh(lock) rt_read_unlock(lock) + +#define write_unlock_irqrestore(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + rt_write_unlock(lock); \ + } while (0) + +#define write_unlock_bh(lock) rt_write_unlock(lock) + +#else + /* * rwlock related methods * @@ -121,5 +175,6 @@ do { \ write_trylock(lock) ? 
\ 1 : ({ local_irq_restore(flags); 0; }); \ }) +#endif #endif /* __LINUX_RWLOCK_H */ diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index 9c9f049..99d0eed 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -250,7 +250,7 @@ static inline void __raw_read_unlock_bh(rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); do_raw_read_unlock(lock); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } @@ -275,7 +275,7 @@ static inline void __raw_write_unlock_bh(rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); do_raw_write_unlock(lock); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index bdfcc25..6608521 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -22,6 +22,65 @@ struct rwsem_waiter; /* + * the rw-anon-semaphore definition + * - if activity is 0 then there are no active readers or writers + * - if activity is +ve then that is the number of active readers + * - if activity is -1 then there is one active writer + * - if wait_list is not empty, then there are processes waiting for the semaphore + * + * the anon in the name documents that the semaphore has no full + * restrictions versus ownership.
+ */ +struct rw_anon_semaphore { + __s32 activity; + raw_spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +#else +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) +#endif + +#define __RWSEM_ANON_INITIALIZER(name) \ +{ 0, __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \ + __RWSEM_ANON_DEP_MAP_INIT(name) } + +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_ANON_INITIALIZER(name) + +extern void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key); + +#define init_anon_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_anon_rwsem((sem), #sem, &__key); \ +} while (0) + +extern void __down_read(struct rw_anon_semaphore *sem); +extern int __down_read_trylock(struct rw_anon_semaphore *sem); +extern void __down_write(struct rw_anon_semaphore *sem); +extern void __down_write_nested(struct rw_anon_semaphore *sem, int subclass); +extern int __down_write_trylock(struct rw_anon_semaphore *sem); +extern void __up_read(struct rw_anon_semaphore *sem); +extern void __up_write(struct rw_anon_semaphore *sem); +extern void __downgrade_write(struct rw_anon_semaphore *sem); + extern int anon_rwsem_is_locked(struct rw_anon_semaphore *sem); + +#ifndef CONFIG_PREEMPT_RT +/* + * Non preempt-rt implementation of rw_semaphore. Same as above, but + * restricted vs. ownership. i.e. ownerless locked state and non owner + * release not allowed. 
+ */ + +/* * the rw-semaphore definition * - if activity is 0 then there are no active readers or writers * - if activity is +ve then that is the number of active readers @@ -30,7 +89,7 @@ struct rwsem_waiter; */ struct rw_semaphore { __s32 activity; - spinlock_t wait_lock; + raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -44,14 +103,17 @@ struct rw_semaphore { #endif #define __RWSEM_INITIALIZER(name) \ -{ 0, __SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \ +{ 0, __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \ __RWSEM_DEP_MAP_INIT(name) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); +static inline void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + __init_anon_rwsem((struct rw_anon_semaphore *)sem, name, key); +} #define init_rwsem(sem) \ do { \ @@ -60,15 +122,11 @@ do { \ __init_rwsem((sem), #sem, &__key); \ } while (0) -extern void __down_read(struct rw_semaphore *sem); -extern int __down_read_trylock(struct rw_semaphore *sem); -extern void __down_write(struct rw_semaphore *sem); -extern void __down_write_nested(struct rw_semaphore *sem, int subclass); -extern int __down_write_trylock(struct rw_semaphore *sem); -extern void __up_read(struct rw_semaphore *sem); -extern void __up_write(struct rw_semaphore *sem); -extern void __downgrade_write(struct rw_semaphore *sem); -extern int rwsem_is_locked(struct rw_semaphore *sem); +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return anon_rwsem_is_locked((struct rw_anon_semaphore *)sem); +} +#endif /* !PREEMPT_RT */ #endif /* __KERNEL__ */ #endif /* _LINUX_RWSEM_SPINLOCK_H */ diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index efd348f..e516c81 100644 --- a/include/linux/rwsem.h +++ 
b/include/linux/rwsem.h @@ -11,9 +11,11 @@ #include <linux/types.h> #include <linux/kernel.h> +#include <linux/rt_lock.h> #include <asm/system.h> #include <asm/atomic.h> +struct rw_anon_semaphore; struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK @@ -25,37 +27,37 @@ struct rw_semaphore; /* * lock for reading */ -extern void down_read(struct rw_semaphore *sem); +extern void anon_down_read(struct rw_anon_semaphore *sem); /* * trylock for reading -- returns 1 if successful, 0 if contention */ -extern int down_read_trylock(struct rw_semaphore *sem); +extern int anon_down_read_trylock(struct rw_anon_semaphore *sem); /* * lock for writing */ -extern void down_write(struct rw_semaphore *sem); +extern void anon_down_write(struct rw_anon_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention */ -extern int down_write_trylock(struct rw_semaphore *sem); +extern int anon_down_write_trylock(struct rw_anon_semaphore *sem); /* * release a read lock */ -extern void up_read(struct rw_semaphore *sem); +extern void anon_up_read(struct rw_anon_semaphore *sem); /* * release a write lock */ -extern void up_write(struct rw_semaphore *sem); +extern void anon_up_write(struct rw_anon_semaphore *sem); /* * downgrade write lock to read lock */ -extern void downgrade_write(struct rw_semaphore *sem); +extern void anon_downgrade_write(struct rw_anon_semaphore *sem); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -71,21 +73,123 @@ extern void downgrade_write(struct rw_semaphore *sem); * lockdep_set_class() at lock initialization time. * See Documentation/lockdep-design.txt for more details.) */ -extern void down_read_nested(struct rw_semaphore *sem, int subclass); -extern void down_write_nested(struct rw_semaphore *sem, int subclass); +extern void anon_down_read_nested(struct rw_anon_semaphore *sem, int subclass); +extern void anon_down_write_nested(struct rw_anon_semaphore *sem, int subclass); /* * Take/release a lock when not the owner will release it. 
* * [ This API should be avoided as much as possible - the * proper abstraction for this case is completions. ] */ -extern void down_read_non_owner(struct rw_semaphore *sem); -extern void up_read_non_owner(struct rw_semaphore *sem); +extern void anon_down_read_non_owner(struct rw_anon_semaphore *sem); +extern void anon_up_read_non_owner(struct rw_anon_semaphore *sem); #else -# define down_read_nested(sem, subclass) down_read(sem) -# define down_write_nested(sem, subclass) down_write(sem) -# define down_read_non_owner(sem) down_read(sem) -# define up_read_non_owner(sem) up_read(sem) +# define anon_down_read_nested(sem, subclass) anon_down_read(sem) +# define anon_down_write_nested(sem, subclass) anon_down_write(sem) +# define anon_down_read_non_owner(sem) anon_down_read(sem) +# define anon_up_read_non_owner(sem) anon_up_read(sem) +#endif + +#ifdef CONFIG_PREEMPT_RT + +#include <linux/rt_lock.h> + +#define init_rwsem(sem) rt_init_rwsem(sem) +#define rwsem_is_locked(s) rt_mutex_is_locked(&(s)->lock) + +static inline void down_read(struct rw_semaphore *sem) +{ + rt_down_read(sem); +} + +static inline int down_read_trylock(struct rw_semaphore *sem) +{ + return rt_down_read_trylock(sem); +} + +static inline void down_write(struct rw_semaphore *sem) +{ + rt_down_write(sem); +} + +static inline int down_write_trylock(struct rw_semaphore *sem) +{ + return rt_down_write_trylock(sem); +} + +static inline void up_read(struct rw_semaphore *sem) +{ + rt_up_read(sem); +} + +static inline void up_write(struct rw_semaphore *sem) +{ + rt_up_write(sem); +} + +static inline void downgrade_write(struct rw_semaphore *sem) +{ + rt_downgrade_write(sem); +} + +static inline void down_read_nested(struct rw_semaphore *sem, int subclass) +{ + rt_down_read_nested(sem, subclass); +} + +static inline void down_write_nested(struct rw_semaphore *sem, int subclass) +{ + rt_down_write_nested(sem, subclass); +} + +#else +/* + * Non preempt-rt implementations + */ +static inline void 
down_read(struct rw_semaphore *sem) +{ + anon_down_read((struct rw_anon_semaphore *)sem); +} + +static inline int down_read_trylock(struct rw_semaphore *sem) +{ + return anon_down_read_trylock((struct rw_anon_semaphore *)sem); +} + +static inline void down_write(struct rw_semaphore *sem) +{ + anon_down_write((struct rw_anon_semaphore *)sem); +} + +static inline int down_write_trylock(struct rw_semaphore *sem) +{ + return anon_down_write_trylock((struct rw_anon_semaphore *)sem); +} + +static inline void up_read(struct rw_semaphore *sem) +{ + anon_up_read((struct rw_anon_semaphore *)sem); +} + +static inline void up_write(struct rw_semaphore *sem) +{ + anon_up_write((struct rw_anon_semaphore *)sem); +} + +static inline void downgrade_write(struct rw_semaphore *sem) +{ + anon_downgrade_write((struct rw_anon_semaphore *)sem); +} + +static inline void down_read_nested(struct rw_semaphore *sem, int subclass) +{ + anon_down_read_nested((struct rw_anon_semaphore *)sem, subclass); +} + +static inline void down_write_nested(struct rw_semaphore *sem, int subclass) +{ + anon_down_write_nested((struct rw_anon_semaphore *)sem, subclass); +} #endif #endif /* _LINUX_RWSEM_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 78efe7c..a88462a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -102,6 +102,23 @@ struct fs_struct; struct bts_context; struct perf_event_context; +#ifdef CONFIG_PREEMPT +extern int kernel_preemption; +#else +# define kernel_preemption 0 +#endif +#ifdef CONFIG_PREEMPT_VOLUNTARY +extern int voluntary_preemption; +#else +# define voluntary_preemption 0 +#endif + +#ifdef CONFIG_PREEMPT_SOFTIRQS +extern int softirq_preemption; +#else +# define softirq_preemption 0 +#endif + /* * List of flags we want to share for kernel threads, * if only because they are not used by them anyway. 
@@ -170,9 +187,13 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) } #endif +extern struct mutex kernel_sem; + /* * Task state bitmask. NOTE! These bits are also - * encoded in fs/proc/array.c: get_task_state(). + * used in fs/proc/array.c: get_task_state() and + * in include/trace/events/sched.h in the + * sched_switch trace event. * * We have two separate sets of flags: task->state * is about runnability, while task->exit_state are @@ -181,20 +202,59 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) * mistake. */ #define TASK_RUNNING 0 -#define TASK_INTERRUPTIBLE 1 -#define TASK_UNINTERRUPTIBLE 2 -#define __TASK_STOPPED 4 -#define __TASK_TRACED 8 +#define TASK_STATE_0 "R" +#define DESCR_TASK_STATE_0 "running" + +#define TASK_RUNNING_MUTEX 1 +#define TASK_STATE_1 "M" +#define DESCR_TASK_STATE_1 "running-mutex" + +#define TASK_INTERRUPTIBLE 2 +#define TASK_STATE_2 "S" +#define DESCR_TASK_STATE_2 "sleeping" + +#define TASK_UNINTERRUPTIBLE 4 +#define TASK_STATE_4 "D" +#define DESCR_TASK_STATE_4 "disk sleep" + +#define __TASK_STOPPED 8 +#define TASK_STATE_8 "T" +#define DESCR_TASK_STATE_8 "stopped" + +#define __TASK_TRACED 16 +#define TASK_STATE_16 "t" +#define DESCR_TASK_STATE_16 "tracing stop" + /* in tsk->exit_state */ -#define EXIT_ZOMBIE 16 -#define EXIT_DEAD 32 +#define EXIT_ZOMBIE 32 +#define TASK_STATE_32 "Z" +#define DESCR_TASK_STATE_32 "zombie" + +#define EXIT_DEAD 64 +#define TASK_STATE_64 "X" +#define DESCR_TASK_STATE_64 "dead" + /* in tsk->state again */ -#define TASK_DEAD 64 -#define TASK_WAKEKILL 128 -#define TASK_WAKING 256 -#define TASK_STATE_MAX 512 +#define TASK_DEAD 128 +#define TASK_STATE_128 "x" +#define DESCR_TASK_STATE_128 "dead" + +#define TASK_WAKEKILL 256 +#define TASK_STATE_256 "K" +#define DESCR_TASK_STATE_256 "wakekill" + +#define TASK_WAKING 512 +#define TASK_STATE_512 "W" +#define DESCR_TASK_STATE_512 "waking" -#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW" +#define TASK_STATE_MAX 1024 + +#define 
TASK_STATE_TO_CHAR_STR \ + TASK_STATE_0 TASK_STATE_1 TASK_STATE_2 TASK_STATE_4 TASK_STATE_8 \ + TASK_STATE_16 TASK_STATE_32 TASK_STATE_64 TASK_STATE_128 TASK_STATE_256 \ + TASK_STATE_512 + +#define TASK_STATE_MAX 1024 extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; @@ -209,7 +269,8 @@ extern char ___assert_task_state[1 - 2*!!( #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) /* get_task_state() */ -#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ +#define TASK_REPORT (TASK_RUNNING | TASK_RUNNING_MUTEX | \ + TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ __TASK_TRACED) @@ -226,6 +287,12 @@ extern char ___assert_task_state[1 - 2*!!( #define set_task_state(tsk, state_value) \ set_mb((tsk)->state, (state_value)) +#ifdef CONFIG_X86_LOCAL_APIC +extern void nmi_show_all_regs(void); +#else +# define nmi_show_all_regs() do { } while (0) +#endif + /* * set_current_state() includes a barrier so that the write of current->state * is correctly serialised wrt the caller's subsequent test of whether to @@ -358,6 +425,11 @@ extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); +/* + * This one can be called with interrupts disabled, only + * to be used by lowlevel arch code! 
+ */ +asmlinkage void __sched __schedule(void); struct nsproxy; struct user_namespace; @@ -561,7 +633,7 @@ struct task_cputime { struct thread_group_cputimer { struct task_cputime cputime; int running; - spinlock_t lock; + raw_spinlock_t lock; }; /* @@ -878,7 +950,10 @@ static inline int sd_balance_for_mc_power(void) if (sched_smt_power_savings) return SD_POWERSAVINGS_BALANCE; - return SD_PREFER_SIBLING; + if (!sched_mc_power_savings) + return SD_PREFER_SIBLING; + + return 0; } static inline int sd_balance_for_package_power(void) @@ -1084,7 +1159,8 @@ struct sched_domain; struct sched_class { const struct sched_class *next; - void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup, + bool head); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); void (*yield_task) (struct rq *rq); @@ -1234,15 +1310,14 @@ struct task_struct { void *stack; atomic_t usage; unsigned int flags; /* per process flags, defined below */ + unsigned int extra_flags; unsigned int ptrace; int lock_depth; /* BKL lock depth */ #ifdef CONFIG_SMP -#ifdef __ARCH_WANT_UNLOCKED_CTXSW int oncpu; #endif -#endif int prio, static_prio, normal_prio; unsigned int rt_priority; @@ -1286,6 +1361,7 @@ struct task_struct { struct plist_node pushable_tasks; struct mm_struct *mm, *active_mm; + int pagefault_disabled; /* task state */ int exit_state; @@ -1360,6 +1436,8 @@ struct task_struct { struct task_cputime cputime_expires; struct list_head cpu_timers[3]; + struct task_struct* posix_timer_list; + /* process credentials */ const struct cred *real_cred; /* objective and real subjective task * credentials (COW) */ @@ -1395,6 +1473,7 @@ struct task_struct { /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; + struct sigqueue *sigqueue_cache; sigset_t blocked, real_blocked; sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ @@ -1462,6 +1541,26 @@ 
struct task_struct { gfp_t lockdep_reclaim_gfp; #endif +/* realtime bits */ + +#define MAX_PREEMPT_TRACE 25 +#define MAX_LOCK_STACK MAX_PREEMPT_TRACE +#ifdef CONFIG_DEBUG_PREEMPT + atomic_t lock_count; +# ifdef CONFIG_PREEMPT_RT + struct rt_mutex *owned_lock[MAX_LOCK_STACK]; +# endif +#endif +#ifdef CONFIG_DETECT_SOFTLOCKUP + unsigned long softlockup_count; /* Count to keep track how long the + * thread is in the kernel without + * sleeping. + */ +#endif +#ifdef CONFIG_DEBUG_RT_MUTEXES + void *last_kernel_lock; +#endif + /* journalling filesystem info */ void *journal_info; @@ -1556,6 +1655,9 @@ struct task_struct { unsigned long trace; /* bitmask of trace recursion */ unsigned long trace_recursion; +#ifdef CONFIG_WAKEUP_LATENCY_HIST + u64 preempt_timestamp_hist; +#endif #endif /* CONFIG_TRACING */ unsigned long stack_start; #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */ @@ -1566,11 +1668,24 @@ struct task_struct { unsigned long memsw_bytes; /* uncharged mem+swap usage */ } memcg_batch; #endif +#ifdef CONFIG_PREEMPT_RT + /* + * Temporary hack, until we find a solution to + * handle printk in atomic operations. + */ + int in_printk; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +#ifdef CONFIG_PREEMPT_RT +# define set_printk_might_sleep(x) do { current->in_printk = x; } while(0) +#else +# define set_printk_might_sleep(x) do { } while(0) +#endif + /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH @@ -1739,6 +1854,15 @@ extern struct pid *cad_pid; extern void free_task(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +#ifdef CONFIG_PREEMPT_RT +extern void __put_task_struct_cb(struct rcu_head *rhp); + +static inline void put_task_struct(struct task_struct *t) +{ + if (atomic_dec_and_test(&t->usage)) + call_rcu(&t->rcu, __put_task_struct_cb); +} +#else extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) @@ -1746,6 +1870,7 @@ static inline void put_task_struct(struct task_struct *t) if (atomic_dec_and_test(&t->usage)) __put_task_struct(t); } +#endif extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); @@ -1759,6 +1884,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_KMAP 0x00000020 /* this context has a kmap */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ @@ -1786,6 +1912,10 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ #define PF_FREEZER_NOSIG 0x80000000 /* Freezer won't send signals to it */ +/* Flags in the extra_flags 
field */ +#define PFE_SOFTIRQ 0x00000001 /* softirq context */ +#define PFE_HARDIRQ 0x00000002 /* hardirq thread */ + /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example @@ -1965,9 +2095,14 @@ int sched_rt_handler(struct ctl_table *table, int write, extern unsigned int sysctl_sched_compat_yield; +extern void task_setprio(struct task_struct *p, int prio); + #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); -extern void rt_mutex_setprio(struct task_struct *p, int prio); +static inline void rt_mutex_setprio(struct task_struct *p, int prio) +{ + task_setprio(p, prio); +} extern void rt_mutex_adjust_pi(struct task_struct *p); #else static inline int rt_mutex_getprio(struct task_struct *p) @@ -1989,8 +2124,17 @@ extern int sched_setscheduler_nocheck(struct task_struct *, int, extern struct task_struct *idle_task(int cpu); extern struct task_struct *curr_task(int cpu); extern void set_curr_task(int cpu, struct task_struct *p); +extern struct task_struct *rq_curr(struct rq *rq); void yield(void); +void __yield(void); + +#ifdef CONFIG_SMP +static inline int task_is_current(struct task_struct *task) +{ + return task->oncpu; +} +#endif /* * The default (Linux) execution domain. 
@@ -2052,6 +2196,9 @@ extern void do_timer(unsigned long ticks); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); +extern int wake_up_process_mutex(struct task_struct * tsk); +extern int wake_up_process_sync(struct task_struct * tsk); +extern int wake_up_process_mutex_sync(struct task_struct * tsk); extern void wake_up_new_task(struct task_struct *tsk, unsigned long clone_flags); #ifdef CONFIG_SMP @@ -2142,12 +2289,20 @@ extern struct mm_struct * mm_alloc(void); /* mmdrop drops the mm and the page tables */ extern void __mmdrop(struct mm_struct *); +extern void __mmdrop_delayed(struct mm_struct *); + static inline void mmdrop(struct mm_struct * mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } +static inline void mmdrop_delayed(struct mm_struct * mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + __mmdrop_delayed(mm); +} + /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); /* Grab a reference to a task's mm, if it is not already going away */ @@ -2415,7 +2570,7 @@ extern int _cond_resched(void); extern int __cond_resched_lock(spinlock_t *lock); -#ifdef CONFIG_PREEMPT +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RT) #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET #else #define PREEMPT_LOCK_OFFSET 0 @@ -2428,10 +2583,20 @@ extern int __cond_resched_lock(spinlock_t *lock); extern int __cond_resched_softirq(void); + +#ifndef CONFIG_PREEMPT_RT #define cond_resched_softirq() ({ \ __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \ __cond_resched_softirq(); \ }) +#else +#define cond_resched_softirq() ({ \ + __might_sleep(__FILE__, __LINE__, 0); \ + __cond_resched_softirq(); \ +}) +#endif + +extern int cond_resched_softirq_context(void); /* * Does a critical section need to be broken due to another @@ -2456,7 +2621,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); static inline void 
thread_group_cputime_init(struct signal_struct *sig) { sig->cputimer.cputime = INIT_CPUTIME; - spin_lock_init(&sig->cputimer.lock); + raw_spin_lock_init(&sig->cputimer.lock); sig->cputimer.running = 0; } @@ -2464,6 +2629,13 @@ static inline void thread_group_cputime_free(struct signal_struct *sig) { } +static inline int softirq_need_resched(void) +{ + if (softirq_preemption && (current->extra_flags & PFE_SOFTIRQ)) + return need_resched(); + return 0; +} + /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index 7415839..66d2591 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -14,20 +14,20 @@ /* Please don't access any members of this structure directly */ struct semaphore { - spinlock_t lock; + raw_spinlock_t lock; unsigned int count; struct list_head wait_list; }; #define __SEMAPHORE_INITIALIZER(name, n) \ { \ - .lock = __SPIN_LOCK_UNLOCKED((name).lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED((name).lock), \ .count = n, \ .wait_list = LIST_HEAD_INIT((name).wait_list), \ } -#define DECLARE_MUTEX(name) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1) +#define DEFINE_SEMAPHORE(name, val) \ + struct semaphore name = __SEMAPHORE_INITIALIZER(name, val) static inline void sema_init(struct semaphore *sem, int val) { @@ -36,9 +36,6 @@ static inline void sema_init(struct semaphore *sem, int val) lockdep_init_map(&sem->lock.dep_map, "semaphore->lock", &__key, 0); } -#define init_MUTEX(sem) sema_init(sem, 1) -#define init_MUTEX_LOCKED(sem) sema_init(sem, 0) - extern void down(struct semaphore *sem); extern int __must_check down_interruptible(struct semaphore *sem); extern int __must_check down_killable(struct semaphore *sem); diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 632205c..a6de405 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -3,9 +3,11 @@ /* * Reader/writer consistent mechanism without 
starving writers. This type of * lock for data where the reader wants a consistent set of information - * and is willing to retry if the information changes. Readers never - * block but they may have to retry if a writer is in - * progress. Writers do not wait for readers. + * and is willing to retry if the information changes. Readers block + * on write contention (and where applicable, pi-boost the writer). + * Readers without contention on entry acquire the critical section + * without any atomic operations, but they may have to retry if a writer + * enters before the critical section ends. Writers do not wait for readers. * * This is not as cache friendly as brlock. Also, this will not work * for data that contains pointers, because any writer could @@ -24,6 +26,8 @@ * * Based on x86_64 vsyscall gettimeofday * by Keith Owens and Andrea Arcangeli + * + * Priority inheritance and live-lock avoidance by Gregory Haskins */ #include <linux/spinlock.h> @@ -31,49 +35,80 @@ typedef struct { unsigned sequence; - spinlock_t lock; + raw_spinlock_t lock; +} raw_seqlock_t; + +typedef struct { + unsigned sequence; + rwlock_t lock; } seqlock_t; /* * These macros triggered gcc-3.x compile-time problems. We think these are * OK now. Be cautious. 
*/ +#define __RAW_SEQLOCK_UNLOCKED(lockname) \ + { 0, __RAW_SPIN_LOCK_UNLOCKED(lockname) } + +#define raw_seqlock_init(x) \ + do { \ + (x)->sequence = 0; \ + raw_spin_lock_init(&(x)->lock); \ + } while (0) + +#define DEFINE_RAW_SEQLOCK(x) \ + raw_seqlock_t x = __RAW_SEQLOCK_UNLOCKED(x) + #define __SEQLOCK_UNLOCKED(lockname) \ - { 0, __SPIN_LOCK_UNLOCKED(lockname) } + { 0, __RW_LOCK_UNLOCKED(lockname) } #define SEQLOCK_UNLOCKED \ - __SEQLOCK_UNLOCKED(old_style_seqlock_init) + __SEQLOCK_UNLOCKED(old_style_seqlock_init) #define seqlock_init(x) \ do { \ (x)->sequence = 0; \ - spin_lock_init(&(x)->lock); \ + rwlock_init(&(x)->lock); \ } while (0) #define DEFINE_SEQLOCK(x) \ - seqlock_t x = __SEQLOCK_UNLOCKED(x) + seqlock_t x = __SEQLOCK_UNLOCKED(x) /* Lock out other writers and update the count. * Acts like a normal spin_lock/unlock. * Don't need preempt_disable() because that is in the spin_lock already. */ +static inline void write_raw_seqlock(raw_seqlock_t *sl) +{ + raw_spin_lock(&sl->lock); + ++sl->sequence; + smp_wmb(); +} + static inline void write_seqlock(seqlock_t *sl) { - spin_lock(&sl->lock); + write_lock(&sl->lock); ++sl->sequence; smp_wmb(); } +static inline void write_raw_sequnlock(raw_seqlock_t *sl) +{ + smp_wmb(); + sl->sequence++; + raw_spin_unlock(&sl->lock); +} + static inline void write_sequnlock(seqlock_t *sl) { smp_wmb(); sl->sequence++; - spin_unlock(&sl->lock); + write_unlock(&sl->lock); } static inline int write_tryseqlock(seqlock_t *sl) { - int ret = spin_trylock(&sl->lock); + int ret = write_trylock(&sl->lock); if (ret) { ++sl->sequence; @@ -83,7 +118,7 @@ static inline int write_tryseqlock(seqlock_t *sl) } /* Start of read calculation -- fetch last complete writer token */ -static __always_inline unsigned read_seqbegin(const seqlock_t *sl) +static __always_inline unsigned read_raw_seqbegin(const raw_seqlock_t *sl) { unsigned ret; @@ -98,11 +133,42 @@ repeat: return ret; } +static __always_inline unsigned read_seqbegin(seqlock_t *sl) +{ + 
unsigned ret; + + ret = sl->sequence; + smp_rmb(); + if (unlikely(ret & 1)) { + cpu_relax(); + /* + * Serialize with the writer which will ensure they are + * pi-boosted if necessary and prevent us from starving + * them. + */ + read_lock(&sl->lock); + ret = sl->sequence; + read_unlock(&sl->lock); + } + + BUG_ON(ret & 1); + + return ret; +} + /* * Test if reader processed invalid data. * * If sequence value changed then writer changed data while in section. */ +static __always_inline int +read_raw_seqretry(const raw_seqlock_t *sl, unsigned start) +{ + smp_rmb(); + + return (sl->sequence != start); +} + static __always_inline int read_seqretry(const seqlock_t *sl, unsigned start) { smp_rmb(); @@ -170,12 +236,36 @@ static inline void write_seqcount_end(seqcount_t *s) /* * Possible sw/hw IRQ protected versions of the interfaces. */ +#define write_raw_seqlock_irqsave(lock, flags) \ + do { local_irq_save(flags); write_raw_seqlock(lock); } while (0) +#define write_raw_seqlock_irq(lock) \ + do { local_irq_disable(); write_raw_seqlock(lock); } while (0) +#define write_raw_seqlock_bh(lock) \ + do { local_bh_disable(); write_raw_seqlock(lock); } while (0) + +#define write_raw_sequnlock_irqrestore(lock, flags) \ + do { write_raw_sequnlock(lock); local_irq_restore(flags); } while(0) +#define write_raw_sequnlock_irq(lock) \ + do { write_raw_sequnlock(lock); local_irq_enable(); } while(0) +#define write_raw_sequnlock_bh(lock) \ + do { write_raw_sequnlock(lock); local_bh_enable(); } while(0) + +#define read_raw_seqbegin_irqsave(lock, flags) \ + ({ local_irq_save(flags); read_raw_seqbegin(lock); }) + +#define read_raw_seqretry_irqrestore(lock, iv, flags) \ + ({ \ + int ret = read_raw_seqretry(lock, iv); \ + local_irq_restore(flags); \ + ret; \ + }) + #define write_seqlock_irqsave(lock, flags) \ do { local_irq_save(flags); write_seqlock(lock); } while (0) #define write_seqlock_irq(lock) \ do { local_irq_disable(); write_seqlock(lock); } while (0) #define write_seqlock_bh(lock) \ - 
do { local_bh_disable(); write_seqlock(lock); } while (0) + do { local_bh_disable(); write_seqlock(lock); } while (0) #define write_sequnlock_irqrestore(lock, flags) \ do { write_sequnlock(lock); local_irq_restore(flags); } while(0) diff --git a/include/linux/signal.h b/include/linux/signal.h index ab9272c..0518172 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -225,6 +225,7 @@ static inline void init_sigpending(struct sigpending *sig) } extern void flush_sigqueue(struct sigpending *queue); +extern void flush_task_sigqueue(struct task_struct *tsk); /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */ static inline int valid_signal(unsigned long sig) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ae836fd..45552a4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -98,6 +98,9 @@ struct pipe_inode_info; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { atomic_t use; +#ifdef CONFIG_PREEMPT_RT + struct rcu_head rcu; +#endif }; #endif diff --git a/include/linux/smb_fs_sb.h b/include/linux/smb_fs_sb.h index 8a060a7..41c69a4 100644 --- a/include/linux/smb_fs_sb.h +++ b/include/linux/smb_fs_sb.h @@ -57,7 +57,7 @@ struct smb_sb_info { struct smb_conn_opt opt; wait_queue_head_t conn_wq; int conn_complete; - struct semaphore sem; + struct mutex mutex; unsigned char header[SMB_HEADER_LEN + 20*2 + 2]; u32 header_len; @@ -79,19 +79,19 @@ struct smb_sb_info { static inline int smb_lock_server_interruptible(struct smb_sb_info *server) { - return down_interruptible(&(server->sem)); + return mutex_lock_interruptible(&server->mutex); } static inline void smb_lock_server(struct smb_sb_info *server) { - down(&(server->sem)); + mutex_lock(&server->mutex); } static inline void smb_unlock_server(struct smb_sb_info *server) { - up(&(server->sem)); + mutex_unlock(&server->mutex); } #endif diff --git a/include/linux/smp.h b/include/linux/smp.h index 7a0570e..c55f2ca 
100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -50,6 +50,16 @@ extern void smp_send_stop(void); */ extern void smp_send_reschedule(int cpu); +/* + * trigger a reschedule on all other CPUs: + */ +extern void smp_send_reschedule_allbutself(void); + +/* + * trigger a reschedule on all other CPUs: + */ +extern void smp_send_reschedule_allbutself(void); + /* * Prepare machine for booting other CPUs. @@ -136,6 +146,7 @@ static inline int up_smp_call_function(void (*func)(void *), void *info) 0; \ }) static inline void smp_send_reschedule(int cpu) { } +static inline void smp_send_reschedule_allbutself(void) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) #define smp_call_function_many(mask, func, info, wait) \ diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 8608821..4186cb7 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -58,23 +58,6 @@ #include <asm/system.h> /* - * Must define these before including other files, inline functions need them - */ -#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME - -#define LOCK_SECTION_START(extra) \ - ".subsection 1\n\t" \ - extra \ - ".ifndef " LOCK_SECTION_NAME "\n\t" \ - LOCK_SECTION_NAME ":\n\t" \ - ".endif\n" - -#define LOCK_SECTION_END \ - ".previous\n\t" - -#define __lockfunc __attribute__((section(".spinlock.text"))) - -/* * Pull the arch_spinlock_t and arch_rwlock_t definitions: */ #include <linux/spinlock_types.h> @@ -261,6 +244,98 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) # include <linux/spinlock_api_up.h> #endif +#ifdef CONFIG_PREEMPT_RT + +#include <linux/rt_lock.h> + +#define spin_lock(lock) rt_spin_lock(lock) +#define spin_lock_bh(lock) rt_spin_lock(lock) + +#define spin_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock)) + +#ifdef CONFIG_LOCKDEP +# define spin_lock_nested(lock, subclass) \ + rt_spin_lock_nested(lock, subclass) + +# define spin_lock_irqsave_nested(lock, flags, subclass) \ +do { \ + 
typecheck(unsigned long, flags); \ + flags = 0; \ + rt_spin_lock_nested(lock, subclass); \ +} while (0) +#else +# define spin_lock_nested(lock, subclass) \ + rt_spin_lock(lock) + +# define spin_lock_irqsave_nested(lock, flags, subclass) \ +do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + rt_spin_lock(lock); \ +} while (0) +#endif + +#define spin_lock_irq(lock) rt_spin_lock(lock) + +#define spin_lock_irqsave(lock, flags) \ +do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + rt_spin_lock(lock); \ +} while (0) + +/* FIXME: we need rt_spin_lock_nested */ +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0) + +#define spin_unlock(lock) rt_spin_unlock(lock) +#define spin_unlock_bh(lock) rt_spin_unlock(lock) +#define spin_unlock_irq(lock) rt_spin_unlock(lock) + +#define spin_unlock_irqrestore(lock, flags) \ +do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + rt_spin_unlock(lock); \ +} while (0) + +#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock(lock)) +#define spin_trylock_irq(lock) __cond_lock(lock, rt_spin_trylock(lock)) + +#define spin_trylock_irqsave(lock, flags) \ +({ \ + typecheck(unsigned long, flags); \ + flags = 0; \ + __cond_lock(lock, rt_spin_trylock(lock)); \ +}) + +#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock) + +#ifdef CONFIG_GENERIC_LOCKBREAK +# define spin_is_contended(lock) ((lock)->break_lock) +#else +# define spin_is_contended(lock) (((void)(lock), 0)) +#endif + +static inline int spin_can_locked(spinlock_t *lock) +{ + return !rt_mutex_is_locked(&lock->lock); +} + +static inline int spin_is_locked(spinlock_t *lock) +{ + return rt_mutex_is_locked(&lock->lock); +} + +static inline void assert_spin_locked(spinlock_t *lock) +{ + BUG_ON(!spin_is_locked(lock)); +} + +#define atomic_dec_and_lock(atomic, lock) \ + atomic_dec_and_spin_lock(atomic, lock) + +#else + /* * Map the spin_lock functions to the raw variants for PREEMPT_RT=n */ @@ -393,4 +468,6 @@ extern int 
_atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); #define atomic_dec_and_lock(atomic, lock) \ __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) +#endif /* !PREEMPT_RT */ + #endif /* __LINUX_SPINLOCK_H */ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index e253ccd..4586f95 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -174,7 +174,7 @@ static inline void __raw_spin_unlock_bh(raw_spinlock_t *lock) { spin_release(&lock->dep_map, 1, _RET_IP_); do_raw_spin_unlock(lock); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } @@ -186,11 +186,13 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); return 1; } - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); return 0; } +#ifndef CONFIG_PREEMPT_RT #include <linux/rwlock_api_smp.h> +#endif #endif /* __LINUX_SPINLOCK_API_SMP_H */ diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index af1f472..d05112d 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -40,7 +40,7 @@ do { preempt_enable(); __release(lock); (void)(lock); } while (0) #define __UNLOCK_BH(lock) \ - do { preempt_enable_no_resched(); local_bh_enable(); \ + do { __preempt_enable_no_resched(); local_bh_enable(); \ __release(lock); (void)(lock); } while (0) #define __UNLOCK_IRQ(lock) \ diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 851b778..b875516 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -9,6 +9,23 @@ * Released under the General Public License (GPL). 
*/ +/* + * Must define these before including other files, inline functions need them + */ +#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME + +#define LOCK_SECTION_START(extra) \ + ".subsection 1\n\t" \ + extra \ + ".ifndef " LOCK_SECTION_NAME "\n\t" \ + LOCK_SECTION_NAME ":\n\t" \ + ".endif\n" + +#define LOCK_SECTION_END \ + ".previous\n\t" + +#define __lockfunc __attribute__((section(".spinlock.text"))) + #if defined(CONFIG_SMP) # include <asm/spinlock_types.h> #else @@ -61,6 +78,8 @@ typedef struct raw_spinlock { #define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x) +#ifndef CONFIG_PREEMPT_RT + typedef struct spinlock { union { struct raw_spinlock rlock; @@ -90,7 +109,10 @@ typedef struct spinlock { #define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(old_style_spin_init) #define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) +#define __DEFINE_SPINLOCK(x) DEFINE_SPINLOCK(x) #include <linux/rwlock_types.h> +#endif /* !PREEMPT_RT */ + #endif /* __LINUX_SPINLOCK_TYPES_H */ diff --git a/include/linux/time.h b/include/linux/time.h index 6e026e4..49278a9 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -99,7 +99,7 @@ static inline struct timespec timespec_sub(struct timespec lhs, extern struct timespec xtime; extern struct timespec wall_to_monotonic; -extern seqlock_t xtime_lock; +extern raw_seqlock_t xtime_lock; extern void read_persistent_clock(struct timespec *ts); extern void read_boot_clock(struct timespec *ts); diff --git a/include/linux/timer.h b/include/linux/timer.h index a2d1eb6..af92a91 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -225,10 +225,12 @@ static inline void timer_stats_timer_clear_start_info(struct timer_list *timer) extern void add_timer(struct timer_list *timer); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) + extern int timer_pending_sync(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); extern int 
del_timer_sync(struct timer_list *timer); #else +# define timer_pending_sync(t) timer_pending(t) # define try_to_del_timer_sync(t) del_timer(t) # define del_timer_sync(t) del_timer(t) #endif diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index d512d98..9c7e38a 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -6,37 +6,10 @@ /* * These routines enable/disable the pagefault handler in that - * it will not take any locks and go straight to the fixup table. - * - * They have great resemblance to the preempt_disable/enable calls - * and in fact they are identical; this is because currently there is - * no other way to make the pagefault handlers do this. So we do - * disable preemption but we don't necessarily care about that. + * it will not take any MM locks and go straight to the fixup table. */ -static inline void pagefault_disable(void) -{ - inc_preempt_count(); - /* - * make sure to have issued the store before a pagefault - * can hit. - */ - barrier(); -} - -static inline void pagefault_enable(void) -{ - /* - * make sure to issue those last loads/stores before enabling - * the pagefault handler again. - */ - barrier(); - dec_preempt_count(); - /* - * make sure we do.. 
- */ - barrier(); - preempt_check_resched(); -} +extern void pagefault_disable(void); +extern void pagefault_enable(void); #ifndef ARCH_HAS_NOCACHE_UACCESS diff --git a/include/linux/usb.h b/include/linux/usb.h index d7ace1b..44410e0 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -527,9 +527,9 @@ extern struct usb_device *usb_get_dev(struct usb_device *dev); extern void usb_put_dev(struct usb_device *dev); /* USB device locking */ -#define usb_lock_device(udev) down(&(udev)->dev.sem) -#define usb_unlock_device(udev) up(&(udev)->dev.sem) -#define usb_trylock_device(udev) down_trylock(&(udev)->dev.sem) +#define usb_lock_device(udev) mutex_lock(&(udev)->dev.mutex) +#define usb_unlock_device(udev) mutex_unlock(&(udev)->dev.mutex) +#define usb_trylock_device(udev) mutex_trylock(&(udev)->dev.mutex) extern int usb_lock_device_for_reset(struct usb_device *udev, const struct usb_interface *iface); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ee03bba..60543ed 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -78,7 +78,12 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states); static inline void __count_vm_event(enum vm_event_item item) { +#ifdef CONFIG_PREEMPT_RT + get_cpu_var(vm_event_states).event[item]++; + put_cpu(); +#else __this_cpu_inc(per_cpu_var(vm_event_states).event[item]); +#endif } static inline void count_vm_event(enum vm_event_item item) @@ -88,7 +93,12 @@ static inline void count_vm_event(enum vm_event_item item) static inline void __count_vm_events(enum vm_event_item item, long delta) { +#ifdef CONFIG_PREEMPT_RT + get_cpu_var(vm_event_states).event[item] += delta; + put_cpu(); +#else __this_cpu_add(per_cpu_var(vm_event_states).event[item], delta); +#endif } static inline void count_vm_events(enum vm_event_item item, long delta) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 9466e86..ec218ce 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -211,6 
+211,9 @@ __create_workqueue_key(const char *name, int singlethread, #define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1, 0) #define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0, 0) +extern void set_workqueue_prio(struct workqueue_struct *wq, int policy, + int rt_priority, int nice); + extern void destroy_workqueue(struct workqueue_struct *wq); extern int queue_work(struct workqueue_struct *wq, struct work_struct *work); diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h new file mode 100644 index 0000000..28646db --- /dev/null +++ b/include/trace/events/hist.h @@ -0,0 +1,69 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hist + +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HIST_H + +#include "latency_hist.h" +#include <linux/tracepoint.h> + +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST) +#define trace_preemptirqsoff_hist(a,b) +#else +TRACE_EVENT(preemptirqsoff_hist, + + TP_PROTO(int reason, int starthist), + + TP_ARGS(reason, starthist), + + TP_STRUCT__entry( + __field(int, reason ) + __field(int, starthist ) + ), + + TP_fast_assign( + __entry->reason = reason; + __entry->starthist = starthist; + ), + + TP_printk("reason=%s starthist=%s", getaction(__entry->reason), + __entry->starthist ? 
"start" : "stop") +); +#endif + +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST +#define trace_hrtimer_interrupt(a,b,c,d) +#else +TRACE_EVENT(hrtimer_interrupt, + + TP_PROTO(int cpu, long long offset, struct task_struct *curr, struct task_struct *task), + + TP_ARGS(cpu, offset, curr, task), + + TP_STRUCT__entry( + __field(int, cpu ) + __field(long long, offset ) + __array(char, ccomm, TASK_COMM_LEN) + __field(int, cprio ) + __array(char, tcomm, TASK_COMM_LEN) + __field(int, tprio ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->offset = offset; + memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN); + __entry->cprio = curr->prio; + memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>", task != NULL ? TASK_COMM_LEN : 7); + __entry->tprio = task != NULL ? task->prio : -1; + ), + + TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]", + __entry->cpu, __entry->offset, __entry->ccomm, __entry->cprio, __entry->tcomm, __entry->tprio) +); +#endif + +#endif /* _TRACE_HIST_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h new file mode 100644 index 0000000..d6b5d77 --- /dev/null +++ b/include/trace/events/latency_hist.h @@ -0,0 +1,30 @@ +#ifndef _LATENCY_HIST_H +#define _LATENCY_HIST_H + +enum hist_action { + IRQS_ON, + PREEMPT_ON, + TRACE_STOP, + IRQS_OFF, + PREEMPT_OFF, + TRACE_START, +}; + +static char *actions[] = { + "IRQS_ON", + "PREEMPT_ON", + "TRACE_STOP", + "IRQS_OFF", + "PREEMPT_OFF", + "TRACE_START", +}; + +static inline char *getaction(int action) +{ + if (action >= 0 && action < sizeof(actions)/sizeof(actions[0])) /* valid indices are 0..N-1 */ + return(actions[action]); + return("unknown"); +} + +#endif /* _LATENCY_HIST_H */ + diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index cfceb0b..ad2ae91 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -161,9 +161,12 @@ TRACE_EVENT(sched_switch, 
__entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state ? __print_flags(__entry->prev_state, "|", - { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, - { 16, "Z" }, { 32, "X" }, { 64, "x" }, - { 128, "W" }) : "R", + { 1, TASK_STATE_1} , { 2, TASK_STATE_2 }, + { 4, TASK_STATE_4 }, { 8, TASK_STATE_8 }, + { 16, TASK_STATE_16 }, { 32, TASK_STATE_32 }, + { 64, TASK_STATE_64 }, { 128, TASK_STATE_128 }, + { 256, TASK_STATE_256 }, { 512, TASK_STATE_512 } + ) : TASK_STATE_0, __entry->next_comm, __entry->next_pid, __entry->next_prio) ); @@ -235,6 +238,37 @@ DEFINE_EVENT(sched_process_template, sched_process_exit, TP_ARGS(p)); /* + * Tracepoint for priority boosting/deboosting of a task: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_task_setprio, + + TP_PROTO(struct rq *rq, struct task_struct *p, int oldprio), + + TP_ARGS(rq, p, oldprio), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, oldprio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->oldprio = oldprio; + ), + + TP_printk("task %s:%d [%d] oldprio=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->oldprio) +); + +/* * Tracepoint for a waiting task: */ TRACE_EVENT(sched_process_wait, diff --git a/init/Kconfig b/init/Kconfig index d95ca7c..5cc0442 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -328,6 +328,7 @@ choice config TREE_RCU bool "Tree-based hierarchical RCU" + depends on !PREEMPT_RT help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -346,7 +347,7 @@ config TREE_PREEMPT_RCU config TINY_RCU bool "UP-only small-memory-footprint RCU" - depends on !SMP + depends on !SMP && !PREEMPT_RT help This option selects the RCU implementation that is designed for UP systems from 
which real-time response @@ -1071,6 +1072,7 @@ config SLAB config SLUB bool "SLUB (Unqueued Allocator)" + depends on !PREEMPT_RT help SLUB is a slab allocator that minimizes cache line usage instead of managing queues of cached objects (SLAB approach). diff --git a/init/Makefile b/init/Makefile index 0bf677a..dc276e4 100644 --- a/init/Makefile +++ b/init/Makefile @@ -29,4 +29,5 @@ silent_chk_compile.h = : include/generated/compile.h: FORCE @$($(quiet)chk_compile.h) $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \ - "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)" + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT)" \ + "$(CC) $(KBUILD_CFLAGS)" diff --git a/init/main.c b/init/main.c index 4cb47a1..aa48b7b 100644 --- a/init/main.c +++ b/init/main.c @@ -36,6 +36,7 @@ #include <linux/workqueue.h> #include <linux/profile.h> #include <linux/rcupdate.h> +#include <linux/posix-timers.h> #include <linux/moduleparam.h> #include <linux/kallsyms.h> #include <linux/writeback.h> @@ -412,6 +413,8 @@ static noinline void __init_refok rest_init(void) { int pid; + system_state = SYSTEM_BOOTING_SCHEDULER_OK; + rcu_scheduler_starting(); kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); numa_default_policy(); @@ -424,8 +427,7 @@ static noinline void __init_refok rest_init(void) * at least once to get things moving: */ init_idle_bootup_task(current); - preempt_enable_no_resched(); - schedule(); + preempt_enable_and_schedule(); preempt_disable(); /* Call into cpu_idle with preempt disabled */ @@ -621,7 +623,16 @@ asmlinkage void __init start_kernel(void) * to self-test [hard/soft]-irqs on/off lock inversion bugs * too: */ - locking_selftest(); + if (1) { + /* + * Hack around the fact that locking_selftest() destroys + * the lockdep state, so release the one known lock and + * acquire it again after the self-test is done. 
+ */ + mutex_release(&kernel_sem.dep_map, 1, _THIS_IP_); + locking_selftest(); + mutex_acquire(&kernel_sem.dep_map, 0, 0, _THIS_IP_); + } #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && !initrd_below_start_ok && @@ -678,6 +689,9 @@ asmlinkage void __init start_kernel(void) ftrace_init(); +#ifdef CONFIG_PREEMPT_RT + WARN_ON(irqs_disabled()); +#endif /* Do the rest non-__init'ed, we're now alive */ rest_init(); } @@ -781,9 +795,11 @@ static void __init do_basic_setup(void) static void __init do_pre_smp_initcalls(void) { initcall_t *fn; + extern int spawn_desched_task(void); for (fn = __initcall_start; fn < __early_initcall_end; fn++) do_one_initcall(*fn); + spawn_desched_task(); } static void run_init_process(char *init_filename) @@ -819,6 +835,9 @@ static noinline int init_post(void) printk(KERN_WARNING "Failed to execute %s\n", ramdisk_execute_command); } +#ifdef CONFIG_PREEMPT_RT + WARN_ON(irqs_disabled()); +#endif /* * We try each of these until one succeeds. @@ -885,7 +904,60 @@ static int __init kernel_init(void * unused) ramdisk_execute_command = NULL; prepare_namespace(); } +#ifdef CONFIG_PREEMPT_RT + WARN_ON(irqs_disabled()); +#endif + +#define DEBUG_COUNT (defined(CONFIG_DEBUG_RT_MUTEXES) + defined(CONFIG_IRQSOFF_TRACER) + defined(CONFIG_PREEMPT_TRACER) + defined(CONFIG_STACK_TRACER) + defined(CONFIG_INTERRUPT_OFF_HIST) + defined(CONFIG_PREEMPT_OFF_HIST) + defined(CONFIG_WAKEUP_LATENCY_HIST) + defined(CONFIG_DEBUG_SLAB) + defined(CONFIG_DEBUG_PAGEALLOC) + defined(CONFIG_LOCKDEP) + (defined(CONFIG_FTRACE) - defined(CONFIG_FTRACE_MCOUNT_RECORD))) +#if DEBUG_COUNT > 0 + printk(KERN_ERR "*****************************************************************************\n"); + printk(KERN_ERR "* *\n"); +#if DEBUG_COUNT == 1 + printk(KERN_ERR "* REMINDER, the following debugging option is turned on in your .config: *\n"); +#else + printk(KERN_ERR "* REMINDER, the following debugging options are turned on in your .config: *\n"); +#endif + printk(KERN_ERR "* *\n"); 
+#ifdef CONFIG_DEBUG_RT_MUTEXES + printk(KERN_ERR "* CONFIG_DEBUG_RT_MUTEXES *\n"); +#endif +#ifdef CONFIG_IRQSOFF_TRACER + printk(KERN_ERR "* CONFIG_IRQSOFF_TRACER *\n"); +#endif +#ifdef CONFIG_PREEMPT_TRACER + printk(KERN_ERR "* CONFIG_PREEMPT_TRACER *\n"); +#endif +#if defined(CONFIG_FTRACE) && !defined(CONFIG_FTRACE_MCOUNT_RECORD) + printk(KERN_ERR "* CONFIG_FTRACE *\n"); +#endif +#ifdef CONFIG_INTERRUPT_OFF_HIST + printk(KERN_ERR "* CONFIG_INTERRUPT_OFF_HIST *\n"); +#endif +#ifdef CONFIG_PREEMPT_OFF_HIST + printk(KERN_ERR "* CONFIG_PREEMPT_OFF_HIST *\n"); +#endif +#ifdef CONFIG_WAKEUP_LATENCY_HIST + printk(KERN_ERR "* CONFIG_WAKEUP_LATENCY_HIST *\n"); +#endif +#ifdef CONFIG_DEBUG_SLAB + printk(KERN_ERR "* CONFIG_DEBUG_SLAB *\n"); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk(KERN_ERR "* CONFIG_DEBUG_PAGEALLOC *\n"); +#endif +#ifdef CONFIG_LOCKDEP + printk(KERN_ERR "* CONFIG_LOCKDEP *\n"); +#endif + printk(KERN_ERR "* *\n"); +#if DEBUG_COUNT == 1 + printk(KERN_ERR "* it may increase runtime overhead and latencies. *\n"); +#else + printk(KERN_ERR "* they may increase runtime overhead and latencies. *\n"); +#endif + printk(KERN_ERR "* *\n"); + printk(KERN_ERR "*****************************************************************************\n"); +#endif /* * Ok, we have completed the initial bootup, and * we're essentially up and running. 
Get rid of the diff --git a/ipc/mqueue.c b/ipc/mqueue.c index c79bd57..5bcb571 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -818,12 +818,17 @@ static inline void pipelined_send(struct mqueue_inode_info *info, struct msg_msg *message, struct ext_wait_queue *receiver) { + /* + * Keep them in one critical section for PREEMPT_RT: + */ + preempt_disable_rt(); receiver->msg = message; list_del(&receiver->list); receiver->state = STATE_PENDING; wake_up_process(receiver->task); smp_wmb(); receiver->state = STATE_READY; + preempt_enable_rt(); } /* pipelined_receive() - if there is task waiting in sys_mq_timedsend() diff --git a/ipc/msg.c b/ipc/msg.c index af42ef8..b1b796d 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -260,12 +260,20 @@ static void expunge_all(struct msg_queue *msq, int res) while (tmp != &msq->q_receivers) { struct msg_receiver *msr; + /* + * Make sure that the wakeup doesnt preempt + * this CPU prematurely. (on PREEMPT_RT) + */ + preempt_disable_rt(); + msr = list_entry(tmp, struct msg_receiver, r_list); tmp = tmp->next; msr->r_msg = NULL; wake_up_process(msr->r_tsk); smp_mb(); msr->r_msg = ERR_PTR(res); + + preempt_enable_rt(); } } @@ -612,6 +620,12 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg) !security_msg_queue_msgrcv(msq, msg, msr->r_tsk, msr->r_msgtype, msr->r_mode)) { + /* + * Make sure that the wakeup doesnt preempt + * this CPU prematurely. 
(on PREEMPT_RT) + */ + preempt_disable_rt(); + list_del(&msr->r_list); if (msr->r_maxsize < msg->m_ts) { msr->r_msg = NULL; @@ -625,9 +639,11 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg) wake_up_process(msr->r_tsk); smp_mb(); msr->r_msg = msg; + preempt_enable_rt(); return 1; } + preempt_enable_rt(); } } return 0; diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index bf987b9..f4602f8 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,14 +1,13 @@ - choice - prompt "Preemption Model" - default PREEMPT_NONE + prompt "Preemption Mode" + default PREEMPT_RT config PREEMPT_NONE bool "No Forced Preemption (Server)" help - This is the traditional Linux preemption model, geared towards + This is the traditional Linux preemption model geared towards throughput. It will still provide good latencies most of the - time, but there are no guarantees and occasional longer delays + time but there are no guarantees and occasional long delays are possible. Select this option if you are building a kernel for a server or @@ -21,7 +20,7 @@ config PREEMPT_VOLUNTARY help This option reduces the latency of the kernel by adding more "explicit preemption points" to the kernel code. These new - preemption points have been selected to reduce the maximum + preemption points have been selected to minimize the maximum latency of rescheduling, providing faster application reactions, at the cost of slightly lower throughput. @@ -33,22 +32,91 @@ config PREEMPT_VOLUNTARY Select this if you are building a kernel for a desktop system. -config PREEMPT +config PREEMPT_DESKTOP bool "Preemptible Kernel (Low-Latency Desktop)" help This option reduces the latency of the kernel by making - all kernel code (that is not executing in a critical section) + all kernel code that is not executing in a critical section preemptible. 
This allows reaction to interactive events by permitting a low priority process to be preempted involuntarily even if it is in kernel mode executing a system call and would - otherwise not be about to reach a natural preemption point. - This allows applications to run more 'smoothly' even when the - system is under load, at the cost of slightly lower throughput - and a slight runtime overhead to kernel code. + otherwise not be about to reach a preemption point. This allows + applications to run more 'smoothly' even when the system is + under load, at the cost of slightly lower throughput and a + slight runtime overhead to kernel code. + + (According to profiles, when this mode is selected then even + during kernel-intense workloads the system is in an immediately + preemptible state more than 50% of the time.) Select this if you are building a kernel for a desktop or embedded system with latency requirements in the milliseconds range. +config PREEMPT_RT + bool "Complete Preemption (Real-Time)" + select PREEMPT_SOFTIRQS + select PREEMPT_HARDIRQS + select PREEMPT_RCU + select RT_MUTEXES + help + This option further reduces the scheduling latency of the + kernel by replacing almost every spinlock used by the kernel + with preemptible mutexes and thus making all but the most + critical kernel code involuntarily preemptible. The remaining + handful of lowlevel non-preemptible codepaths are short and + have a deterministic latency of a couple of tens of + microseconds (depending on the hardware). This also allows + applications to run more 'smoothly' even when the system is + under load, at the cost of lower throughput and runtime + overhead to kernel code. + + (According to profiles, when this mode is selected then even + during kernel-intense workloads the system is in an immediately + preemptible state more than 95% of the time.) 
+ + Select this if you are building a kernel for a desktop, + embedded or real-time system with guaranteed latency + requirements of 100 usecs or lower. + endchoice +config PREEMPT + bool + default y + depends on PREEMPT_DESKTOP || PREEMPT_RT + +config PREEMPT_SOFTIRQS + bool "Thread Softirqs" + default n +# depends on PREEMPT + help + This option reduces the latency of the kernel by 'threading' + soft interrupts. This means that all softirqs will execute + in softirqd's context. While this helps latency, it can also + reduce performance. + + The threading of softirqs can also be controlled via the + /proc/sys/kernel/softirq_preemption runtime flag and the + softirq-preempt=0/1 boot-time option. + + Say N if you are unsure. + +config PREEMPT_HARDIRQS + bool "Thread Hardirqs" + default n + depends on GENERIC_HARDIRQS_NO__DO_IRQ + select PREEMPT_SOFTIRQS + help + This option reduces the latency of the kernel by 'threading' + hardirqs. This means that all (or selected) hardirqs will run + in their own kernel thread context. While this helps latency, + this feature can also reduce performance. + + The threading of hardirqs can also be controlled via the + /proc/sys/kernel/hardirq_preemption runtime flag and the + hardirq-preempt=0/1 boot-time option. Per-irq threading can + be enabled/disabled via the /proc/irq/<IRQ>/<handler>/threaded + runtime flags. + + Say N if you are unsure. 
diff --git a/kernel/Makefile b/kernel/Makefile index 864ff75..2a60a05 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ async.o @@ -29,7 +29,10 @@ obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ +ifneq ($(CONFIG_PREEMPT_RT),y) +obj-y += mutex.o obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +endif obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o @@ -41,6 +44,7 @@ endif obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o +obj-$(CONFIG_PREEMPT_RT) += rt.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o ifneq ($(CONFIG_SMP),y) diff --git a/kernel/audit.c b/kernel/audit.c index 5feed23..8ccf8d5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -880,40 +880,40 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_TTY_GET: { struct audit_tty_status s; struct task_struct *tsk; + unsigned long flags; - read_lock(&tasklist_lock); + rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk) - err = -ESRCH; - else { - spin_lock_irq(&tsk->sighand->siglock); + if (tsk && lock_task_sighand(tsk, &flags)) { s.enabled = tsk->signal->audit_tty != 0; - spin_unlock_irq(&tsk->sighand->siglock); - } - read_unlock(&tasklist_lock); - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, - &s, sizeof(s)); + unlock_task_sighand(tsk, 
&flags); + } else + err = -ESRCH; + rcu_read_unlock(); + + if (!err) + audit_send_reply(NETLINK_CB(skb).pid, seq, + AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status *s; struct task_struct *tsk; + unsigned long flags; if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) return -EINVAL; s = data; if (s->enabled != 0 && s->enabled != 1) return -EINVAL; - read_lock(&tasklist_lock); + rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk) - err = -ESRCH; - else { - spin_lock_irq(&tsk->sighand->siglock); + if (tsk && lock_task_sighand(tsk, &flags)) { tsk->signal->audit_tty = s->enabled != 0; - spin_unlock_irq(&tsk->sighand->siglock); - } - read_unlock(&tasklist_lock); + unlock_task_sighand(tsk, &flags); + } else + err = -ESRCH; + rcu_read_unlock(); break; } default: diff --git a/kernel/capability.c b/kernel/capability.c index 7f876e6..9e4697e 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -135,7 +135,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, if (pid && (pid != task_pid_vnr(current))) { struct task_struct *target; - read_lock(&tasklist_lock); + rcu_read_lock(); target = find_task_by_vpid(pid); if (!target) @@ -143,7 +143,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, else ret = security_capget(target, pEp, pIp, pPp); - read_unlock(&tasklist_lock); + rcu_read_unlock(); } else ret = security_capget(current, pEp, pIp, pPp); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index aa3bee5..11a8b34 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -204,7 +204,7 @@ list_for_each_entry(_root, &roots, root_list) /* the list of cgroups eligible for automatic release. 
Protected by * release_list_lock */ static LIST_HEAD(release_list); -static DEFINE_SPINLOCK(release_list_lock); +static DEFINE_RAW_SPINLOCK(release_list_lock); static void cgroup_release_agent(struct work_struct *work); static DECLARE_WORK(release_agent_work, cgroup_release_agent); static void check_for_release(struct cgroup *cgrp); @@ -3153,11 +3153,11 @@ again: finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) list_del(&cgrp->release_list); - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ @@ -3693,13 +3693,13 @@ static void check_for_release(struct cgroup *cgrp) * already queued for a userspace notification, queue * it now */ int need_schedule_work = 0; - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); if (!cgroup_is_removed(cgrp) && list_empty(&cgrp->release_list)) { list_add(&cgrp->release_list, &release_list); need_schedule_work = 1; } - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); if (need_schedule_work) schedule_work(&release_agent_work); } @@ -3749,7 +3749,7 @@ static void cgroup_release_agent(struct work_struct *work) { BUG_ON(work != &release_agent_work); mutex_lock(&cgroup_mutex); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); while (!list_empty(&release_list)) { char *argv[3], *envp[3]; int i; @@ -3758,7 +3758,7 @@ static void cgroup_release_agent(struct work_struct *work) struct cgroup, release_list); list_del_init(&cgrp->release_list); - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!pathbuf) goto continue_free; @@ -3788,9 +3788,9 @@ static void cgroup_release_agent(struct work_struct *work) continue_free: kfree(pathbuf); 
kfree(agentbuf); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); } - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); mutex_unlock(&cgroup_mutex); } diff --git a/kernel/exit.c b/kernel/exit.c index 546774a..1295e09 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -69,7 +69,9 @@ static void __unhash_process(struct task_struct *p) list_del_rcu(&p->tasks); list_del_init(&p->sibling); + preempt_disable(); __get_cpu_var(process_counts)--; + preempt_enable(); } list_del_rcu(&p->thread_group); } @@ -131,7 +133,7 @@ static void __exit_signal(struct task_struct *tsk) * Do this under ->siglock, we can race with another thread * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. */ - flush_sigqueue(&tsk->pending); + flush_task_sigqueue(tsk); tsk->signal = NULL; tsk->sighand = NULL; @@ -686,9 +688,11 @@ static void exit_mm(struct task_struct * tsk) task_lock(tsk); tsk->mm = NULL; up_read(&mm->mmap_sem); + preempt_disable(); // FIXME enter_lazy_tlb(mm, current); /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); + preempt_enable(); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); @@ -1493,6 +1497,9 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { int ret = eligible_child(wo, p); + + BUG_ON(!atomic_read(&p->usage)); + if (!ret) return ret; diff --git a/kernel/fork.c b/kernel/fork.c index f88bd98..340cbc8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -38,6 +38,7 @@ #include <linux/syscalls.h> #include <linux/jiffies.h> #include <linux/tracehook.h> +#include <linux/interrupt.h> #include <linux/futex.h> #include <linux/compat.h> #include <linux/task_io_accounting_ops.h> @@ -65,6 +66,8 @@ #include <linux/perf_event.h> #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> +#include <linux/kthread.h> +#include <linux/notifier.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -85,7 +88,19 @@ int max_threads; /* tunable limit on 
nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; +#ifdef CONFIG_PREEMPT_RT +DEFINE_RWLOCK(tasklist_lock); /* outer */ +#else __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +#endif + +/* + * Delayed mmdrop. In the PREEMPT_RT case we + * dont want to do this from the scheduling + * context. + */ +static DEFINE_PER_CPU(struct task_struct *, desched_task); +static DEFINE_PER_CPU(struct list_head, delayed_drop_list); int nr_processes(void) { @@ -170,6 +185,16 @@ void __put_task_struct(struct task_struct *tsk) free_task(tsk); } +#ifdef CONFIG_PREEMPT_RT +void __put_task_struct_cb(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + __put_task_struct(tsk); + +} +#endif + /* * macro override instead of weak attribute alias, to workaround * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. @@ -180,6 +205,8 @@ void __put_task_struct(struct task_struct *tsk) void __init fork_init(unsigned long mempages) { + int i; + #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES @@ -210,6 +237,9 @@ void __init fork_init(unsigned long mempages) init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; + + for (i = 0; i < NR_CPUS; i++) + INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i)); } int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, @@ -295,6 +325,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; + INIT_LIST_HEAD(&mm->delayed_drop); mm->free_area_cache = oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; @@ -943,6 +974,9 @@ static void rt_mutex_init_task(struct task_struct *p) #ifdef CONFIG_RT_MUTEXES plist_head_init_raw(&p->pi_waiters, &p->pi_lock); p->pi_blocked_on = NULL; +# ifdef CONFIG_DEBUG_RT_MUTEXES + p->last_kernel_lock = 
NULL; +# endif #endif } @@ -1065,6 +1099,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); + p->sigqueue_cache = NULL; p->utime = cputime_zero; p->stime = cputime_zero; @@ -1082,7 +1117,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, acct_clear_integrals(p); posix_cpu_timers_init(p); - + p->posix_timer_list = NULL; p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; @@ -1118,6 +1153,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->hardirq_context = 0; p->softirq_context = 0; #endif + p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; @@ -1165,6 +1201,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_io; +#ifdef CONFIG_DEBUG_PREEMPT + atomic_set(&p->lock_count, 0); +#endif if (pid != &init_struct_pid) { retval = -ENOMEM; @@ -1289,7 +1328,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); + preempt_disable(); __get_cpu_var(process_counts)++; + preempt_enable(); } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; @@ -1763,3 +1804,141 @@ int unshare_files(struct files_struct **displaced) task_unlock(task); return 0; } + +static int mmdrop_complete(void) +{ + struct list_head *head; + int ret = 0; + + head = &get_cpu_var(delayed_drop_list); + while (!list_empty(head)) { + struct mm_struct *mm = list_entry(head->next, + struct mm_struct, delayed_drop); + list_del(&mm->delayed_drop); + put_cpu_var(delayed_drop_list); + + __mmdrop(mm); + ret = 1; + + head = &get_cpu_var(delayed_drop_list); + } + 
put_cpu_var(delayed_drop_list); + + return ret; +} + +/* + * We dont want to do complex work from the scheduler, thus + * we delay the work to a per-CPU worker thread: + */ +void __mmdrop_delayed(struct mm_struct *mm) +{ + struct task_struct *desched_task; + struct list_head *head; + + head = &get_cpu_var(delayed_drop_list); + list_add_tail(&mm->delayed_drop, head); + desched_task = __get_cpu_var(desched_task); + if (desched_task) + wake_up_process(desched_task); + put_cpu_var(delayed_drop_list); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void takeover_delayed_drop(int hotcpu) +{ + struct list_head *head = &per_cpu(delayed_drop_list, hotcpu); + + while (!list_empty(head)) { + struct mm_struct *mm = list_entry(head->next, + struct mm_struct, delayed_drop); + + list_del(&mm->delayed_drop); + __mmdrop_delayed(mm); + } +} +#endif + +static int desched_thread(void * __bind_cpu) +{ + set_user_nice(current, -10); + current->flags |= PF_NOFREEZE; + current->extra_flags |= PFE_SOFTIRQ; + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + if (mmdrop_complete()) + continue; + schedule(); + + /* + * This must be called from time to time on ia64, and is a + * no-op on other archs. Used to be in cpu_idle(), but with + * the new -rt semantics it can't stay there. 
+ */ + check_pgt_cache(); + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + + BUG_ON(per_cpu(desched_task, hotcpu)); + INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu)); + p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu); + if (IS_ERR(p)) { + printk("desched_thread for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + per_cpu(desched_task, hotcpu) = p; + kthread_bind(p, hotcpu); + break; + case CPU_ONLINE: + + wake_up_process(per_cpu(desched_task, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + + /* Unbind so it can run. Fall thru. */ + kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id()); + case CPU_DEAD: + + p = per_cpu(desched_task, hotcpu); + per_cpu(desched_task, hotcpu) = NULL; + kthread_stop(p); + takeover_delayed_drop(hotcpu); + takeover_tasklets(hotcpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +__init int spawn_desched_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + return 0; +} + diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 2357165..d49afb2 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -146,7 +146,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, struct task_struct *p; ret = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_task_by_vpid(pid); if (!p) goto err_unlock; @@ -157,7 +157,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, !capable(CAP_SYS_PTRACE)) goto err_unlock; head = 
p->compat_robust_list; - read_unlock(&tasklist_lock); + rcu_read_unlock(); } if (put_user(sizeof(*head), len_ptr)) @@ -165,7 +165,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, return put_user(ptr_to_compat(head), head_ptr); err_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0086628..2bf8f85 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -49,6 +49,7 @@ #include <asm/uaccess.h> #include <trace/events/timer.h> +#include <trace/events/hist.h> /* * The timer bases: @@ -88,10 +89,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) unsigned long seq; do { - seq = read_seqbegin(&xtime_lock); + seq = read_raw_seqbegin(&xtime_lock); xts = current_kernel_time(); tom = wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_raw_seqretry(&xtime_lock, seq)); xtim = timespec_to_ktime(xts); tomono = timespec_to_ktime(tom); @@ -499,9 +500,9 @@ static inline int hrtimer_is_hres_enabled(void) /* * Is the high resolution mode active ? */ -static inline int hrtimer_hres_active(void) +static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { - return __get_cpu_var(hrtimer_bases).hres_active; + return cpu_base->hres_active; } /* @@ -567,8 +568,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, * When the callback is running, we do not reprogram the clock event * device. The timer callback is either running on a different CPU or * the callback is executed in the hrtimer_interrupt context. The - * reprogramming is handled either by the softirq, which called the - * callback or at the end of the hrtimer_interrupt. + * reprogramming is handled at the end of the hrtimer_interrupt. 
*/ if (hrtimer_callback_running(timer)) return 0; @@ -611,21 +611,19 @@ static int hrtimer_reprogram(struct hrtimer *timer, */ static void retrigger_next_event(void *arg) { - struct hrtimer_cpu_base *base; + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); struct timespec realtime_offset; unsigned long seq; - if (!hrtimer_hres_active()) + if (!hrtimer_hres_active(base)) return; do { - seq = read_seqbegin(&xtime_lock); + seq = read_raw_seqbegin(&xtime_lock); set_normalized_timespec(&realtime_offset, -wall_to_monotonic.tv_sec, -wall_to_monotonic.tv_nsec); - } while (read_seqretry(&xtime_lock, seq)); - - base = &__get_cpu_var(hrtimer_bases); + } while (read_raw_seqretry(&xtime_lock, seq)); /* Adjust CLOCK_REALTIME offset */ raw_spin_lock(&base->lock); @@ -681,6 +679,8 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } +static void __run_hrtimer(struct hrtimer *timer, ktime_t *now); +static int hrtimer_rt_defer(struct hrtimer *timer); /* * When High resolution timers are active, try to reprogram. Note, that in case @@ -692,7 +692,29 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base, int wakeup) { +#ifdef CONFIG_PREEMPT_RT +again: +#endif if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { +#ifdef CONFIG_PREEMPT_RT + /* + * Move softirq based timers away from the rbtree in + * case it expired already. Otherwise we would have a + * stale base->first entry until the softirq runs. + */ + if (!hrtimer_rt_defer(timer)) { + ktime_t now = ktime_get(); + + __run_hrtimer(timer, &now); + /* + * __run_hrtimer might have requeued timer and + * it could be base->first again. 
+ */ + if (base->first == &timer->node) + goto again; + return 1; + } +#endif if (wakeup) { raw_spin_unlock(&base->cpu_base->lock); raise_softirq_irqoff(HRTIMER_SOFTIRQ); @@ -709,10 +731,8 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, /* * Switch to high resolution mode */ -static int hrtimer_switch_to_hres(void) +static int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base) { - int cpu = smp_processor_id(); - struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; if (base->hres_active) @@ -723,7 +743,7 @@ static int hrtimer_switch_to_hres(void) if (tick_init_highres()) { local_irq_restore(flags); printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", cpu); + "mode on CPU %d\n", raw_smp_processor_id()); return 0; } base->hres_active = 1; @@ -740,9 +760,15 @@ static int hrtimer_switch_to_hres(void) #else -static inline int hrtimer_hres_active(void) { return 0; } +static inline int hrtimer_hres_active(struct hrtimer_cpu_base *base) +{ + return 0; +} static inline int hrtimer_is_hres_enabled(void) { return 0; } -static inline int hrtimer_switch_to_hres(void) { return 0; } +static inline int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base) +{ + return 0; +} static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, @@ -751,6 +777,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, { return 0; } + +static inline int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + return 0; +} + static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } @@ -889,6 +922,32 @@ static int enqueue_hrtimer(struct hrtimer *timer, return leftmost; } +#ifdef CONFIG_PREEMPT_SOFTIRQS +# define wake_up_timer_waiters(b) wake_up(&(b)->wait) + +/** + * hrtimer_wait_for_timer - Wait 
for a running timer + * + * @timer: timer to wait for + * + * The function waits on the waitqueue of the timer base in case + * the timer's callback function is currently executing. The + * waitqueue is woken up after the timer callback function has + * finished execution. + */ +void hrtimer_wait_for_timer(const struct hrtimer *timer) +{ + struct hrtimer_clock_base *base = timer->base; + + if (base && base->cpu_base && !timer->irqsafe) + wait_event(base->cpu_base->wait, + !(timer->state & HRTIMER_STATE_CALLBACK)); +} + +#else +# define wake_up_timer_waiters(b) do { } while (0) +#endif + /* * __remove_hrtimer - internal function to remove a timer * @@ -906,6 +965,11 @@ static void __remove_hrtimer(struct hrtimer *timer, if (!(timer->state & HRTIMER_STATE_ENQUEUED)) goto out; + if (unlikely(!list_empty(&timer->cb_entry))) { + list_del_init(&timer->cb_entry); + goto out; + } + /* * Remove the timer from the rbtree and replace the first * entry pointer if necessary. @@ -914,7 +978,7 @@ static void __remove_hrtimer(struct hrtimer *timer, base->first = rb_next(&timer->node); #ifdef CONFIG_HIGH_RES_TIMERS /* Reprogram the clock event device. 
if enabled */ - if (reprogram && hrtimer_hres_active()) { + if (reprogram && hrtimer_hres_active(base->cpu_base)) { ktime_t expires; expires = ktime_sub(hrtimer_get_expires(timer), @@ -1085,7 +1149,7 @@ int hrtimer_cancel(struct hrtimer *timer) if (ret >= 0) return ret; - cpu_relax(); + hrtimer_wait_for_timer(timer); } } EXPORT_SYMBOL_GPL(hrtimer_cancel); @@ -1125,7 +1189,7 @@ ktime_t hrtimer_get_next_event(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) { + if (!hrtimer_hres_active(cpu_base)) { for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; @@ -1161,6 +1225,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, clock_id = CLOCK_MONOTONIC; timer->base = &cpu_base->clock_base[clock_id]; + INIT_LIST_HEAD(&timer->cb_entry); hrtimer_init_timer_hres(timer); #ifdef CONFIG_TIMER_STATS @@ -1240,8 +1305,120 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) timer->state &= ~HRTIMER_STATE_CALLBACK; } +#ifdef CONFIG_PREEMPT_RT + +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + /* + * Note, we clear the callback flag before we requeue the + * timer otherwise we trigger the callback_running() check + * in hrtimer_reprogram(). + */ + timer->state &= ~HRTIMER_STATE_CALLBACK; + + if (restart != HRTIMER_NORESTART) { + BUG_ON(hrtimer_active(timer)); + /* + * Enqueue the timer, if it's the leftmost timer then + * we need to reprogram it. + */ + if (!enqueue_hrtimer(timer, base)) + return; + + if (hrtimer_reprogram(timer, base)) + goto requeue; + + } else if (hrtimer_active(timer)) { + /* + * If the timer was rearmed on another CPU, reprogram + * the event device. + */ + if (base->first == &timer->node && + hrtimer_reprogram(timer, base)) + goto requeue; + } + return; + +requeue: + /* + * Timer is expired. Thus move it from tree to pending list + * again. 
+ */ + __remove_hrtimer(timer, base, timer->state, 0); + list_add_tail(&timer->cb_entry, &base->expired); +} + +/* + * The changes in mainline which removed the callback modes from + * hrtimer are not yet working with -rt. The non wakeup_process() + * based callbacks which involve sleeping locks need to be treated + * separately. + */ +static void hrtimer_rt_run_pending(void) +{ + enum hrtimer_restart (*fn)(struct hrtimer *); + struct hrtimer_cpu_base *cpu_base; + struct hrtimer_clock_base *base; + struct hrtimer *timer; + int index, restart; + + local_irq_disable(); + cpu_base = &per_cpu(hrtimer_bases, smp_processor_id()); + + raw_spin_lock(&cpu_base->lock); + + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { + base = &cpu_base->clock_base[index]; + + while (!list_empty(&base->expired)) { + timer = list_first_entry(&base->expired, + struct hrtimer, cb_entry); + + /* + * Same as the above __run_hrtimer function, + * except that we run with interrupts enabled. + */ + debug_hrtimer_deactivate(timer); + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + timer_stats_account_hrtimer(timer); + fn = timer->function; + + raw_spin_unlock_irq(&cpu_base->lock); + restart = fn(timer); + raw_spin_lock_irq(&cpu_base->lock); + + hrtimer_rt_reprogram(restart, timer, base); + } + } + + raw_spin_unlock_irq(&cpu_base->lock); + + wake_up_timer_waiters(cpu_base); +} + +static int hrtimer_rt_defer(struct hrtimer *timer) +{ + if (timer->irqsafe) + return 0; + + __remove_hrtimer(timer, timer->base, timer->state, 0); + list_add_tail(&timer->cb_entry, &timer->base->expired); + return 1; +} + +#else + +static inline void hrtimer_rt_run_pending(void) { } +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; } + +#endif + + #ifdef CONFIG_HIGH_RES_TIMERS +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer); + /* * High resolution timer interrupt * Called with interrupts disabled @@ -1251,7 +1428,7 @@ void hrtimer_interrupt(struct clock_event_device 
*dev) struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; ktime_t expires_next, now, entry_time, delta; - int i, retries = 0; + int i, retries = 0, raise = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; @@ -1284,6 +1461,14 @@ retry: timer = rb_entry(node, struct hrtimer, node); + trace_hrtimer_interrupt(raw_smp_processor_id(), + ktime_to_ns(ktime_sub( + hrtimer_get_expires(timer), basenow)), + current, + timer->function == hrtimer_wakeup ? + container_of(timer, struct hrtimer_sleeper, + timer)->task : NULL); + /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the @@ -1307,7 +1492,10 @@ retry: break; } - __run_hrtimer(timer, &basenow); + if (!hrtimer_rt_defer(timer)) + __run_hrtimer(timer, &basenow); + else + raise = 1; } base++; } @@ -1323,6 +1511,10 @@ retry: if (expires_next.tv64 == KTIME_MAX || !tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; + + if (raise) + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + return; } @@ -1370,9 +1562,11 @@ retry: */ static void __hrtimer_peek_ahead_timers(void) { + struct hrtimer_cpu_base *cpu_base; struct tick_device *td; - if (!hrtimer_hres_active()) + cpu_base = &__get_cpu_var(hrtimer_bases); + if (!hrtimer_hres_active(cpu_base)) return; td = &__get_cpu_var(tick_cpu_device); @@ -1398,17 +1592,17 @@ void hrtimer_peek_ahead_timers(void) local_irq_restore(flags); } -static void run_hrtimer_softirq(struct softirq_action *h) -{ - hrtimer_peek_ahead_timers(); -} - #else /* CONFIG_HIGH_RES_TIMERS */ static inline void __hrtimer_peek_ahead_timers(void) { } #endif /* !CONFIG_HIGH_RES_TIMERS */ +static void run_hrtimer_softirq(struct softirq_action *h) +{ + hrtimer_rt_run_pending(); +} + /* * Called from timer softirq every jiffy, expire hrtimers: * @@ -1418,7 +1612,9 @@ static inline void __hrtimer_peek_ahead_timers(void) { } */ void hrtimer_run_pending(void) { - if (hrtimer_hres_active()) + struct hrtimer_cpu_base 
*cpu_base = &__get_cpu_var(hrtimer_bases); + + if (hrtimer_hres_active(cpu_base)) return; /* @@ -1430,7 +1626,7 @@ void hrtimer_run_pending(void) * deadlock vs. xtime_lock. */ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) - hrtimer_switch_to_hres(); + hrtimer_switch_to_hres(cpu_base); } /* @@ -1439,11 +1635,12 @@ void hrtimer_run_pending(void) void hrtimer_run_queues(void) { struct rb_node *node; - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base; struct hrtimer_clock_base *base; - int index, gettime = 1; + int index, gettime = 1, raise = 0; - if (hrtimer_hres_active()) + cpu_base = &per_cpu(hrtimer_bases, raw_smp_processor_id()); + if (hrtimer_hres_active(cpu_base)) return; for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { @@ -1467,10 +1664,16 @@ void hrtimer_run_queues(void) hrtimer_get_expires_tv64(timer)) break; - __run_hrtimer(timer, &base->softirq_time); + if (!hrtimer_rt_defer(timer)) + __run_hrtimer(timer, &base->softirq_time); + else + raise = 1; } raw_spin_unlock(&cpu_base->lock); } + + if (raise) + raise_softirq_irqoff(HRTIMER_SOFTIRQ); } /* @@ -1492,6 +1695,7 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) { sl->timer.function = hrtimer_wakeup; + sl->timer.irqsafe = 1; sl->task = task; } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); @@ -1627,10 +1831,15 @@ static void __cpuinit init_hrtimers_cpu(int cpu) raw_spin_lock_init(&cpu_base->lock); - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { cpu_base->clock_base[i].cpu_base = cpu_base; + INIT_LIST_HEAD(&cpu_base->clock_base[i].expired); + } hrtimer_init_hres(cpu_base); +#ifdef CONFIG_PREEMPT_RT + init_waitqueue_head(&cpu_base->wait); +#endif } #ifdef CONFIG_HOTPLUG_CPU @@ -1743,9 +1952,7 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 
(void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); -#ifdef CONFIG_HIGH_RES_TIMERS open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); -#endif } /** diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 8a5c7d5..967e661 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -360,8 +360,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { u64 old_addr = bp->attr.bp_addr; + u64 old_len = bp->attr.bp_len; int old_type = bp->attr.bp_type; - int old_len = bp->attr.bp_len; int err = 0; perf_event_disable(bp); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ecc3fa2..ad096ed 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -275,6 +275,7 @@ static unsigned int default_startup(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); + desc->status &= ~IRQ_MASKED; desc->chip->enable(irq); return 0; } @@ -495,6 +496,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) goto out; } + if ((desc->status & IRQ_ONESHOT) && desc->chip->mask) + desc->chip->mask(irq); + desc->status |= IRQ_INPROGRESS; desc->status &= ~IRQ_PENDING; raw_spin_unlock(&desc->lock); @@ -532,7 +536,12 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + /* + * Edge irqs can be requested with IRQF_ONESHOT set. RT + * (ab)uses this for enforced irq threading, but we do not + * want to mask edge type interrupts. Clear the oneshot flag. 
+ */ + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_ONESHOT); /* * If we're currently running this IRQ, or its disabled, diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 814940e..5cfebbe 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -360,6 +360,25 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } +/* + * Momentary workaround until I have a brighter idea how to handle the + * accounting of forced threaded (shared) handlers. + */ +irqreturn_t handle_irq_action(unsigned int irq, struct irqaction *action) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc->status & IRQ_ONESHOT) { + unsigned long flags; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc->forced_threads_active |= action->thread_mask; + raw_spin_unlock_irqrestore(&desc->lock, flags); + return IRQ_WAKE_THREAD; + } + return action->handler(irq, action->dev_id); +} + /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number @@ -377,7 +396,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) do { trace_irq_handler_entry(irq, action); - ret = action->handler(irq, action->dev_id); + ret = handle_irq_action(irq, action); trace_irq_handler_exit(irq, action, ret); switch (ret) { @@ -424,8 +443,11 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) action = action->next; } while (action); +#ifndef CONFIG_PREEMPT_RT + /* FIXME: Can we unbreak that ? */ if (status & IRQF_SAMPLE_RANDOM) add_interrupt_randomness(irq); +#endif local_irq_disable(); return retval; @@ -454,6 +476,11 @@ unsigned int __do_IRQ(unsigned int irq) struct irqaction *action; unsigned int status; +#ifdef CONFIG_PREEMPT_RT + printk(KERN_WARNING "__do_IRQ called for irq %d. 
" + "PREEMPT_RT will crash your system soon\n", irq); + printk(KERN_WARNING "I hope you have a fire-extinguisher handy!\n"); +#endif kstat_incr_irqs_this_cpu(irq, desc); if (CHECK_IRQ_PER_CPU(desc->status)) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index eb6078c..7d108d0 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -280,7 +280,8 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) goto err_out; /* Prevent probing on this irq: */ desc->status = status | IRQ_NOPROBE; - check_irq_resend(desc, irq); + if (!desc->forced_threads_active) + check_irq_resend(desc, irq); /* fall-through */ } default: @@ -461,7 +462,88 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) return IRQ_NONE; } -static int irq_wait_for_interrupt(struct irqaction *action) +#ifdef CONFIG_PREEMPT_HARDIRQS +/* + * If the caller does not request irq threading then the handler + * becomes the thread function and we use the above handler as the + * primary hardirq context handler. + */ +static void preempt_hardirq_setup(struct irqaction *new) +{ + if (new->thread_fn || (new->flags & (IRQF_NODELAY | IRQF_PERCPU))) + return; + + new->flags |= IRQF_ONESHOT; + new->thread_fn = new->handler; + new->handler = irq_default_primary_handler; +} + +#else +static inline void preempt_hardirq_setup(struct irqaction *new) { } +#endif + +/* + * forced threaded interrupts need to unmask the interrupt line + */ +static int preempt_hardirq_thread_done(struct irq_desc *desc, + struct irqaction *action) +{ + unsigned long masked; + + if (!(desc->status & IRQ_ONESHOT)) + return 0; +again: + raw_spin_lock_irq(&desc->lock); + /* + * Be careful. The hardirq handler might be running on the + * other CPU. + */ + if (desc->status & IRQ_INPROGRESS) { + raw_spin_unlock_irq(&desc->lock); + cpu_relax(); + goto again; + } + + /* + * Now check again, whether the thread should run. Otherwise + * we would clear the forced_threads_active bit which was just + * set. 
+ */ + if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { + raw_spin_unlock_irq(&desc->lock); + return 1; + } + + masked = desc->forced_threads_active; + desc->forced_threads_active &= ~action->thread_mask; + + /* + * Unmask the interrupt line when this is the last active + * thread and the interrupt is not disabled. + */ + if (masked && !desc->forced_threads_active && + !(desc->status & IRQ_DISABLED)) { + if (desc->chip->unmask) + desc->chip->unmask(action->irq); + /* + * Do we need to call check_irq_resend() here ? + * No. check_irq_resend needs only to be checked when + * we go from IRQ_DISABLED to IRQ_ENABLED state. + */ + } + raw_spin_unlock_irq(&desc->lock); + return 0; +} + +static inline void +preempt_hardirq_cleanup(struct irq_desc *desc, struct irqaction *action) +{ + clear_bit(IRQTF_RUNTHREAD, &action->thread_flags); + preempt_hardirq_thread_done(desc, action); +} + +static int +irq_wait_for_interrupt(struct irq_desc *desc, struct irqaction *action) { while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); @@ -481,15 +563,20 @@ static int irq_wait_for_interrupt(struct irqaction *action) * handler finished. unmask if the interrupt has not been disabled and * is marked MASKED. 
*/ -static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) +static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc, + struct irqaction *action) { chip_bus_lock(irq, desc); +#ifndef CONFIG_PREEMPT_RT raw_spin_lock_irq(&desc->lock); if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; desc->chip->unmask(irq); } raw_spin_unlock_irq(&desc->lock); +#else + preempt_hardirq_thread_done(desc, action); +#endif chip_bus_sync_unlock(irq, desc); } @@ -534,12 +621,13 @@ static int irq_thread(void *data) struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); - int wake, oneshot = desc->status & IRQ_ONESHOT; + int wake; sched_setscheduler(current, SCHED_FIFO, &param); + current->extra_flags |= PFE_HARDIRQ; current->irqaction = action; - while (!irq_wait_for_interrupt(action)) { + while (!irq_wait_for_interrupt(desc, action)) { irq_thread_check_affinity(desc, action); @@ -561,8 +649,8 @@ static int irq_thread(void *data) action->thread_fn(action->irq, action->dev_id); - if (oneshot) - irq_finalize_oneshot(action->irq, desc); + if (desc->status & IRQ_ONESHOT) + irq_finalize_oneshot(action->irq, desc, action); } wake = atomic_dec_and_test(&desc->threads_active); @@ -571,6 +659,8 @@ static int irq_thread(void *data) wake_up(&desc->wait_for_threads); } + preempt_hardirq_cleanup(desc, action); + /* * Clear irqaction. Otherwise exit_irq_thread() would make * fuzz about an active irq thread going into nirvana. 
@@ -609,7 +699,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; const char *old_name = NULL; - unsigned long flags; + unsigned long flags, thread_mask = 0; int nested, shared = 0; int ret; @@ -635,9 +725,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) rand_initialize_irq(irq); } - /* Oneshot interrupts are not allowed with shared */ - if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED)) - return -EINVAL; + /* Preempt-RT setup for forced threading */ + preempt_hardirq_setup(new); /* * Check whether the interrupt nests into another interrupt @@ -704,12 +793,20 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* add new interrupt at end of irq queue */ do { + thread_mask |= old->thread_mask; old_ptr = &old->next; old = *old_ptr; } while (old); shared = 1; } + /* + * Setup the thread mask for this irqaction. No risk that ffz + * will fail. If we have 32 resp. 64 devices sharing one irq + * then ..... + */ + new->thread_mask = 1 << ffz(thread_mask); + if (!shared) { irq_chip_set_defaults(desc->chip); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 2419622..ea0b492 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -54,6 +54,7 @@ void move_masked_irq(int irq) void move_native_irq(int irq) { struct irq_desc *desc = irq_to_desc(irq); + int mask = 1; if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; @@ -61,8 +62,18 @@ void move_native_irq(int irq) if (unlikely(desc->status & IRQ_DISABLED)) return; - desc->chip->mask(irq); + /* + * If the irq is already in progress, it should be masked. + * If we unmask it, we might cause an interrupt storm on RT. 
+ */ + if (unlikely((desc->status & IRQ_INPROGRESS) || + desc->forced_threads_active)) + mask = 0; + + if (mask) + desc->chip->mask(irq); move_masked_irq(irq); - desc->chip->unmask(irq); + if (mask) + desc->chip->unmask(irq); } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 89fb90a..a9a6628 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -54,9 +54,9 @@ static int try_one_irq(int irq, struct irq_desc *desc) } action = action->next; } - local_irq_disable(); + /* Now clean up the flags */ - raw_spin_lock(&desc->lock); + raw_spin_lock_irq(&desc->lock); action = desc->action; /* @@ -278,6 +278,11 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); static int __init irqfixup_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT + printk(KERN_WARNING "irqfixup boot option not supported " + "w/ CONFIG_PREEMPT_RT\n"); + return 1; +#endif irqfixup = 1; printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); printk(KERN_WARNING "This may impact system performance.\n"); @@ -290,6 +295,11 @@ module_param(irqfixup, int, 0644); static int __init irqpoll_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT + printk(KERN_WARNING "irqpoll boot option not supported " + "w/ CONFIG_PREEMPT_RT\n"); + return 1; +#endif irqfixup = 2; printk(KERN_WARNING "Misrouted IRQ fixup and polling support " "enabled\n"); diff --git a/kernel/itimer.c b/kernel/itimer.c index d802883..2c582fc 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -214,6 +214,7 @@ again: /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { spin_unlock_irq(&tsk->sighand->siglock); + hrtimer_wait_for_timer(&tsk->signal->real_timer); goto again; } expires = timeval_to_ktime(value->it_value); diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 498cabb..35edbe2 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) buffer = kmalloc(size, gfp_mask); if (!buffer) { 
- _kfifo_init(fifo, 0, 0); + _kfifo_init(fifo, NULL, 0); return -ENOMEM; } @@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc); void kfifo_free(struct kfifo *fifo) { kfree(fifo->buffer); + _kfifo_init(fifo, NULL, 0); } EXPORT_SYMBOL(kfifo_free); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b7df302..40547e6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -73,10 +73,10 @@ static bool kprobes_all_disarmed; static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { - spinlock_t lock ____cacheline_aligned_in_smp; + raw_spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; -static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) +static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) { return &(kretprobe_table_locks[hash].lock); } @@ -410,9 +410,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, hlist_del(&ri->hlist); INIT_HLIST_NODE(&ri->hlist); if (likely(rp)) { - spin_lock(&rp->lock); + raw_spin_lock(&rp->lock); hlist_add_head(&ri->hlist, &rp->free_instances); - spin_unlock(&rp->lock); + raw_spin_unlock(&rp->lock); } else /* Unregistering */ hlist_add_head(&ri->hlist, head); @@ -422,34 +422,34 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - spinlock_t *hlist_lock; + raw_spinlock_t *hlist_lock; *head = &kretprobe_inst_table[hash]; hlist_lock = kretprobe_table_lock_ptr(hash); - spin_lock_irqsave(hlist_lock, *flags); + raw_spin_lock_irqsave(hlist_lock, *flags); } static void __kprobes kretprobe_table_lock(unsigned long hash, unsigned long *flags) { - spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - spin_lock_irqsave(hlist_lock, *flags); + raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + raw_spin_lock_irqsave(hlist_lock, *flags); } void __kprobes 
kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - spinlock_t *hlist_lock; + raw_spinlock_t *hlist_lock; hlist_lock = kretprobe_table_lock_ptr(hash); - spin_unlock_irqrestore(hlist_lock, *flags); + raw_spin_unlock_irqrestore(hlist_lock, *flags); } void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) { - spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - spin_unlock_irqrestore(hlist_lock, *flags); + raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + raw_spin_unlock_irqrestore(hlist_lock, *flags); } /* @@ -982,12 +982,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, /*TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); - spin_lock_irqsave(&rp->lock, flags); + raw_spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { ri = hlist_entry(rp->free_instances.first, struct kretprobe_instance, hlist); hlist_del(&ri->hlist); - spin_unlock_irqrestore(&rp->lock, flags); + raw_spin_unlock_irqrestore(&rp->lock, flags); ri->rp = rp; ri->task = current; @@ -1004,7 +1004,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, kretprobe_table_unlock(hash, &flags); } else { rp->nmissed++; - spin_unlock_irqrestore(&rp->lock, flags); + raw_spin_unlock_irqrestore(&rp->lock, flags); } return 0; } @@ -1040,7 +1040,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) rp->maxactive = num_possible_cpus(); #endif } - spin_lock_init(&rp->lock); + raw_spin_lock_init(&rp->lock); INIT_HLIST_HEAD(&rp->free_instances); for (i = 0; i < rp->maxactive; i++) { inst = kmalloc(sizeof(struct kretprobe_instance) + @@ -1227,7 +1227,7 @@ static int __init init_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { INIT_HLIST_HEAD(&kprobe_table[i]); INIT_HLIST_HEAD(&kretprobe_inst_table[i]); - spin_lock_init(&(kretprobe_table_locks[i].lock)); + 
raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); } /* diff --git a/kernel/latencytop.c b/kernel/latencytop.c index ca07c5c..2b462ef 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -59,7 +59,7 @@ #include <linux/slab.h> #include <linux/stacktrace.h> -static DEFINE_SPINLOCK(latency_lock); +static DEFINE_RAW_SPINLOCK(latency_lock); #define MAXLR 128 static struct latency_record latency_record[MAXLR]; @@ -73,19 +73,19 @@ void clear_all_latency_tracing(struct task_struct *p) if (!latencytop_enabled) return; - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); memset(&p->latency_record, 0, sizeof(p->latency_record)); p->latency_record_count = 0; - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static void clear_global_latency_tracing(void) { unsigned long flags; - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); memset(&latency_record, 0, sizeof(latency_record)); - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static void __sched @@ -191,7 +191,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) lat.max = usecs; store_stacktrace(tsk, &lat); - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); account_global_scheduler_latency(tsk, &lat); @@ -233,7 +233,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); out_unlock: - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static int lstats_show(struct seq_file *m, void *v) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c62ec14..1199bda 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -805,6 +805,12 @@ out_unlock_set: return class; } +#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_TRACE_IRQFLAGS) + +/* 
CHECKME */ + +#endif /* CONFIG_PROVE_LOCKING || CONFIG_TRACE_IRQFLAGS */ + #ifdef CONFIG_PROVE_LOCKING /* * Allocate a lockdep entry. (assumes the graph_lock held, returns @@ -1337,6 +1343,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, return; } +#ifdef CONFIG_PROVE_LOCKING static int print_bad_irq_dependency(struct task_struct *curr, struct lock_list *prev_root, @@ -1407,6 +1414,7 @@ print_bad_irq_dependency(struct task_struct *curr, return 0; } +#endif /* CONFIG_PROVE_LOCKING */ static int check_usage(struct task_struct *curr, struct held_lock *prev, @@ -2716,6 +2724,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lockdep_init_map); +struct lock_class_key __lockdep_no_validate__; + /* * This gets called for every mutex_lock*()/spin_lock*() operation. * We maintain the dependency maps and validate the locking attempt: @@ -2750,6 +2760,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, return 0; } + if (lock->key == &__lockdep_no_validate__) + check = 1; + if (!subclass) class = lock->class_cache; /* @@ -3595,6 +3608,9 @@ void lockdep_init(void) for (i = 0; i < CHAINHASH_SIZE; i++) INIT_LIST_HEAD(chainhash_table + i); + /* Hack alert ! 
*/ + lockdep_set_novalidate_class(&kernel_sem); + lockdep_initialized = 1; } diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index 57d527a..6dc57dd 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -41,7 +41,6 @@ static inline void mutex_clear_owner(struct mutex *lock) do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ \ - DEBUG_LOCKS_WARN_ON(in_interrupt()); \ local_irq_save(flags); \ arch_spin_lock(&(lock)->rlock.raw_lock);\ DEBUG_LOCKS_WARN_ON(l->magic != l); \ diff --git a/kernel/mutex.c b/kernel/mutex.c index 632f04c..e23f35e 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -149,6 +149,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire(&lock->dep_map, subclass, 0, ip); + DEBUG_LOCKS_WARN_ON(in_interrupt()); + #ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* * Optimistic spinning. @@ -249,9 +251,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didnt get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - preempt_enable_no_resched(); - schedule(); + + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); + spin_lock_mutex(&lock->wait_lock, flags); } diff --git a/kernel/notifier.c b/kernel/notifier.c index acd24e7..348a829 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -71,7 +71,7 @@ static int notifier_chain_unregister(struct notifier_block **nl, * @returns: notifier_call_chain returns the value returned by the * last notifier function called. */ -static int __kprobes notifier_call_chain(struct notifier_block **nl, +static int __kprobes notrace notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v, int nr_to_call, int *nr_calls) { @@ -217,7 +217,7 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). 
*/ - if (unlikely(system_state == SYSTEM_BOOTING)) + if (unlikely(system_state < SYSTEM_RUNNING)) return notifier_chain_register(&nh->head, n); down_write(&nh->rwsem); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index d27746b..21c76ea 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -248,7 +248,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) static inline u64 perf_clock(void) { - return cpu_clock(smp_processor_id()); + return cpu_clock(raw_smp_processor_id()); } /* @@ -2560,45 +2560,69 @@ static void perf_pending_event(struct perf_pending_entry *entry) __perf_event_disable(event); } +#ifndef CONFIG_PREEMPT_RT if (event->pending_wakeup) { event->pending_wakeup = 0; perf_event_wakeup(event); } +#endif } +#ifdef CONFIG_PREEMPT_RT +static void perf_pending_counter_softirq(struct perf_pending_entry *entry) +{ + struct perf_event *counter = container_of(entry, + struct perf_event, pending_softirq); + + if (counter->pending_wakeup) { + counter->pending_wakeup = 0; + perf_event_wakeup(counter); + } +} +#endif + #define PENDING_TAIL ((struct perf_pending_entry *)-1UL) static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { PENDING_TAIL, }; -static void perf_pending_queue(struct perf_pending_entry *entry, - void (*func)(struct perf_pending_entry *)) -{ - struct perf_pending_entry **head; +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_softirq_head) = { + PENDING_TAIL, +}; +static void __perf_pending_queue(struct perf_pending_entry **head, + struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) +{ if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) return; entry->func = func; - head = &get_cpu_var(perf_pending_head); - do { entry->next = *head; } while (cmpxchg(head, entry->next, entry) != entry->next); +} - set_perf_event_pending(); +static void perf_pending_queue(struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) +{ + struct 
perf_pending_entry **head; + head = &get_cpu_var(perf_pending_head); + __perf_pending_queue(head, entry, func); put_cpu_var(perf_pending_head); + + set_perf_event_pending(); } -static int __perf_pending_run(void) +static int __perf_pending_run(struct perf_pending_entry **head) { struct perf_pending_entry *list; int nr = 0; - list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); + list = xchg(head, PENDING_TAIL); while (list != PENDING_TAIL) { void (*func)(struct perf_pending_entry *); struct perf_pending_entry *entry = list; @@ -2628,7 +2652,8 @@ static inline int perf_not_pending(struct perf_event *event) * need to wait. */ get_cpu(); - __perf_pending_run(); + __perf_pending_run(&__get_cpu_var(perf_pending_head)); + __perf_pending_run(&__get_cpu_var(perf_pending_softirq_head)); put_cpu(); /* @@ -2646,7 +2671,13 @@ static void perf_pending_sync(struct perf_event *event) void perf_event_do_pending(void) { - __perf_pending_run(); + __perf_pending_run(&__get_cpu_var(perf_pending_head)); +} + +void perf_event_do_pending_softirq(void) +{ + __perf_pending_run(&__get_cpu_var(perf_pending_head)); + __perf_pending_run(&__get_cpu_var(perf_pending_softirq_head)); } /* @@ -2684,12 +2715,18 @@ static void perf_output_wakeup(struct perf_output_handle *handle) { atomic_set(&handle->data->poll, POLL_IN); +#ifndef CONFIG_PREEMPT_RT if (handle->nmi) { handle->event->pending_wakeup = 1; perf_pending_queue(&handle->event->pending, perf_pending_event); } else perf_event_wakeup(handle->event); +#else + __perf_pending_queue(&__get_cpu_var(perf_pending_softirq_head), + &handle->event->pending_softirq, + perf_pending_counter_softirq); +#endif } /* @@ -3259,8 +3296,6 @@ static void perf_event_task_output(struct perf_event *event, task_event->event_id.tid = perf_event_tid(event, task); task_event->event_id.ptid = perf_event_tid(event, current); - task_event->event_id.time = perf_clock(); - perf_output_put(&handle, task_event->event_id); perf_output_end(&handle); @@ -3268,7 +3303,7 @@ 
static void perf_event_task_output(struct perf_event *event, static int perf_event_task_match(struct perf_event *event) { - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; if (event->cpu != -1 && event->cpu != smp_processor_id()) @@ -3300,7 +3335,7 @@ static void perf_event_task_event(struct perf_task_event *task_event) cpuctx = &get_cpu_var(perf_cpu_context); perf_event_task_ctx(&cpuctx->ctx, task_event); if (!ctx) - ctx = rcu_dereference(task_event->task->perf_event_ctxp); + ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_event_task_ctx(ctx, task_event); put_cpu_var(perf_cpu_context); @@ -3331,6 +3366,7 @@ static void perf_event_task(struct task_struct *task, /* .ppid */ /* .tid */ /* .ptid */ + .time = perf_clock(), }, }; @@ -3380,7 +3416,7 @@ static void perf_event_comm_output(struct perf_event *event, static int perf_event_comm_match(struct perf_event *event) { - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; if (event->cpu != -1 && event->cpu != smp_processor_id()) @@ -3500,7 +3536,7 @@ static void perf_event_mmap_output(struct perf_event *event, static int perf_event_mmap_match(struct perf_event *event, struct perf_mmap_event *mmap_event) { - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; if (event->cpu != -1 && event->cpu != smp_processor_id()) @@ -4580,7 +4616,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->type >= PERF_TYPE_MAX) return -EINVAL; - if (attr->__reserved_1 || attr->__reserved_2) + if (attr->__reserved_1) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 438ff45..3dd9218 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -280,7 +280,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) struct 
task_cputime sum; unsigned long flags; - spin_lock_irqsave(&cputimer->lock, flags); + raw_spin_lock_irqsave(&cputimer->lock, flags); if (!cputimer->running) { cputimer->running = 1; /* @@ -293,7 +293,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) update_gt_cputime(&cputimer->cputime, &sum); } *times = cputimer->cputime; - spin_unlock_irqrestore(&cputimer->lock, flags); + raw_spin_unlock_irqrestore(&cputimer->lock, flags); } /* @@ -570,7 +570,7 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) p->cpu_timers : p->signal->cpu_timers); head += CPUCLOCK_WHICH(timer->it_clock); - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); spin_lock(&p->sighand->siglock); listpos = head; @@ -749,7 +749,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, /* * Disarm any old timer after extracting its expiry time. */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); ret = 0; spin_lock(&p->sighand->siglock); @@ -1068,9 +1068,9 @@ static void stop_process_timers(struct task_struct *tsk) if (!cputimer->running) return; - spin_lock_irqsave(&cputimer->lock, flags); + raw_spin_lock_irqsave(&cputimer->lock, flags); cputimer->running = 0; - spin_unlock_irqrestore(&cputimer->lock, flags); + raw_spin_unlock_irqrestore(&cputimer->lock, flags); } static u32 onecputick; @@ -1390,12 +1390,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk) * already updated our counts. We need to check if any timers fire now. * Interrupts are disabled. 
*/ -void run_posix_cpu_timers(struct task_struct *tsk) +void __run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; - BUG_ON(!irqs_disabled()); /* * The fast path checks that there are no expired thread or thread @@ -1447,6 +1446,177 @@ void run_posix_cpu_timers(struct task_struct *tsk) } } +#include <linux/kthread.h> +#include <linux/cpu.h> +DEFINE_PER_CPU(struct task_struct *, posix_timer_task); +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist); + +static int posix_cpu_timers_thread(void *data) +{ + int cpu = (long)data; + + BUG_ON(per_cpu(posix_timer_task,cpu) != current); + + while (!kthread_should_stop()) { + struct task_struct *tsk = NULL; + struct task_struct *next = NULL; + + if (cpu_is_offline(cpu)) + goto wait_to_die; + + /* grab task list */ + raw_local_irq_disable(); + tsk = per_cpu(posix_timer_tasklist, cpu); + per_cpu(posix_timer_tasklist, cpu) = NULL; + raw_local_irq_enable(); + + /* its possible the list is empty, just return */ + if (!tsk) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + continue; + } + + /* Process task list */ + while (1) { + /* save next */ + next = tsk->posix_timer_list; + + /* run the task timers, clear its ptr and + * unreference it + */ + __run_posix_cpu_timers(tsk); + tsk->posix_timer_list = NULL; + put_task_struct(tsk); + + /* check if this is the last on the list */ + if (next == tsk) + break; + tsk = next; + } + } + return 0; + +wait_to_die: + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static inline int __fastpath_timer_check(struct task_struct *tsk) +{ + /* tsk == current, ensure it is safe to use ->signal/sighand */ + if (unlikely(tsk->exit_state)) + return 0; + + if (!task_cputime_zero(&tsk->cputime_expires)) + return 1; + + if 
(!task_cputime_zero(&tsk->signal->cputime_expires)) + return 1; + + return 0; +} + +void run_posix_cpu_timers(struct task_struct *tsk) +{ + unsigned long cpu = smp_processor_id(); + struct task_struct *tasklist; + + BUG_ON(!irqs_disabled()); + if(!per_cpu(posix_timer_task, cpu)) + return; + /* get per-cpu references */ + tasklist = per_cpu(posix_timer_tasklist, cpu); + + /* check to see if we're already queued */ + if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) { + get_task_struct(tsk); + if (tasklist) { + tsk->posix_timer_list = tasklist; + } else { + /* + * The list is terminated by a self-pointing + * task_struct + */ + tsk->posix_timer_list = tsk; + } + per_cpu(posix_timer_tasklist, cpu) = tsk; + + wake_up_process(per_cpu(posix_timer_task, cpu)); + } +} + +/* + * posix_cpu_thread_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary posix cpu timer thread for the new CPU. + */ +static int posix_cpu_thread_call(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + struct task_struct *p; + struct sched_param param; + + switch (action) { + case CPU_UP_PREPARE: + p = kthread_create(posix_cpu_timers_thread, hcpu, + "posixcputmr/%d",cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + p->flags |= PF_NOFREEZE; + kthread_bind(p, cpu); + /* Must be high prio to avoid getting starved */ + param.sched_priority = MAX_RT_PRIO-1; + sched_setscheduler(p, SCHED_FIFO, &param); + per_cpu(posix_timer_task,cpu) = p; + break; + case CPU_ONLINE: + /* Strictly unnecessary, as first user will wake it. */ + wake_up_process(per_cpu(posix_timer_task,cpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind it from offline cpu so it can run. Fall thru. 
*/ + kthread_bind(per_cpu(posix_timer_task,cpu), + any_online_cpu(cpu_online_map)); + kthread_stop(per_cpu(posix_timer_task,cpu)); + per_cpu(posix_timer_task,cpu) = NULL; + break; + case CPU_DEAD: + kthread_stop(per_cpu(posix_timer_task,cpu)); + per_cpu(posix_timer_task,cpu) = NULL; + break; +#endif + } + return NOTIFY_OK; +} + +/* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else. + */ +static struct notifier_block __devinitdata posix_cpu_thread_notifier = { + .notifier_call = posix_cpu_thread_call, + .priority = 10 +}; + +static int __init posix_cpu_thread_init(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + /* Start one for boot CPU. */ + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, cpu); + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, cpu); + register_cpu_notifier(&posix_cpu_thread_notifier); + return 0; +} +early_initcall(posix_cpu_thread_init); + /* * Set one of the process-wide special case CPU timers. * The tsk->sighand->siglock must be held by the caller. 
@@ -1713,6 +1883,12 @@ static __init int init_posix_cpu_timers(void) .nsleep_restart = thread_cpu_nsleep_restart, }; struct timespec ts; + unsigned long cpu; + + /* init the per-cpu posix_timer_tasklets */ + for_each_cpu_mask(cpu, cpu_possible_map) { + per_cpu(posix_timer_tasklist, cpu) = NULL; + } register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4954407..d2818dd 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -462,6 +462,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) static struct pid *good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; + int sig = event->sigev_signo; if ((event->sigev_notify & SIGEV_THREAD_ID ) && (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || @@ -470,7 +471,8 @@ static struct pid *good_sigevent(sigevent_t * event) return NULL; if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) + (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) || + sig_kernel_coredump(sig))) return NULL; return task_pid(rtn); @@ -829,6 +831,7 @@ retry: unlock_timer(timr, flag); if (error == TIMER_RETRY) { + hrtimer_wait_for_timer(&timr->it.real.timer); rtn = NULL; // We already got the old time... 
goto retry; } @@ -867,6 +870,7 @@ retry_delete: if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + hrtimer_wait_for_timer(&timer->it.real.timer); goto retry_delete; } @@ -896,6 +900,7 @@ retry_delete: if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + hrtimer_wait_for_timer(&timer->it.real.timer); goto retry_delete; } list_del(&timer->list); diff --git a/kernel/printk.c b/kernel/printk.c index 1751c45..073d27f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -35,6 +35,7 @@ #include <linux/kexec.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> +#include <linux/semaphore.h> #include <asm/uaccess.h> @@ -79,17 +80,17 @@ int oops_in_progress; EXPORT_SYMBOL(oops_in_progress); /* - * console_sem protects the console_drivers list, and also + * console_mutex protects the console_drivers list, and also * provides serialisation for access to the entire console * driver system. */ -static DECLARE_MUTEX(console_sem); +static DEFINE_MUTEX(console_mutex); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); /* * This is used for debugging the mess that is the VT code by - * keeping track if we have the console semaphore held. It's + * keeping track if we have the console mutex held. It's * definitely not the perfect debug tool (we don't know if _WE_ * hold it are racing, but it helps tracking those weird code * path in the console code where we end up in places I want @@ -100,9 +101,9 @@ static int console_locked, console_suspended; /* * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars * It is also used in interesting ways to provide interlocking in - * release_console_sem(). + * release_console_mutex(). 
*/ -static DEFINE_SPINLOCK(logbuf_lock); +static DEFINE_RAW_SPINLOCK(logbuf_lock); #define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) @@ -181,7 +182,7 @@ static int __init log_buf_len_setup(char *str) goto out; } - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); log_buf_len = size; log_buf = new_log_buf; @@ -195,7 +196,7 @@ static int __init log_buf_len_setup(char *str) log_start -= offset; con_start -= offset; log_end -= offset; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); } @@ -305,18 +306,18 @@ int do_syslog(int type, char __user *buf, int len) if (error) goto out; i = 0; - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); while (!error && (log_start != log_end) && i < len) { c = LOG_BUF(log_start); log_start++; - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); error = __put_user(c,buf); buf++; i++; cond_resched(); - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (!error) error = i; break; @@ -337,7 +338,7 @@ int do_syslog(int type, char __user *buf, int len) count = len; if (count > log_buf_len) count = log_buf_len; - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); if (count > logged_chars) count = logged_chars; if (do_clear) @@ -354,12 +355,12 @@ int do_syslog(int type, char __user *buf, int len) if (j + log_buf_len < log_end) break; c = LOG_BUF(j); - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); error = __put_user(c,&buf[count-1-i]); cond_resched(); - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (error) break; error = i; @@ -429,9 +430,13 @@ static void __call_console_drivers(unsigned start, unsigned end) 
for_each_console(con) { if ((con->flags & CON_ENABLED) && con->write && - (cpu_online(smp_processor_id()) || - (con->flags & CON_ANYTIME))) + console_atomic_safe(con) && + (cpu_online(raw_smp_processor_id()) || + (con->flags & CON_ANYTIME))) { + set_printk_might_sleep(1); con->write(con, &LOG_BUF(start), end - start); + set_printk_might_sleep(0); + } } } @@ -469,7 +474,7 @@ static void _call_console_drivers(unsigned start, /* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. - * The console_sem must be held. + * The console_mutex must be held. */ static void call_console_drivers(unsigned start, unsigned end) { @@ -542,9 +547,10 @@ static void zap_locks(void) oops_timestamp = jiffies; /* If a crash is occurring, make sure we can't deadlock */ - spin_lock_init(&logbuf_lock); + raw_spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ - init_MUTEX(&console_sem); + mutex_init(&console_mutex); + zap_rt_locks(); } #if defined(CONFIG_PRINTK_TIME) @@ -572,11 +578,11 @@ static int have_callable_console(void) * * This is printk(). It can be called from any context. We want it to work. * - * We try to grab the console_sem. If we succeed, it's easy - we log the output and - * call the console drivers. If we fail to get the semaphore we place the output - * into the log buffer and return. The current holder of the console_sem will - * notice the new output in release_console_sem() and will send it to the - * consoles before releasing the semaphore. + * We try to grab the console_mutex. If we succeed, it's easy - we log the output and + * call the console drivers. If we fail to get the mutex we place the output + * into the log buffer and return. The current holder of the console_mutex will + * notice the new output in release_console_mutex() and will send it to the + * consoles before releasing the mutex. 
* * One effect of this deferred printing is that code which calls printk() and * then changes console_loglevel may break. This is because console_loglevel @@ -619,34 +625,34 @@ static inline int can_use_console(unsigned int cpu) /* * Try to get console ownership to actually show the kernel * messages from a 'printk'. Return true (and with the - * console_semaphore held, and 'console_locked' set) if it + * console_mutex held, and 'console_locked' set) if it * is successful, false otherwise. * * This gets called with the 'logbuf_lock' spinlock held and * interrupts disabled. It should return with 'lockbuf_lock' * released but interrupts still disabled. */ -static int acquire_console_semaphore_for_printk(unsigned int cpu) +static int acquire_console_mutex_for_printk(unsigned int cpu) { int retval = 0; - if (!try_acquire_console_sem()) { + if (!try_acquire_console_mutex()) { retval = 1; /* * If we can't use the console, we need to release - * the console semaphore by hand to avoid flushing - * the buffer. We need to hold the console semaphore + * the console mutex by hand to avoid flushing + * the buffer. We need to hold the console mutex * in order to do this test safely. */ if (!can_use_console(cpu)) { console_locked = 0; - up(&console_sem); + mutex_unlock(&console_mutex); retval = 0; } } printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); + raw_spin_unlock(&logbuf_lock); return retval; } static const char recursion_bug_msg [] = @@ -681,9 +687,9 @@ asmlinkage int vprintk(const char *fmt, va_list args) printk_delay(); preempt_disable(); - /* This stops the holder of console_sem just where we want him */ + /* This stops the holder of console_mutex just where we want him */ raw_local_irq_save(flags); - this_cpu = smp_processor_id(); + this_cpu = raw_smp_processor_id(); /* * Ouch, printk recursed into itself! 
@@ -698,14 +704,17 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 		 */
 		if (!oops_in_progress) {
 			recursion_bug = 1;
-			goto out_restore_irqs;
+			/* the 'out' path no longer does preempt_enable() */
+			preempt_enable();
+			goto out;
 		}
 		zap_locks();
 	}
 
 	lockdep_off();
-	spin_lock(&logbuf_lock);
+	raw_spin_lock(&logbuf_lock);
 	printk_cpu = this_cpu;
+	preempt_enable();
 
 	if (recursion_bug) {
 		recursion_bug = 0;
@@ -782,22 +791,20 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 
 	/*
 	 * Try to acquire and then immediately release the
-	 * console semaphore. The release will do all the
+	 * console mutex. The release will do all the
 	 * actual magic (print out buffers, wake up klogd,
 	 * etc).
 	 *
-	 * The acquire_console_semaphore_for_printk() function
+	 * The acquire_console_mutex_for_printk() function
 	 * will release 'logbuf_lock' regardless of whether it
-	 * actually gets the semaphore or not.
+	 * actually gets the mutex or not.
 	 */
-	if (acquire_console_semaphore_for_printk(this_cpu))
-		release_console_sem();
+	if (acquire_console_mutex_for_printk(this_cpu))
+		release_console_mutex();
 
 	lockdep_on();
-out_restore_irqs:
+out:
 	raw_local_irq_restore(flags);
-
-	preempt_enable();
 	return printed_len;
 }
 EXPORT_SYMBOL(printk);
@@ -952,52 +959,52 @@ void suspend_console(void)
 	if (!console_suspend_enabled)
 		return;
 	printk("Suspending console(s) (use no_console_suspend to debug)\n");
-	acquire_console_sem();
+	acquire_console_mutex();
 	console_suspended = 1;
-	up(&console_sem);
+	mutex_unlock(&console_mutex);
 }
 
 void resume_console(void)
 {
 	if (!console_suspend_enabled)
 		return;
-	down(&console_sem);
+	mutex_lock(&console_mutex);
 	console_suspended = 0;
-	release_console_sem();
+	release_console_mutex();
 }
 
 /**
- * acquire_console_sem - lock the console system for exclusive use.
+ * acquire_console_mutex - lock the console system for exclusive use.
  *
- * Acquires a semaphore which guarantees that the caller has
+ * Acquires a mutex which guarantees that the caller has
  * exclusive access to the console system and the console_drivers list.
* * Can sleep, returns nothing. */ -void acquire_console_sem(void) +void acquire_console_mutex(void) { BUG_ON(in_interrupt()); - down(&console_sem); + mutex_lock(&console_mutex); if (console_suspended) return; console_locked = 1; console_may_schedule = 1; } -EXPORT_SYMBOL(acquire_console_sem); +EXPORT_SYMBOL(acquire_console_mutex); -int try_acquire_console_sem(void) +int try_acquire_console_mutex(void) { - if (down_trylock(&console_sem)) + if (!mutex_trylock(&console_mutex)) return -1; if (console_suspended) { - up(&console_sem); + mutex_unlock(&console_mutex); return -1; } console_locked = 1; console_may_schedule = 0; return 0; } -EXPORT_SYMBOL(try_acquire_console_sem); +EXPORT_SYMBOL(try_acquire_console_mutex); int is_console_locked(void) { @@ -1026,53 +1031,75 @@ void wake_up_klogd(void) } /** - * release_console_sem - unlock the console system + * release_console_mutex - unlock the console system * - * Releases the semaphore which the caller holds on the console system + * Releases the mutex which the caller holds on the console system * and the console driver list. * - * While the semaphore was held, console output may have been buffered - * by printk(). If this is the case, release_console_sem() emits - * the output prior to releasing the semaphore. + * While the mutex was held, console output may have been buffered + * by printk(). If this is the case, release_console_mutex() emits + * the output prior to releasing the mutex. * * If there is output waiting for klogd, we wake it up. * - * release_console_sem() may be called from any context. + * release_console_mutex() may be called from any context. 
*/ -void release_console_sem(void) +void release_console_mutex(void) { unsigned long flags; unsigned _con_start, _log_end; unsigned wake_klogd = 0; if (console_suspended) { - up(&console_sem); + mutex_unlock(&console_mutex); return; } console_may_schedule = 0; for ( ; ; ) { - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); wake_klogd |= log_start - log_end; if (con_start == log_end) break; /* Nothing to print */ _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ - spin_unlock(&logbuf_lock); + + /* + * on PREEMPT_RT, call console drivers with + * interrupts enabled (if printk was called + * with interrupts disabled): + */ +#ifdef CONFIG_PREEMPT_RT + raw_spin_unlock_irqrestore(&logbuf_lock, flags); +#else + raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ +#endif call_console_drivers(_con_start, _log_end); start_critical_timings(); +#ifndef CONFIG_PREEMPT_RT local_irq_restore(flags); +#endif } console_locked = 0; - up(&console_sem); - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + mutex_unlock(&console_mutex); + + /* + * On PREEMPT_RT kernels __wake_up may sleep, so wake syslogd + * up only if we are in a preemptible section. We normally dont + * printk from non-preemptible sections so this is for the emergency + * case only. + */ +#ifdef CONFIG_PREEMPT_RT + if (!in_atomic() && !irqs_disabled()) +#endif if (wake_klogd) wake_up_klogd(); } -EXPORT_SYMBOL(release_console_sem); +EXPORT_SYMBOL(release_console_mutex); /** * console_conditional_schedule - yield the CPU if required @@ -1081,7 +1108,7 @@ EXPORT_SYMBOL(release_console_sem); * if this CPU should yield the CPU to another task, do * so here. * - * Must be called within acquire_console_sem(). + * Must be called within acquire_console_mutex(). 
*/ void __sched console_conditional_schedule(void) { @@ -1099,17 +1126,17 @@ void console_unblank(void) * oops_in_progress is set to 1.. */ if (oops_in_progress) { - if (down_trylock(&console_sem) != 0) + if (!mutex_trylock(&console_mutex)) return; } else - acquire_console_sem(); + acquire_console_mutex(); console_locked = 1; console_may_schedule = 0; for_each_console(c) if ((c->flags & CON_ENABLED) && c->unblank) c->unblank(); - release_console_sem(); + release_console_mutex(); } /* @@ -1120,7 +1147,7 @@ struct tty_driver *console_device(int *index) struct console *c; struct tty_driver *driver = NULL; - acquire_console_sem(); + acquire_console_mutex(); for_each_console(c) { if (!c->device) continue; @@ -1128,7 +1155,7 @@ struct tty_driver *console_device(int *index) if (driver) break; } - release_console_sem(); + release_console_mutex(); return driver; } @@ -1139,17 +1166,17 @@ struct tty_driver *console_device(int *index) */ void console_stop(struct console *console) { - acquire_console_sem(); + acquire_console_mutex(); console->flags &= ~CON_ENABLED; - release_console_sem(); + release_console_mutex(); } EXPORT_SYMBOL(console_stop); void console_start(struct console *console) { - acquire_console_sem(); + acquire_console_mutex(); console->flags |= CON_ENABLED; - release_console_sem(); + release_console_mutex(); } EXPORT_SYMBOL(console_start); @@ -1271,7 +1298,7 @@ void register_console(struct console *newcon) * Put this console in the list - keep the * preferred driver at the head of the list. */ - acquire_console_sem(); + acquire_console_mutex(); if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { newcon->next = console_drivers; console_drivers = newcon; @@ -1283,14 +1310,14 @@ void register_console(struct console *newcon) } if (newcon->flags & CON_PRINTBUFFER) { /* - * release_console_sem() will print out the buffered messages + * release_console_mutex() will print out the buffered messages * for us. 
*/ - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); con_start = log_start; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); } - release_console_sem(); + release_console_mutex(); /* * By unregistering the bootconsoles after we enable the real console @@ -1326,7 +1353,7 @@ int unregister_console(struct console *console) return braille_unregister_console(console); #endif - acquire_console_sem(); + acquire_console_mutex(); if (console_drivers == console) { console_drivers=console->next; res = 0; @@ -1348,7 +1375,7 @@ int unregister_console(struct console *console) if (console_drivers != NULL && console->flags & CON_CONSDEV) console_drivers->flags |= CON_CONSDEV; - release_console_sem(); + release_console_mutex(); return res; } EXPORT_SYMBOL(unregister_console); @@ -1407,6 +1434,21 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies, } EXPORT_SYMBOL(printk_timed_ratelimit); +static DEFINE_RAW_SPINLOCK(warn_lock); + +void __WARN_ON(const char *func, const char *file, const int line) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&warn_lock, flags); + printk("%s/%d[CPU#%d]: BUG in %s at %s:%d\n", + current->comm, current->pid, raw_smp_processor_id(), + func, file, line); + dump_stack(); + raw_spin_unlock_irqrestore(&warn_lock, flags); +} +EXPORT_SYMBOL(__WARN_ON); + static DEFINE_SPINLOCK(dump_list_lock); static LIST_HEAD(dump_list); @@ -1497,10 +1539,10 @@ void kmsg_dump(enum kmsg_dump_reason reason) /* Theoretically, the log could move on after we do this, but there's not a lot we can do about that. The new messages will overwrite the start of what we dump. 
*/ - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); end = log_end & LOG_BUF_MASK; chars = logged_chars; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); if (logged_chars > end) { s1 = log_buf + log_buf_len - logged_chars + end; diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9bb5217..81a32e4 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -806,7 +806,7 @@ rcu_torture_reader(void *arg) if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); - schedule_timeout_interruptible(HZ); + schedule_timeout_interruptible(round_jiffies_relative(HZ)); continue; } if (p->rtort_mbtest == 0) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 53ae959..0b4905b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -66,11 +66,11 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; .signaled = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ - .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ + .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \ .orphan_cbs_list = NULL, \ .orphan_cbs_tail = &name.orphan_cbs_list, \ .orphan_qlen = 0, \ - .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ + .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ } @@ -439,10 +439,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp) /* Only let one CPU complain about others per time interval. */ - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); delta = jiffies - rsp->jiffies_stall; if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; @@ -452,13 +452,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp) * due to CPU offlining. 
*/ rcu_print_task_stall(rnp); - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); /* OK, time to rat on our buddy... */ printk(KERN_ERR "INFO: RCU detected CPU stalls:"); rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); rcu_print_task_stall(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); if (rnp->qsmask == 0) continue; for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) @@ -481,11 +483,11 @@ static void print_cpu_stall(struct rcu_state *rsp) smp_processor_id(), jiffies - rsp->gp_start); trigger_all_cpu_backtrace(); - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); if ((long)(jiffies - rsp->jiffies_stall) >= 0) rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); set_need_resched(); /* kick ourselves to get things going. */ } @@ -545,12 +547,12 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ - !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ + !raw_spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ local_irq_restore(flags); return; } __note_new_gpnum(rsp, rnp, rdp); - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -609,12 +611,12 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ - !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ + !raw_spin_trylock(&rnp->lock)) { /* irqs already off, retry later. 
*/ local_irq_restore(flags); return; } __rcu_process_gp_end(rsp, rnp, rdp); - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -661,10 +663,10 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) if (!cpu_needs_another_gp(rsp, rdp)) { if (rnp->completed == rsp->completed) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* * Propagate new ->completed value to rcu_node structures @@ -672,9 +674,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) * of the next grace period to process their callbacks. */ rcu_for_each_node_breadth_first(rsp, rnp) { - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->completed = rsp->completed; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } local_irq_restore(flags); return; @@ -695,15 +697,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp->completed = rsp->completed; rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ rcu_start_gp_per_cpu(rsp, rnp, rdp); - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - spin_unlock(&rnp->lock); /* leave irqs disabled. */ + raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ /* Exclude any concurrent CPU-hotplug operations. */ - spin_lock(&rsp->onofflock); /* irqs already disabled. */ + raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ /* * Set the quiescent-state-needed bits in all the rcu_node @@ -723,21 +725,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) * irqs disabled. */ rcu_for_each_node_breadth_first(rsp, rnp) { - spin_lock(&rnp->lock); /* irqs already disabled. 
*/ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } rnp = rcu_get_root(rsp); - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ - spin_unlock(&rnp->lock); /* irqs remain disabled. */ - spin_unlock_irqrestore(&rsp->onofflock, flags); + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } /* @@ -776,14 +778,14 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, if (!(rnp->qsmask & mask)) { /* Our bit has already been cleared, so done. */ - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } rnp->qsmask &= ~mask; if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { /* Other bits still set at this level, so done. 
*/ - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } mask = rnp->grpmask; @@ -793,10 +795,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, break; } - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); rnp_c = rnp; rnp = rnp->parent; - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); WARN_ON_ONCE(rnp_c->qsmask); } @@ -825,7 +827,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las struct rcu_node *rnp; rnp = rdp->mynode; - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); if (lastcomp != rnp->completed) { /* @@ -837,12 +839,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las * race occurred. */ rdp->passed_quiesc = 0; /* try again later! */ - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } mask = rdp->grpmask; if ((rnp->qsmask & mask) == 0) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { rdp->qs_pending = 0; @@ -906,7 +908,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) if (rdp->nxtlist == NULL) return; /* irqs disabled, so comparison is stable. */ - spin_lock(&rsp->onofflock); /* irqs already disabled. */ + raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ *rsp->orphan_cbs_tail = rdp->nxtlist; rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; rdp->nxtlist = NULL; @@ -914,7 +916,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) rdp->nxttail[i] = &rdp->nxtlist; rsp->orphan_qlen += rdp->qlen; rdp->qlen = 0; - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. 
*/ } /* @@ -925,10 +927,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) unsigned long flags; struct rcu_data *rdp; - spin_lock_irqsave(&rsp->onofflock, flags); + raw_spin_lock_irqsave(&rsp->onofflock, flags); rdp = rsp->rda[smp_processor_id()]; if (rsp->orphan_cbs_list == NULL) { - spin_unlock_irqrestore(&rsp->onofflock, flags); + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); return; } *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; @@ -937,7 +939,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) rsp->orphan_cbs_list = NULL; rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; rsp->orphan_qlen = 0; - spin_unlock_irqrestore(&rsp->onofflock, flags); + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } /* @@ -953,23 +955,23 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) struct rcu_node *rnp; /* Exclude any attempts to start a new grace period. */ - spin_lock_irqsave(&rsp->onofflock, flags); + raw_spin_lock_irqsave(&rsp->onofflock, flags); /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ do { - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; if (rnp->qsmaskinit != 0) { if (rnp != rdp->mynode) - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } if (rnp == rdp->mynode) need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); else - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ mask = rnp->grpmask; rnp = rnp->parent; } while (rnp != NULL); @@ -980,12 +982,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) * because invoking rcu_report_unblock_qs_rnp() with ->onofflock * held leads to deadlock. */ - spin_unlock(&rsp->onofflock); /* irqs remain disabled. 
*/ + raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ rnp = rdp->mynode; if (need_report & RCU_OFL_TASKS_NORM_GP) rcu_report_unblock_qs_rnp(rnp, flags); else - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp); @@ -1158,13 +1160,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, rcu_for_each_leaf_node(rsp, rnp) { mask = 0; - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); if (rnp->completed != lastcomp) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return 1; } if (rnp->qsmask == 0) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); continue; } cpu = rnp->grplo; @@ -1179,7 +1181,7 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, rcu_report_qs_rnp(mask, rsp, rnp, flags); continue; } - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } return 0; } @@ -1198,7 +1200,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) if (!rcu_gp_in_progress(rsp)) return; /* No grace period in progress, nothing to force. */ - if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { + if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ return; /* Someone else is already on the job. */ } @@ -1206,16 +1208,16 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) (long)(rsp->jiffies_force_qs - jiffies) >= 0) goto unlock_ret; /* no emergency and done recently. 
*/ rsp->n_force_qs++; - spin_lock(&rnp->lock); + raw_spin_lock(&rnp->lock); lastcomp = rsp->gpnum - 1; signaled = rsp->signaled; rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; if(!rcu_gp_in_progress(rsp)) { rsp->n_force_qs_ngp++; - spin_unlock(&rnp->lock); + raw_spin_unlock(&rnp->lock); goto unlock_ret; /* no GP in progress, time updated. */ } - spin_unlock(&rnp->lock); + raw_spin_unlock(&rnp->lock); switch (signaled) { case RCU_GP_IDLE: case RCU_GP_INIT: @@ -1237,7 +1239,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) /* Update state, record completion counter. */ forcenow = 0; - spin_lock(&rnp->lock); + raw_spin_lock(&rnp->lock); if (lastcomp + 1 == rsp->gpnum && lastcomp == rsp->completed && rsp->signaled == signaled) { @@ -1245,7 +1247,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) rsp->completed_fqs = lastcomp; forcenow = signaled == RCU_SAVE_COMPLETED; } - spin_unlock(&rnp->lock); + raw_spin_unlock(&rnp->lock); if (!forcenow) break; /* fall into next case. */ @@ -1262,7 +1264,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) break; } unlock_ret: - spin_unlock_irqrestore(&rsp->fqslock, flags); + raw_spin_unlock_irqrestore(&rsp->fqslock, flags); } #else /* #ifdef CONFIG_SMP */ @@ -1304,7 +1306,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) /* Does this CPU require a not-yet-started grace period? */ if (cpu_needs_another_gp(rsp, rdp)) { - spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); + raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); rcu_start_gp(rsp, flags); /* releases above lock */ } @@ -1369,7 +1371,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), unsigned long nestflag; struct rcu_node *rnp_root = rcu_get_root(rsp); - spin_lock_irqsave(&rnp_root->lock, nestflag); + raw_spin_lock_irqsave(&rnp_root->lock, nestflag); rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. 
*/ } @@ -1659,7 +1661,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) @@ -1669,7 +1671,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->dynticks = &per_cpu(rcu_dynticks, cpu); #endif /* #ifdef CONFIG_NO_HZ */ rdp->cpu = cpu; - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -1687,7 +1689,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); rdp->passed_quiesc = 0; /* We could be racing with new GP, */ rdp->qs_pending = 1; /* so set up to respond to current GP. */ rdp->beenonline = 1; /* We have now been online. */ @@ -1695,7 +1697,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* * A new grace period might start here. If so, we won't be part @@ -1703,14 +1705,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) */ /* Exclude any attempts to start a new GP on large systems. */ - spin_lock(&rsp->onofflock); /* irqs already disabled. */ + raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ /* Add CPU to rcu_node bitmasks. */ rnp = rdp->mynode; mask = rdp->grpmask; do { /* Exclude any attempts to start a new GP on small systems. */ - spin_lock(&rnp->lock); /* irqs already disabled. 
*/ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->qsmaskinit |= mask; mask = rnp->grpmask; if (rnp == rdp->mynode) { @@ -1718,11 +1720,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rdp->completed = rnp->completed; rdp->passed_quiesc_completed = rnp->completed - 1; } - spin_unlock(&rnp->lock); /* irqs already disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ rnp = rnp->parent; } while (rnp != NULL && !(rnp->qsmaskinit & mask)); - spin_unlock_irqrestore(&rsp->onofflock, flags); + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } static void __cpuinit rcu_online_cpu(int cpu) @@ -1823,7 +1825,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) cpustride *= rsp->levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { - spin_lock_init(&rnp->lock); + raw_spin_lock_init(&rnp->lock); lockdep_set_class(&rnp->lock, &rcu_node_class[i]); rnp->gpnum = 0; rnp->qsmask = 0; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index d2a0046..4613de1 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -90,7 +90,7 @@ struct rcu_dynticks { * Definition for node within the RCU grace-period-detection hierarchy. */ struct rcu_node { - spinlock_t lock; /* Root rcu_node's lock protects some */ + raw_spinlock_t lock; /* Root rcu_node's lock protects some */ /* rcu_state fields as well as following. */ long gpnum; /* Current grace period for this node. */ /* This will either be equal to or one */ @@ -282,7 +282,7 @@ struct rcu_state { /* End of fields guarded by root rcu_node's lock. */ - spinlock_t onofflock; /* exclude on/offline and */ + raw_spinlock_t onofflock; /* exclude on/offline and */ /* starting new GP. Also */ /* protects the following */ /* orphan_cbs fields. */ @@ -292,7 +292,7 @@ struct rcu_state { /* going offline. */ struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ long orphan_qlen; /* Number of orphaned cbs. 
*/ - spinlock_t fqslock; /* Only one task forcing */ + raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ long completed_fqs; /* Value of completed @ snap. */ /* Protected by fqslock. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 37fbccd..518aca9 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -102,7 +102,7 @@ static void rcu_preempt_note_context_switch(int cpu) /* Possibly blocking in an RCU read-side critical section. */ rdp = rcu_preempt_state.rda[cpu]; rnp = rdp->mynode; - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; t->rcu_blocked_node = rnp; @@ -123,7 +123,7 @@ static void rcu_preempt_note_context_switch(int cpu) WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -180,7 +180,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) struct rcu_node *rnp_p; if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; /* Still need more quiescent states! */ } @@ -197,8 +197,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) /* Report up the rest of the hierarchy. */ mask = rnp->grpmask; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ - spin_lock(&rnp_p->lock); /* irqs already disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_lock(&rnp_p->lock); /* irqs already disabled. 
*/ rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); } @@ -248,10 +248,10 @@ static void rcu_read_unlock_special(struct task_struct *t) */ for (;;) { rnp = t->rcu_blocked_node; - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ if (rnp == t->rcu_blocked_node) break; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } empty = !rcu_preempted_readers(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); @@ -265,7 +265,7 @@ static void rcu_read_unlock_special(struct task_struct *t) * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. */ if (empty) - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); else rcu_report_unblock_qs_rnp(rnp, flags); @@ -306,18 +306,15 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock); */ static void rcu_print_task_stall(struct rcu_node *rnp) { - unsigned long flags; struct list_head *lp; int phase; struct task_struct *t; if (rcu_preempted_readers(rnp)) { - spin_lock_irqsave(&rnp->lock, flags); phase = rnp->gpnum & 0x1; lp = &rnp->blocked_tasks[phase]; list_for_each_entry(t, lp, rcu_node_entry) printk(" P%d", t->pid); - spin_unlock_irqrestore(&rnp->lock, flags); } } @@ -388,11 +385,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, lp_root = &rnp_root->blocked_tasks[i]; while (!list_empty(lp)) { tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); - spin_lock(&rnp_root->lock); /* irqs already disabled */ + raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ list_del(&tp->rcu_node_entry); tp->rcu_blocked_node = rnp_root; list_add(&tp->rcu_node_entry, lp_root); - spin_unlock(&rnp_root->lock); /* irqs remain disabled */ + raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ } } return retval; @@ -516,7 +513,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) unsigned long flags; unsigned long mask; - 
spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); for (;;) { if (!sync_rcu_preempt_exp_done(rnp)) break; @@ -525,12 +522,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) break; } mask = rnp->grpmask; - spin_unlock(&rnp->lock); /* irqs remain disabled */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ rnp = rnp->parent; - spin_lock(&rnp->lock); /* irqs already disabled */ + raw_spin_lock(&rnp->lock); /* irqs already disabled */ rnp->expmask &= ~mask; } - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -545,11 +542,11 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) { int must_wait; - spin_lock(&rnp->lock); /* irqs already disabled */ + raw_spin_lock(&rnp->lock); /* irqs already disabled */ list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); must_wait = rcu_preempted_readers_exp(rnp); - spin_unlock(&rnp->lock); /* irqs remain disabled */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ if (!must_wait) rcu_report_exp_rnp(rsp, rnp); } @@ -594,13 +591,13 @@ void synchronize_rcu_expedited(void) /* force all RCU readers onto blocked_tasks[]. */ synchronize_sched_expedited(); - spin_lock_irqsave(&rsp->onofflock, flags); + raw_spin_lock_irqsave(&rsp->onofflock, flags); /* Initialize ->expmask for all non-leaf rcu_node structures. */ rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->expmask = rnp->qsmaskinit; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } /* Snapshot current state of ->blocked_tasks[] lists. 
*/ @@ -609,7 +606,7 @@ void synchronize_rcu_expedited(void) if (NUM_RCU_NODES > 1) sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); - spin_unlock_irqrestore(&rsp->onofflock, flags); + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ rnp = rcu_get_root(rsp); @@ -734,7 +731,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp) /* Because preemptible RCU does not exist, no quieting of tasks. */ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ diff --git a/kernel/relay.c b/kernel/relay.c index c705a41..4da323b 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -343,6 +343,10 @@ static void wakeup_readers(unsigned long data) { struct rchan_buf *buf = (struct rchan_buf *)data; wake_up_interruptible(&buf->read_wait); + /* + * Stupid polling for now: + */ + mod_timer(&buf->timer, jiffies + 1); } /** @@ -360,6 +364,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); + mod_timer(&buf->timer, jiffies + 1); } else del_timer_sync(&buf->timer); @@ -740,15 +745,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) else buf->early_bytes += buf->chan->subbuf_size - buf->padding[old_subbuf]; - smp_mb(); - if (waitqueue_active(&buf->read_wait)) - /* - * Calling wake_up_interruptible() from here - * will deadlock if we happen to be logging - * from the scheduler (trying to re-grab - * rq->lock), so defer it. 
- */ - mod_timer(&buf->timer, jiffies + 1); } old = buf->data; diff --git a/kernel/res_counter.c b/kernel/res_counter.c index bcdabf3..a340d29 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -14,6 +14,7 @@ #include <linux/res_counter.h> #include <linux/uaccess.h> #include <linux/mm.h> +#include <linux/interrupt.h> void res_counter_init(struct res_counter *counter, struct res_counter *parent) { @@ -44,7 +45,7 @@ int res_counter_charge(struct res_counter *counter, unsigned long val, struct res_counter *c, *u; *limit_fail_at = NULL; - local_irq_save(flags); + local_irq_save_nort(flags); for (c = counter; c != NULL; c = c->parent) { spin_lock(&c->lock); ret = res_counter_charge_locked(c, val); @@ -63,7 +64,7 @@ undo: spin_unlock(&u->lock); } done: - local_irq_restore(flags); + local_irq_restore_nort(flags); return ret; } @@ -80,13 +81,13 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val) unsigned long flags; struct res_counter *c; - local_irq_save(flags); + local_irq_save_nort(flags); for (c = counter; c != NULL; c = c->parent) { spin_lock(&c->lock); res_counter_uncharge_locked(c, val); spin_unlock(&c->lock); } - local_irq_restore(flags); + local_irq_restore_nort(flags); } diff --git a/kernel/rt.c b/kernel/rt.c new file mode 100644 index 0000000..ccbf20f --- /dev/null +++ b/kernel/rt.c @@ -0,0 +1,433 @@ +/* + * kernel/rt.c + * + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * historic credit for proving that Linux spinlocks can be implemented via + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow + * and others) who prototyped it on 2.4 and did lots of comparative + * research and analysis; TimeSys, for proving that you can implement a + * fully preemptible kernel via the use of IRQ threading and mutexes; + * Bill Huey for 
persuasively arguing on lkml that the mutex model is the + * right one; and to MontaVista, who ported pmutexes to 2.6. + * + * This code is a from-scratch implementation and is not based on pmutexes, + * but the idea of converting spinlocks to mutexes is used here too. + * + * lock debugging, locking tree, deadlock detection: + * + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Released under the General Public License (GPL). + * + * Includes portions of the generic R/W semaphore implementation from: + * + * Copyright (c) 2001 David Howells (dhowells@redhat.com). + * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> + * - Derived also from comments by Linus + * + * Pending ownership of locks and ownership stealing: + * + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt + * + * (also by Steven Rostedt) + * - Converted single pi_lock to individual task locks. + * + * By Esben Nielsen: + * Doing priority inheritance with help of the scheduler. + * + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * - major rework based on Esben Nielsens initial patch + * - replaced thread_info references by task_struct refs + * - removed task->pending_owner dependency + * - BKL drop/reacquire for semaphore style locks to avoid deadlocks + * in the scheduler return path as discussed with Steven Rostedt + * + * Copyright (C) 2006, Kihon Technologies Inc. + * Steven Rostedt <rostedt@goodmis.org> + * - debugged and patched Thomas Gleixner's rework. + * - added back the cmpxchg to the rework. + * - turned atomic require back on for SMP. 
+ */ + +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/kallsyms.h> +#include <linux/syscalls.h> +#include <linux/interrupt.h> +#include <linux/plist.h> +#include <linux/fs.h> +#include <linux/futex.h> +#include <linux/hrtimer.h> + +#include "rtmutex_common.h" + +#ifdef CONFIG_PREEMPT_RT +/* + * Unlock these on crash: + */ +void zap_rt_locks(void) +{ + //trace_lock_init(); +} +#endif + +/* + * struct mutex functions + */ +void __mutex_init(struct mutex *lock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + __rt_mutex_init(&lock->lock, name); +} +EXPORT_SYMBOL(__mutex_init); + +void __lockfunc _mutex_lock(struct mutex *lock) +{ + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + rt_mutex_lock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_lock); + +int __lockfunc _mutex_lock_interruptible(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_lock_interruptible(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible); + +int __lockfunc _mutex_lock_killable(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_lock_killable(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_killable); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass) +{ + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + rt_mutex_lock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_lock_nested); + +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass) +{ + int ret; + + 
mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + ret = rt_mutex_lock_interruptible(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible_nested); + +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass) +{ + int ret; + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + ret = rt_mutex_lock_killable(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_killable_nested); +#endif + +int __lockfunc _mutex_trylock(struct mutex *lock) +{ + int ret = rt_mutex_trylock(&lock->lock); + + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(_mutex_trylock); + +void __lockfunc _mutex_unlock(struct mutex *lock) +{ + mutex_release(&lock->dep_map, 1, _RET_IP_); + rt_mutex_unlock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_unlock); + +/* + * rwlock_t functions + */ +int __lockfunc rt_write_trylock(rwlock_t *rwlock) +{ + int ret = rt_mutex_trylock(&rwlock->lock); + + if (ret) + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_write_trylock); + +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags) +{ + *flags = 0; + return rt_write_trylock(rwlock); +} +EXPORT_SYMBOL(rt_write_trylock_irqsave); + +int __lockfunc rt_read_trylock(rwlock_t *rwlock) +{ + struct rt_mutex *lock = &rwlock->lock; + int ret = 1; + + /* + * recursive read locks succeed when current owns the lock, + * but not when read_depth == 0 which means that the lock is + * write locked. 
+ */ + if (rt_mutex_real_owner(lock) != current) + ret = rt_mutex_trylock(lock); + else if (!rwlock->read_depth) + ret = 0; + + if (ret) { + rwlock->read_depth++; + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); + } + + return ret; +} +EXPORT_SYMBOL(rt_read_trylock); + +void __lockfunc rt_write_lock(rwlock_t *rwlock) +{ + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + __rt_spin_lock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_write_lock); + +void __lockfunc rt_read_lock(rwlock_t *rwlock) +{ + struct rt_mutex *lock = &rwlock->lock; + + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); + + /* + * recursive read locks succeed when current owns the lock + */ + if (rt_mutex_real_owner(lock) != current) + __rt_spin_lock(lock); + rwlock->read_depth++; +} + +EXPORT_SYMBOL(rt_read_lock); + +void __lockfunc rt_write_unlock(rwlock_t *rwlock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); + __rt_spin_unlock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_write_unlock); + +void __lockfunc rt_read_unlock(rwlock_t *rwlock) +{ + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); + + /* Release the lock only when read_depth is down to 0 */ + if (--rwlock->read_depth == 0) + __rt_spin_unlock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_read_unlock); + +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock) +{ + rt_write_lock(rwlock); + + return 0; +} +EXPORT_SYMBOL(rt_write_lock_irqsave); + +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock) +{ + rt_read_lock(rwlock); + + return 0; +} +EXPORT_SYMBOL(rt_read_lock_irqsave); + +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock)); + lockdep_init_map(&rwlock->dep_map, name, key, 0); +#endif + __rt_mutex_init(&rwlock->lock, name); + rwlock->read_depth = 0; +} 
+EXPORT_SYMBOL(__rt_rwlock_init); + +/* + * rw_semaphores + */ + +void rt_up_write(struct rw_semaphore *rwsem) +{ + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + rt_mutex_unlock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_up_write); + +void rt_up_read(struct rw_semaphore *rwsem) +{ + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + if (--rwsem->read_depth == 0) + rt_mutex_unlock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_up_read); + +/* + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +void rt_downgrade_write(struct rw_semaphore *rwsem) +{ + BUG_ON(rt_mutex_real_owner(&rwsem->lock) != current); + rwsem->read_depth = 1; +} +EXPORT_SYMBOL(rt_downgrade_write); + +int rt_down_write_trylock(struct rw_semaphore *rwsem) +{ + int ret = rt_mutex_trylock(&rwsem->lock); + + if (ret) + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(rt_down_write_trylock); + +void rt_down_write(struct rw_semaphore *rwsem) +{ + rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_); + rt_mutex_lock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_down_write); + +void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass) +{ + rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_); + rt_mutex_lock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_down_write_nested); + +int rt_down_read_trylock(struct rw_semaphore *rwsem) +{ + struct rt_mutex *lock = &rwsem->lock; + int ret = 1; + + /* + * recursive read locks succeed when current owns the rwsem, + * but not when read_depth == 0 which means that the rwsem is + * write locked. 
+ */ + if (rt_mutex_real_owner(lock) != current) + ret = rt_mutex_trylock(&rwsem->lock); + else if (!rwsem->read_depth) + ret = 0; + + if (ret) { + rwsem->read_depth++; + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + } + return ret; +} +EXPORT_SYMBOL(rt_down_read_trylock); + +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass) +{ + struct rt_mutex *lock = &rwsem->lock; + + rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_); + + if (rt_mutex_real_owner(lock) != current) + rt_mutex_lock(&rwsem->lock); + rwsem->read_depth++; +} + +void rt_down_read(struct rw_semaphore *rwsem) +{ + __rt_down_read(rwsem, 0); +} +EXPORT_SYMBOL(rt_down_read); + +void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass) +{ + __rt_down_read(rwsem, subclass); +} +EXPORT_SYMBOL(rt_down_read_nested); + +void __rt_rwsem_init(struct rw_semaphore *rwsem, char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem)); + lockdep_init_map(&rwsem->dep_map, name, key, 0); +#endif + __rt_mutex_init(&rwsem->lock, name); + rwsem->read_depth = 0; +} +EXPORT_SYMBOL(__rt_rwsem_init); + +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 
ddabb54..e7e6314 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -29,61 +29,6 @@ #include "rtmutex_common.h" -# define TRACE_WARN_ON(x) WARN_ON(x) -# define TRACE_BUG_ON(x) BUG_ON(x) - -# define TRACE_OFF() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - if (raw_spin_is_locked(&current->pi_lock)) \ - raw_spin_unlock(&current->pi_lock); \ - } \ -} while (0) - -# define TRACE_OFF_NOLOCK() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - } \ -} while (0) - -# define TRACE_BUG_LOCKED() \ -do { \ - TRACE_OFF(); \ - BUG(); \ -} while (0) - -# define TRACE_WARN_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) { \ - TRACE_OFF(); \ - WARN_ON(1); \ - } \ -} while (0) - -# define TRACE_BUG_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) \ - TRACE_BUG_LOCKED(); \ -} while (0) - -#ifdef CONFIG_SMP -# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) -#else -# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) -#endif - -/* - * deadlock detection flag.
We turn it off when we detect - * the first problem because we dont want to recurse back - * into the tracing code when doing error printk or - * executing a BUG(): - */ -static int rt_trace_on = 1; - static void printk_task(struct task_struct *p) { if (p) @@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) void rt_mutex_debug_task_free(struct task_struct *task) { - WARN_ON(!plist_head_empty(&task->pi_waiters)); - WARN_ON(task->pi_blocked_on); + DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } /* @@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, { struct task_struct *task; - if (!rt_trace_on || detect || !act_waiter) + if (!debug_locks || detect || !act_waiter) return; task = rt_mutex_owner(act_waiter->lock); @@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) { struct task_struct *task; - if (!waiter->deadlock_lock || !rt_trace_on) + if (!waiter->deadlock_lock || !debug_locks) return; rcu_read_lock(); @@ -149,7 +94,8 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) return; } - TRACE_OFF_NOLOCK(); + if (!debug_locks_off()) + return; printk("\n============================================\n"); printk( "[ BUG: circular locking deadlock detected! ]\n"); @@ -180,7 +126,6 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("[ turning off deadlock detection." "Please report this trace. 
]\n\n"); - local_irq_disable(); } void debug_rt_mutex_lock(struct rt_mutex *lock) @@ -189,7 +134,8 @@ void debug_rt_mutex_lock(struct rt_mutex *lock) void debug_rt_mutex_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); + if (debug_locks) + DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); } void @@ -199,7 +145,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); + DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); } void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) @@ -213,9 +159,9 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { put_pid(waiter->deadlock_task_pid); - TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); - TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - TRACE_WARN_ON(waiter->task); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + DEBUG_LOCKS_WARN_ON(waiter->task); memset(waiter, 0x22, sizeof(*waiter)); } @@ -231,9 +177,36 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) { +#ifdef CONFIG_DEBUG_PREEMPT + if (atomic_read(&task->lock_count) >= MAX_LOCK_STACK) { + if (!debug_locks_off()) + return; + printk("BUG: %s/%d: lock count overflow!\n", + task->comm, task->pid); + dump_stack(); + return; + } +#ifdef CONFIG_PREEMPT_RT + task->owned_lock[atomic_read(&task->lock_count)] = lock; +#endif + atomic_inc(&task->lock_count); +#endif } void rt_mutex_deadlock_account_unlock(struct task_struct *task) { +#ifdef CONFIG_DEBUG_PREEMPT + if (!atomic_read(&task->lock_count)) { + if (!debug_locks_off()) + return; + printk("BUG: %s/%d: lock count underflow!\n", + task->comm, task->pid); + dump_stack(); + 
return; + } + atomic_dec(&task->lock_count); +#ifdef CONFIG_PREEMPT_RT + task->owned_lock[atomic_read(&task->lock_count)] = NULL; +#endif +#endif } - diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h index 14193d5..b031c8a 100644 --- a/kernel/rtmutex-debug.h +++ b/kernel/rtmutex-debug.h @@ -17,17 +17,17 @@ extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); extern void debug_rt_mutex_lock(struct rt_mutex *lock); extern void debug_rt_mutex_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, - struct task_struct *powner); +extern void +debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner); extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, struct rt_mutex *lock); extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); -# define debug_rt_mutex_reset_waiter(w) \ +# define debug_rt_mutex_reset_waiter(w) \ do { (w)->deadlock_lock = NULL; } while (0) -static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, - int detect) +static inline int +debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, int detect) { - return (waiter != NULL); + return waiter != NULL; } diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a960481..16bfa1c 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -8,12 +8,20 @@ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen * + * Adaptive Spinlocks: + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, + * and Peter Morreale, + * Adaptive Spinlocks simplification: + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com> + * * See Documentation/rt-mutex-design.txt for details. 
*/ #include <linux/spinlock.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/timer.h> +#include <linux/hardirq.h> +#include <linux/semaphore.h> #include "rtmutex_common.h" @@ -97,6 +105,22 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) } #endif +int pi_initialized; + +/* + * we initialize the wait_list runtime. (Could be done build-time and/or + * boot-time.) + */ +static inline void init_lists(struct rt_mutex *lock) +{ + if (unlikely(!lock->wait_list.prio_list.prev)) { + plist_head_init_raw(&lock->wait_list, &lock->wait_lock); +#ifdef CONFIG_DEBUG_RT_MUTEXES + pi_initialized++; +#endif + } +} + /* * Calculate task priority from the waiter list priority * @@ -253,13 +277,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, plist_add(&waiter->list_entry, &lock->wait_list); /* Release the task */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* Grab the next task */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { /* Boost the owner */ @@ -277,10 +301,10 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, __rt_mutex_adjust_prio(task); } - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); top_waiter = rt_mutex_top_waiter(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); if (!detect_deadlock && waiter != top_waiter) goto out_put_task; @@ -301,11 +325,10 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * lock yet]: */ static inline int try_to_steal_lock(struct rt_mutex *lock, - struct task_struct *task) + struct task_struct *task, int mode) { struct task_struct *pendowner = rt_mutex_owner(lock); struct rt_mutex_waiter *next; - unsigned long flags; if (!rt_mutex_owner_pending(lock)) return 0; @@ -313,9 +336,9 
@@ static inline int try_to_steal_lock(struct rt_mutex *lock, if (pendowner == task) return 1; - raw_spin_lock_irqsave(&pendowner->pi_lock, flags); - if (task->prio >= pendowner->prio) { - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); + raw_spin_lock(&pendowner->pi_lock); + if (!lock_is_stealable(task, pendowner, mode)) { + raw_spin_unlock(&pendowner->pi_lock); return 0; } @@ -325,7 +348,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, * priority. */ if (likely(!rt_mutex_has_waiters(lock))) { - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); + raw_spin_unlock(&pendowner->pi_lock); return 1; } @@ -333,7 +356,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, next = rt_mutex_top_waiter(lock); plist_del(&next->pi_list_entry, &pendowner->pi_waiters); __rt_mutex_adjust_prio(pendowner); - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); + raw_spin_unlock(&pendowner->pi_lock); /* * We are going to steal the lock and a waiter was @@ -350,10 +373,10 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, * might be task: */ if (likely(next->task != task)) { - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); plist_add(&next->pi_list_entry, &task->pi_waiters); __rt_mutex_adjust_prio(task); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); } return 1; } @@ -367,7 +390,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, * * Must be called with lock->wait_lock held. */ -static int try_to_take_rt_mutex(struct rt_mutex *lock) +static int do_try_to_take_rt_mutex(struct rt_mutex *lock, int mode) { /* * We have to be careful here if the atomic speedups are @@ -390,7 +413,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ mark_rt_mutex_waiters(lock); - if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) + if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current, mode)) return 0; /* We got the lock. 
*/ @@ -403,6 +426,11 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) return 1; } +static inline int try_to_take_rt_mutex(struct rt_mutex *lock) +{ + return do_try_to_take_rt_mutex(lock, STEAL_NORMAL); +} + /* * Task blocks on lock. * @@ -413,14 +441,13 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, - int detect_deadlock) + int detect_deadlock, unsigned long flags) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; - unsigned long flags; int chain_walk = 0, res; - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; @@ -434,17 +461,17 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, task->pi_blocked_on = waiter; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); plist_add(&waiter->pi_list_entry, &owner->pi_waiters); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) chain_walk = 1; @@ -459,12 +486,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, task); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); return res; } @@ -477,13 +504,13 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * * Called with lock->wait_lock held. 
*/ -static void wakeup_next_waiter(struct rt_mutex *lock) +static void wakeup_next_waiter(struct rt_mutex *lock, int savestate) { struct rt_mutex_waiter *waiter; struct task_struct *pendowner; - unsigned long flags; + struct rt_mutex_waiter *next; - raw_spin_lock_irqsave(&current->pi_lock, flags); + raw_spin_lock(&current->pi_lock); waiter = rt_mutex_top_waiter(lock); plist_del(&waiter->list_entry, &lock->wait_list); @@ -498,9 +525,44 @@ static void wakeup_next_waiter(struct rt_mutex *lock) pendowner = waiter->task; waiter->task = NULL; + /* + * Do the wakeup before the ownership change to give any spinning + * waiter grantees a headstart over the other threads that will + * trigger once owner changes. + */ + if (!savestate) + wake_up_process(pendowner); + else { + /* + * We can skip the actual (expensive) wakeup if the + * waiter is already running, but we have to be careful + * of race conditions because they may be about to sleep. + * + * The waiter-side protocol has the following pattern: + * 1: Set state != RUNNING + * 2: Conditionally sleep if waiter->task != NULL; + * + * And the owner-side has the following: + * A: Set waiter->task = NULL + * B: Conditionally wake if the state != RUNNING + * + * As long as we ensure 1->2 order, and A->B order, we + * will never miss a wakeup. + * + * Therefore, this barrier ensures that waiter->task = NULL + * is visible before we test the pendowner->state. The + * corresponding barrier is in the sleep logic. + */ + smp_mb(); + + /* If !RUNNING && !RUNNING_MUTEX */ + if (pendowner->state & ~TASK_RUNNING_MUTEX) + wake_up_process_mutex(pendowner); + } + rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); - raw_spin_unlock_irqrestore(&current->pi_lock, flags); + raw_spin_unlock(&current->pi_lock); /* * Clear the pi_blocked_on variable and enqueue a possible @@ -509,7 +571,13 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * waiter with higher priority than pending-owner->normal_prio * is blocked on the unboosted (pending) owner.
*/ - raw_spin_lock_irqsave(&pendowner->pi_lock, flags); + + if (rt_mutex_has_waiters(lock)) + next = rt_mutex_top_waiter(lock); + else + next = NULL; + + raw_spin_lock(&pendowner->pi_lock); WARN_ON(!pendowner->pi_blocked_on); WARN_ON(pendowner->pi_blocked_on != waiter); @@ -517,15 +585,10 @@ static void wakeup_next_waiter(struct rt_mutex *lock) pendowner->pi_blocked_on = NULL; - if (rt_mutex_has_waiters(lock)) { - struct rt_mutex_waiter *next; - - next = rt_mutex_top_waiter(lock); + if (next) plist_add(&next->pi_list_entry, &pendowner->pi_waiters); - } - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - wake_up_process(pendowner); + raw_spin_unlock(&pendowner->pi_lock); } /* @@ -534,22 +597,22 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * Must be called with lock->wait_lock held */ static void remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter, + unsigned long flags) { int first = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); - unsigned long flags; int chain_walk = 0; - raw_spin_lock_irqsave(&current->pi_lock, flags); + raw_spin_lock(&current->pi_lock); plist_del(&waiter->list_entry, &lock->wait_list); waiter->task = NULL; current->pi_blocked_on = NULL; - raw_spin_unlock_irqrestore(&current->pi_lock, flags); + raw_spin_unlock(&current->pi_lock); if (first && owner != current) { - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); plist_del(&waiter->pi_list_entry, &owner->pi_waiters); @@ -564,7 +627,7 @@ static void remove_waiter(struct rt_mutex *lock, if (owner->pi_blocked_on) chain_walk = 1; - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); } WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); @@ -575,11 +638,11 @@ static void remove_waiter(struct rt_mutex *lock, /* gets dropped in rt_mutex_adjust_prio_chain()!
*/ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); } /* @@ -600,18 +663,391 @@ void rt_mutex_adjust_pi(struct task_struct *task) return; } - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } +/* + * preemptible spin_lock functions: + */ + +#ifdef CONFIG_PREEMPT_RT + +static inline void +rt_spin_lock_fastlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + /* Temporary HACK! */ + if (likely(!current->in_printk)) + might_sleep(); + else if (in_atomic() || irqs_disabled()) + /* don't grab locks for printk in atomic */ + return; + + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) + rt_mutex_deadlock_account_lock(lock, current); + else + slowfn(lock); +} + +static inline void +rt_spin_lock_fastunlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + /* Temporary HACK! */ + if (unlikely(rt_mutex_owner(lock) != current) && current->in_printk) + /* don't grab locks for printk in atomic */ + return; + + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + rt_mutex_deadlock_account_unlock(current); + else + slowfn(lock); +} + + +#ifdef CONFIG_SMP +static int adaptive_wait(struct rt_mutex_waiter *waiter, + struct task_struct *orig_owner) +{ + for (;;) { + + /* we are the owner? */ + if (!waiter->task) + return 0; + + /* Owner changed? 
Then lets update the original */ + if (orig_owner != rt_mutex_owner(waiter->lock)) + return 0; + + /* Owner went to bed, so should we */ + if (!task_is_current(orig_owner)) + return 1; + + cpu_relax(); + } +} +#else +static int adaptive_wait(struct rt_mutex_waiter *waiter, + struct task_struct *orig_owner) +{ + return 1; +} +#endif + +/* + * The state setting needs to preserve the original state and needs to + * take care of non rtmutex wakeups. + * + * Called with rtmutex->wait_lock held to serialize against rtmutex + * wakeups(). + */ +static inline unsigned long +rt_set_current_blocked_state(unsigned long saved_state) +{ + unsigned long state, block_state; + + /* + * If state is TASK_INTERRUPTIBLE, then we set the state for + * blocking to TASK_INTERRUPTIBLE as well, otherwise we would + * miss real wakeups via wake_up_interruptible(). If such a + * wakeup happens we see the running state and preserve it in + * saved_state. Now we can ignore further wakeups as we will + * return in state running from our "spin" sleep. + */ + if (saved_state == TASK_INTERRUPTIBLE) + block_state = TASK_INTERRUPTIBLE; + else + block_state = TASK_UNINTERRUPTIBLE; + + state = xchg(&current->state, block_state); + /* + * Take care of non rtmutex wakeups. rtmutex wakeups + * or TASK_RUNNING_MUTEX to (UN)INTERRUPTIBLE. + */ + if (state == TASK_RUNNING) + saved_state = TASK_RUNNING; + + return saved_state; +} + +static inline void rt_restore_current_state(unsigned long saved_state) +{ + unsigned long state = xchg(&current->state, saved_state); + + if (state == TASK_RUNNING) + current->state = TASK_RUNNING; +} + +/* + * Slow path lock function spin_lock style: this variant is very + * careful not to miss any non-lock wakeups. + * + * The wakeup side uses wake_up_process_mutex, which, combined with + * the xchg code of this function is a transparent sleep/wakeup + * mechanism nested within any existing sleep/wakeup mechanism.
This + * enables the seemless use of arbitrary (blocking) spinlocks within + * sleep/wakeup event loops. + */ +static void noinline __sched +rt_spin_lock_slowlock(struct rt_mutex *lock) +{ + struct rt_mutex_waiter waiter; + unsigned long saved_state, flags; + struct task_struct *orig_owner; + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + + raw_spin_lock_irqsave(&lock->wait_lock, flags); + init_lists(lock); + + BUG_ON(rt_mutex_owner(lock) == current); + + /* + * Here we save whatever state the task was in originally, + * we'll restore it at the end of the function and we'll take + * any intermediate wakeup into account as well, independently + * of the lock sleep/wakeup mechanism. When we get a real + * wakeup the task->state is TASK_RUNNING and we change + * saved_state accordingly. If we did not get a real wakeup + * then we return with the saved state. We need to be careful + * about original state TASK_INTERRUPTIBLE as well, as we + * could miss a wakeup_interruptible() + */ + saved_state = rt_set_current_blocked_state(current->state); + + for (;;) { + int saved_lock_depth = current->lock_depth; + + /* Try to acquire the lock */ + if (do_try_to_take_rt_mutex(lock, STEAL_LATERAL)) + break; + + /* + * waiter.task is NULL the first time we come here and + * when we have been woken up by the previous owner + * but the lock got stolen by an higher prio task. + */ + if (!waiter.task) { + task_blocks_on_rt_mutex(lock, &waiter, current, 0, + flags); + /* Wakeup during boost ? */ + if (unlikely(!waiter.task)) + continue; + } + + /* + * Prevent schedule() to drop BKL, while waiting for + * the lock ! We restore lock_depth when we come back. 
+ */ + current->lock_depth = -1; + orig_owner = rt_mutex_owner(lock); + get_task_struct(orig_owner); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + debug_rt_mutex_print_deadlock(&waiter); + + if (adaptive_wait(&waiter, orig_owner)) { + put_task_struct(orig_owner); + + if (waiter.task) + schedule_rt_mutex(lock); + } else + put_task_struct(orig_owner); + + raw_spin_lock_irqsave(&lock->wait_lock, flags); + current->lock_depth = saved_lock_depth; + saved_state = rt_set_current_blocked_state(saved_state); + } + + rt_restore_current_state(saved_state); + + /* + * Extremely rare case, if we got woken up by a non-mutex wakeup, + * and we managed to steal the lock despite us not being the + * highest-prio waiter (due to SCHED_OTHER changing prio), then we + * can end up with a non-NULL waiter.task: + */ + if (unlikely(waiter.task)) + remove_waiter(lock, &waiter, flags); + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. We might have to fix that up: + */ + fixup_rt_mutex_waiters(lock); + + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + debug_rt_mutex_free_waiter(&waiter); +} + +/* + * Slow path to release a rt_mutex spin_lock style + */ +static void noinline __sched +rt_spin_lock_slowunlock(struct rt_mutex *lock) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&lock->wait_lock, flags); + + debug_rt_mutex_unlock(lock); + + rt_mutex_deadlock_account_unlock(current); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + return; + } + + wakeup_next_waiter(lock, 1); + + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + /* Undo pi boosting.when necessary */ + rt_mutex_adjust_prio(current); +} + +void __lockfunc rt_spin_lock(spinlock_t *lock) +{ + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); +} +EXPORT_SYMBOL(rt_spin_lock); + +void __lockfunc __rt_spin_lock(struct rt_mutex *lock) +{ + 
rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock); +} +EXPORT_SYMBOL(__rt_spin_lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) +{ + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); +} +EXPORT_SYMBOL(rt_spin_lock_nested); + +#endif + +void __lockfunc rt_spin_unlock(spinlock_t *lock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + spin_release(&lock->dep_map, 1, _RET_IP_); + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(rt_spin_unlock); + +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock) +{ + rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(__rt_spin_unlock); + +/* + * Wait for the lock to get unlocked: instead of polling for an unlock + * (like raw spinlocks do), we lock and unlock, to force the kernel to + * schedule if there's contention: + */ +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock) +{ + spin_lock(lock); + spin_unlock(lock); +} +EXPORT_SYMBOL(rt_spin_unlock_wait); + +int __lockfunc rt_spin_trylock(spinlock_t *lock) +{ + int ret = rt_mutex_trylock(&lock->lock); + + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock); + +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) +{ + int ret; + + *flags = 0; + ret = rt_mutex_trylock(&lock->lock); + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock_irqsave); + +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. 
it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + rt_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + rt_spin_unlock(lock); + return 0; +} +EXPORT_SYMBOL(atomic_dec_and_spin_lock); + +void +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + __rt_mutex_init(&lock->lock, name); +} +EXPORT_SYMBOL(__rt_spin_lock_init); + +#endif + +static inline int rt_release_bkl(struct rt_mutex *lock, unsigned long flags) +{ + int saved_lock_depth = current->lock_depth; + +#ifdef CONFIG_LOCK_KERNEL + current->lock_depth = -1; + /* + * try_to_take_lock set the waiters, make sure it's + * still correct. + */ + fixup_rt_mutex_waiters(lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + mutex_unlock(&kernel_sem); + + raw_spin_lock_irq(&lock->wait_lock); +#endif + return saved_lock_depth; +} + +static inline void rt_reacquire_bkl(int saved_lock_depth) +{ +#ifdef CONFIG_LOCK_KERNEL + mutex_lock(&kernel_sem); + current->lock_depth = saved_lock_depth; +#endif +} + /** * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) + * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter * @detect_deadlock: passed to task_blocks_on_rt_mutex @@ -622,7 +1058,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, struct rt_mutex_waiter *waiter, - int detect_deadlock) + int detect_deadlock, unsigned long flags) { int ret = 0; @@ -652,7 +1088,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, */ if (!waiter->task) { ret = 
task_blocks_on_rt_mutex(lock, waiter, current, - detect_deadlock); + detect_deadlock, flags); /* * If we got woken up by the owner then start loop * all over without going into schedule to try @@ -672,14 +1108,15 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); if (waiter->task) schedule_rt_mutex(lock); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); + set_current_state(state); } @@ -694,20 +1131,29 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, int detect_deadlock) { + int ret = 0, saved_lock_depth = -1; struct rt_mutex_waiter waiter; - int ret = 0; + unsigned long flags; debug_rt_mutex_init_waiter(&waiter); waiter.task = NULL; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); + init_lists(lock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return 0; } + /* + * We drop the BKL here before we go into the wait loop to avoid a + * possible deadlock in the scheduler. 
+ */ + if (unlikely(current->lock_depth >= 0)) + saved_lock_depth = rt_release_bkl(lock, flags); + set_current_state(state); /* Setup the timer, when timeout != NULL */ @@ -718,12 +1164,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, } ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, - detect_deadlock); + detect_deadlock, flags); set_current_state(TASK_RUNNING); if (unlikely(waiter.task)) - remove_waiter(lock, &waiter); + remove_waiter(lock, &waiter, flags); /* * try_to_take_rt_mutex() sets the waiter bit @@ -731,7 +1177,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* Remove pending timer: */ if (unlikely(timeout)) @@ -745,6 +1191,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (unlikely(ret)) rt_mutex_adjust_prio(current); + /* Must we reaquire the BKL? */ + if (unlikely(saved_lock_depth >= 0)) + rt_reacquire_bkl(saved_lock_depth); + debug_rt_mutex_free_waiter(&waiter); return ret; @@ -756,12 +1206,15 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) { + unsigned long flags; int ret = 0; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); if (likely(rt_mutex_owner(lock) != current)) { + init_lists(lock); + ret = try_to_take_rt_mutex(lock); /* * try_to_take_rt_mutex() sets the lock waiters @@ -770,7 +1223,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) fixup_rt_mutex_waiters(lock); } - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return ret; } @@ -781,7 +1234,9 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) static void __sched rt_mutex_slowunlock(struct rt_mutex *lock) { - raw_spin_lock(&lock->wait_lock); + unsigned long flags; + + raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_rt_mutex_unlock(lock); @@ -789,13 +1244,13 @@ rt_mutex_slowunlock(struct 
rt_mutex *lock) if (!rt_mutex_has_waiters(lock)) { lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return; } - wakeup_next_waiter(lock); + wakeup_next_waiter(lock, 0); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* Undo pi boosting if necessary: */ rt_mutex_adjust_prio(current); @@ -857,6 +1312,27 @@ rt_mutex_fastunlock(struct rt_mutex *lock, } /** + * rt_mutex_lock_killable - lock a rt_mutex killable + * + * @lock: the rt_mutex to be locked + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int __sched rt_mutex_lock_killable(struct rt_mutex *lock, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_fastlock(lock, TASK_KILLABLE, + detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); + +/** * rt_mutex_lock - lock a rt_mutex * * @lock: the rt_mutex to be locked @@ -1030,13 +1506,15 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, int detect_deadlock) { + unsigned long flags; int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); mark_rt_mutex_waiters(lock); - if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { + if (!rt_mutex_owner(lock) || + try_to_steal_lock(lock, task, STEAL_NORMAL)) { /* We got the lock for task. 
*/ debug_rt_mutex_lock(lock); rt_mutex_set_owner(lock, task, 0); @@ -1045,7 +1523,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, return 1; } - ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); + ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock, + flags); if (ret && !waiter->task) { /* @@ -1056,7 +1535,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, */ ret = 0; } - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); debug_rt_mutex_print_deadlock(waiter); @@ -1104,19 +1583,20 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int detect_deadlock) { + unsigned long flags; int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); set_current_state(TASK_INTERRUPTIBLE); ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, - detect_deadlock); + detect_deadlock, flags); set_current_state(TASK_RUNNING); if (unlikely(waiter->task)) - remove_waiter(lock, waiter); + remove_waiter(lock, waiter, flags); /* * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might @@ -1124,7 +1604,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* * Readjust priority, when we did not get the lock. 
We might have been diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 97a2f81..4df690c 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -129,6 +129,26 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int detect_deadlock); + +#define STEAL_LATERAL 1 +#define STEAL_NORMAL 0 + +/* + * Note that RT tasks are excluded from lateral-steals to prevent the + * introduction of an unbounded latency + */ +static inline int lock_is_stealable(struct task_struct *task, + struct task_struct *pendowner, int mode) +{ + if (mode == STEAL_NORMAL || rt_task(task)) { + if (task->prio >= pendowner->prio) + return 0; + } else if (task->prio > pendowner->prio) + return 0; + + return 1; +} + #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" #else diff --git a/kernel/rwsem.c b/kernel/rwsem.c index cae050b..6c6e7fa 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -16,20 +16,19 @@ /* * lock for reading */ -void __sched down_read(struct rw_semaphore *sem) +void __sched anon_down_read(struct rw_anon_semaphore *sem) { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } - -EXPORT_SYMBOL(down_read); +EXPORT_SYMBOL(anon_down_read); /* * trylock for reading -- returns 1 if successful, 0 if contention */ -int down_read_trylock(struct rw_semaphore *sem) +int anon_down_read_trylock(struct rw_anon_semaphore *sem) { int ret = __down_read_trylock(sem); @@ -37,26 +36,24 @@ int down_read_trylock(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); return ret; } - -EXPORT_SYMBOL(down_read_trylock); +EXPORT_SYMBOL(anon_down_read_trylock); /* * lock for writing */ -void __sched down_write(struct rw_semaphore *sem) +void __sched anon_down_write(struct rw_anon_semaphore *sem) { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } - 
-EXPORT_SYMBOL(down_write); +EXPORT_SYMBOL(anon_down_write); /* * trylock for writing -- returns 1 if successful, 0 if contention */ -int down_write_trylock(struct rw_semaphore *sem) +int anon_down_write_trylock(struct rw_anon_semaphore *sem) { int ret = __down_write_trylock(sem); @@ -64,37 +61,34 @@ int down_write_trylock(struct rw_semaphore *sem) rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); return ret; } - -EXPORT_SYMBOL(down_write_trylock); +EXPORT_SYMBOL(anon_down_write_trylock); /* * release a read lock */ -void up_read(struct rw_semaphore *sem) +void anon_up_read(struct rw_anon_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); __up_read(sem); } - -EXPORT_SYMBOL(up_read); +EXPORT_SYMBOL(anon_up_read); /* * release a write lock */ -void up_write(struct rw_semaphore *sem) +void anon_up_write(struct rw_anon_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); __up_write(sem); } - -EXPORT_SYMBOL(up_write); +EXPORT_SYMBOL(anon_up_write); /* * downgrade write lock to read lock */ -void downgrade_write(struct rw_semaphore *sem) +void anon_downgrade_write(struct rw_anon_semaphore *sem) { /* * lockdep: a downgraded write will live on as a write @@ -102,46 +96,41 @@ void downgrade_write(struct rw_semaphore *sem) */ __downgrade_write(sem); } - -EXPORT_SYMBOL(downgrade_write); +EXPORT_SYMBOL(anon_downgrade_write); #ifdef CONFIG_DEBUG_LOCK_ALLOC -void down_read_nested(struct rw_semaphore *sem, int subclass) +void anon_down_read_nested(struct rw_anon_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } +EXPORT_SYMBOL(anon_down_read_nested); -EXPORT_SYMBOL(down_read_nested); - -void down_read_non_owner(struct rw_semaphore *sem) +void anon_down_read_non_owner(struct rw_anon_semaphore *sem) { might_sleep(); __down_read(sem); } +EXPORT_SYMBOL(anon_down_read_non_owner); -EXPORT_SYMBOL(down_read_non_owner); - -void down_write_nested(struct rw_semaphore 
*sem, int subclass) +void anon_down_write_nested(struct rw_anon_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } +EXPORT_SYMBOL(anon_down_write_nested); -EXPORT_SYMBOL(down_write_nested); - -void up_read_non_owner(struct rw_semaphore *sem) +void anon_up_read_non_owner(struct rw_anon_semaphore *sem) { __up_read(sem); } - -EXPORT_SYMBOL(up_read_non_owner); +EXPORT_SYMBOL(anon_up_read_non_owner); #endif diff --git a/kernel/sched.c b/kernel/sched.c index 3a8fb30..aaa9918 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4,6 +4,7 @@ * Kernel scheduler and related syscalls * * Copyright (C) 1991-2002 Linus Torvalds + * Copyright (C) 2004 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and * make semaphores SMP safe @@ -16,6 +17,7 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin + * 2004-10-13 Real-Time Preemption support by Ingo Molnar * 2007-04-15 Work begun on replacing all interactivity tuning with a * fair scheduling design by Con Kolivas. 
* 2007-05-05 Load balancing (smp-nice) and other improvements @@ -61,6 +63,7 @@ #include <linux/sysctl.h> #include <linux/syscalls.h> #include <linux/times.h> +#include <linux/kallsyms.h> #include <linux/tsacct_kern.h> #include <linux/kprobes.h> #include <linux/delayacct.h> @@ -106,6 +109,20 @@ #define NICE_0_LOAD SCHED_LOAD_SCALE #define NICE_0_SHIFT SCHED_LOAD_SHIFT +#if (BITS_PER_LONG < 64) +#define JIFFIES_TO_NS64(TIME) \ + ((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ))) + +#define NS64_TO_JIFFIES(TIME) \ + ((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \ + (1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME))) +#else /* BITS_PER_LONG < 64 */ + +#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME) +#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME) + +#endif /* BITS_PER_LONG < 64 */ + /* * These are the 'tuning knobs' of the scheduler: * @@ -131,6 +148,9 @@ static inline int task_has_rt_policy(struct task_struct *p) return rt_policy(p->policy); } +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + /* * This is the priority-queue data structure of the RT scheduling class: */ @@ -182,6 +202,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.irqsafe = 1; rt_b->rt_period_timer.function = sched_rt_period_timer; } @@ -389,6 +410,7 @@ static inline struct task_group *task_group(struct task_struct *p) struct cfs_rq { struct load_weight load; unsigned long nr_running; + unsigned long nr_enqueued; u64 exec_clock; u64 min_vruntime; @@ -466,6 +488,7 @@ struct rt_rq { int overloaded; struct plist_head pushable_tasks; #endif + unsigned long rt_nr_uninterruptible; int rt_throttled; u64 rt_time; u64 rt_runtime; @@ -561,6 +584,8 @@ struct rq { */ unsigned long nr_uninterruptible; + unsigned long switch_timestamp; + unsigned long slice_avg; struct task_struct *curr, *idle; unsigned long next_balance; 
struct mm_struct *prev_mm; @@ -625,9 +650,21 @@ struct rq { /* BKL stats */ unsigned int bkl_count; + + /* RT-overload stats: */ + unsigned long rto_schedule; + unsigned long rto_schedule_tail; + unsigned long rto_wakeup; + unsigned long rto_pulled; + unsigned long rto_pushed; #endif }; +struct task_struct *rq_curr(struct rq *rq) +{ + return rq->curr; +} + static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); static inline @@ -666,6 +703,13 @@ inline void update_rq_clock(struct rq *rq) rq->clock = sched_clock_cpu(cpu_of(rq)); } +#ifndef CONFIG_SMP +int task_is_current(struct task_struct *task) +{ + return task_rq(task)->curr == task; +} +#endif + /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -858,11 +902,23 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } +/* + * We really dont want to do anything complex within switch_to() + * on PREEMPT_RT - this check enforces this. + */ +#ifdef prepare_arch_switch +# ifdef CONFIG_PREEMPT_RT +# error FIXME +# else +# define _finish_arch_switch finish_arch_switch +# endif +#endif + #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif #ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) +# define _finish_arch_switch(prev) do { } while (0) #endif static inline int task_current(struct rq *rq, struct task_struct *p) @@ -870,18 +926,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) return rq->curr == p; } -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline int task_running(struct rq *rq, struct task_struct *p) { +#ifdef CONFIG_SMP + return p->oncpu; +#else return task_current(rq, p); +#endif } +#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. 
+ */ + next->oncpu = 1; +#endif } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->oncpu = 0; +#endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = current; @@ -893,18 +970,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - raw_spin_unlock_irq(&rq->lock); + raw_spin_unlock(&rq->lock); } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->oncpu; -#else - return task_current(rq, p); -#endif -} static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { @@ -934,23 +1003,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) smp_wmb(); prev->oncpu = 0; #endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_disable(); #endif } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* + * Check whether the task is waking, we use this to synchronize against + * ttwu() so that task_cpu() reports a stable number. + * + * We need to make an exception for PF_STARTING tasks because the fork + * path might require task_rq_lock() to work, eg. it can call + * set_cpus_allowed_ptr() from the cpuset clone_ns code. + */ +static inline int task_is_waking(struct task_struct *p) +{ + return unlikely((p->state & TASK_WAKING) && !(p->flags & PF_STARTING)); +} + +/* * __task_rq_lock - lock the runqueue a given task resides on. * Must be called interrupts disabled. 
*/ static inline struct rq *__task_rq_lock(struct task_struct *p) __acquires(rq->lock) { + struct rq *rq; + for (;;) { - struct rq *rq = task_rq(p); + while (task_is_waking(p)) + cpu_relax(); + rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) + if (likely(rq == task_rq(p) && !task_is_waking(p))) return rq; raw_spin_unlock(&rq->lock); } @@ -967,10 +1053,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) struct rq *rq; for (;;) { + while (task_is_waking(p)) + cpu_relax(); local_irq_save(*flags); rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) + if (likely(rq == task_rq(p) && !task_is_waking(p))) return rq; raw_spin_unlock_irqrestore(&rq->lock, *flags); } @@ -1147,6 +1235,7 @@ static void init_rq_hrtick(struct rq *rq) hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.irqsafe = 1; } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@ -1222,7 +1311,7 @@ void wake_up_idle_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); - if (cpu == smp_processor_id()) + if (cpu == raw_smp_processor_id()) return; /* @@ -1390,7 +1479,8 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup, + bool head); /* * runqueue iterator, to support SMP load-balancing between different @@ -1883,13 +1973,14 @@ static void update_avg(u64 *avg, u64 sample) *avg += diff >> 3; } -static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) +static void +enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) { if (wakeup) p->se.start_runtime = p->se.sum_exec_runtime; sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup); + p->sched_class->enqueue_task(rq, p, wakeup, head); 
p->se.on_rq = 1; } @@ -1934,6 +2025,8 @@ static inline int normal_prio(struct task_struct *p) prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); + +// trace_special_pid(p->pid, PRIO(p), __PRIO(prio)); return prio; } @@ -1960,12 +2053,13 @@ static int effective_prio(struct task_struct *p) /* * activate_task - move a task to the runqueue. */ -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) +static void +activate_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) { if (task_contributes_to_load(p)) rq->nr_uninterruptible--; - enqueue_task(rq, p, wakeup); + enqueue_task(rq, p, wakeup, head); inc_nr_running(rq); } @@ -2034,13 +2128,20 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { -#ifdef CONFIG_SCHED_DEBUG +#if defined(CONFIG_SCHED_DEBUG) /* * We should never call set_task_cpu() on a blocked task, * ttwu() will sort out the placement. */ - WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); + if (p->state != TASK_RUNNING && + !(p->state & TASK_WAKING) && + !(p->state & TASK_RUNNING_MUTEX) && + !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)) { + printk(KERN_ERR "%d %s %lx %lx\n", p->pid, p->comm, + (unsigned long) p->state, + (unsigned long) preempt_count()); + WARN_ON(1); + } #endif trace_sched_migrate_task(p, new_cpu); @@ -2219,7 +2320,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * yield - it could be a while. */ if (unlikely(on_rq)) { - schedule_timeout_uninterruptible(1); + ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL); continue; } @@ -2365,7 +2469,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) * returns failure only if the task is already active. 
*/ static int try_to_wake_up(struct task_struct *p, unsigned int state, - int wake_flags) + int wake_flags, int mutex) { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; @@ -2395,12 +2499,8 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, /* * In order to handle concurrent wakeups and release the rq->lock * we put the task in TASK_WAKING state. - * - * First fix up the nr_uninterruptible count: */ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; - p->state = TASK_WAKING; + p->state |= TASK_WAKING; if (p->sched_class->task_waking) p->sched_class->task_waking(rq, p); @@ -2408,14 +2508,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, __task_rq_unlock(rq); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) + if (cpu != orig_cpu) { + /* + * Since we migrate the task without holding any rq->lock, + * we need to be careful with task_rq_lock(), since that + * might end up locking an invalid rq. + */ set_task_cpu(p, cpu); + } - rq = __task_rq_lock(p); + rq = cpu_rq(cpu); + raw_spin_lock(&rq->lock); update_rq_clock(rq); - WARN_ON(p->state != TASK_WAKING); - cpu = task_cpu(p); + /* + * We migrated the task without holding either rq->lock, however + * since the task is not on the task list itself, nobody else + * will try and migrate the task, hence the rq should match the + * cpu we just moved it to. 
+ */ + WARN_ON(task_cpu(p) != cpu); + WARN_ON(!(p->state & TASK_WAKING)); #ifdef CONFIG_SCHEDSTATS schedstat_inc(rq, ttwu_count); @@ -2443,7 +2556,7 @@ out_activate: schedstat_inc(p, se.nr_wakeups_local); else schedstat_inc(p, se.nr_wakeups_remote); - activate_task(rq, p, 1); + activate_task(rq, p, 1, false); success = 1; /* @@ -2466,7 +2579,20 @@ out_running: trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, wake_flags); - p->state = TASK_RUNNING; + /* + * For a mutex wakeup we or TASK_RUNNING_MUTEX to the task + * state to preserve the original state, so a real wakeup + * still can see the (UN)INTERRUPTIBLE bits in the state check + * above. We dont have to worry about the | TASK_RUNNING_MUTEX + * here. The waiter is serialized by the mutex lock and nobody + * else can fiddle with p->state as we hold rq lock. + */ + p->state &= ~TASK_WAKING; + if (mutex) + p->state |= TASK_RUNNING_MUTEX; + else + p->state = TASK_RUNNING; + #ifdef CONFIG_SMP if (p->sched_class->task_woken) p->sched_class->task_woken(rq, p); @@ -2502,13 +2628,31 @@ out: */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_ALL, 0); + return try_to_wake_up(p, TASK_ALL, 0, 0); } EXPORT_SYMBOL(wake_up_process); +int wake_up_process_sync(struct task_struct * p) +{ + return try_to_wake_up(p, TASK_ALL, 1, 0); +} +EXPORT_SYMBOL(wake_up_process_sync); + +int wake_up_process_mutex(struct task_struct * p) +{ + return try_to_wake_up(p, TASK_ALL, 0, 1); +} +EXPORT_SYMBOL(wake_up_process_mutex); + +int wake_up_process_mutex_sync(struct task_struct * p) +{ + return try_to_wake_up(p, TASK_ALL, 1, 1); +} +EXPORT_SYMBOL(wake_up_process_mutex_sync); + int wake_up_state(struct task_struct *p, unsigned int state) { - return try_to_wake_up(p, state, 0); + return try_to_wake_up(p, state, 0, 0); } /* @@ -2575,7 +2719,7 @@ static void __sched_fork(struct task_struct *p) */ void sched_fork(struct task_struct *p, int clone_flags) { - int cpu = get_cpu(); + int cpu; __sched_fork(p); /* 
@@ -2615,16 +2759,24 @@ void sched_fork(struct task_struct *p, int clone_flags) if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; + /* + * task_fork() and set_task_cpu() must be called with + * preemption disabled + */ + cpu = get_cpu(); + if (p->sched_class->task_fork) p->sched_class->task_fork(p); set_task_cpu(p, cpu); + put_cpu(); + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) p->oncpu = 0; #endif #ifdef CONFIG_PREEMPT @@ -2632,8 +2784,6 @@ void sched_fork(struct task_struct *p, int clone_flags) task_thread_info(p)->preempt_count = 1; #endif plist_node_init(&p->pushable_tasks, MAX_PRIO); - - put_cpu(); } /* @@ -2663,11 +2813,17 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) set_task_cpu(p, cpu); #endif - rq = task_rq_lock(p, &flags); + /* + * Since the task is not on the rq and we still have TASK_WAKING set + * nobody else will migrate this task. + */ + rq = cpu_rq(cpu); + raw_spin_lock_irqsave(&rq->lock, flags); + BUG_ON(p->state != TASK_WAKING); p->state = TASK_RUNNING; update_rq_clock(rq); - activate_task(rq, p, 0); + activate_task(rq, p, 0, false); trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -2707,8 +2863,17 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr) struct preempt_notifier *notifier; struct hlist_node *node; + if (hlist_empty(&curr->preempt_notifiers)) + return; + + /* + * The KVM sched in notifier expects to be called with + * interrupts enabled. 
+ */ + local_irq_enable(); hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) notifier->ops->sched_in(notifier, raw_smp_processor_id()); + local_irq_disable(); } static void @@ -2793,13 +2958,17 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) * Manfred Spraul <manfred@colorfullife.com> */ prev_state = prev->state; - finish_arch_switch(prev); + _finish_arch_switch(prev); perf_event_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); + /* + * Delay the final freeing of the mm or task, so that we dont have + * to do complex work from within the scheduler: + */ if (mm) - mmdrop(mm); + mmdrop_delayed(mm); if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this @@ -2853,8 +3022,10 @@ static inline void post_schedule(struct rq *rq) asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq = this_rq(); + struct rq *rq; + preempt_disable(); + rq = this_rq(); finish_task_switch(rq, prev); /* @@ -2863,9 +3034,14 @@ asmlinkage void schedule_tail(struct task_struct *prev) */ post_schedule(rq); + __preempt_enable_no_resched(); + local_irq_enable(); + #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); +#else + preempt_check_resched(); #endif if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); @@ -2913,6 +3089,11 @@ context_switch(struct rq *rq, struct task_struct *prev, spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #endif +#ifdef CURRENT_PTR + barrier(); + *current_ptr = next; + *current_ti_ptr = next->thread_info; +#endif /* Here we just switch the register state and the stack. 
*/ switch_to(prev, next, prev); @@ -2959,6 +3140,11 @@ unsigned long nr_uninterruptible(void) return sum; } +unsigned long nr_uninterruptible_cpu(int cpu) +{ + return cpu_rq(cpu)->nr_uninterruptible; +} + unsigned long long nr_context_switches(void) { int i; @@ -2977,6 +3163,13 @@ unsigned long nr_iowait(void) for_each_possible_cpu(i) sum += atomic_read(&cpu_rq(i)->nr_iowait); + /* + * Since we read the counters lockless, it might be slightly + * inaccurate. Do not allow it to go below zero though: + */ + if (unlikely((long)sum < 0)) + sum = 0; + return sum; } @@ -3199,7 +3392,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, { deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + activate_task(this_rq, p, 0, false); check_preempt_curr(this_rq, p, 0); } @@ -4119,12 +4312,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, continue; rq = cpu_rq(i); - wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; - wl /= power; + wl = weighted_cpuload(i); + /* + * When comparing with imbalance, use weighted_cpuload() + * which is not scaled with the cpu power. + */ if (capacity && rq->nr_running == 1 && wl > imbalance) continue; + /* + * For the load comparisons with the other cpu's, consider + * the weighted_cpuload() scaled with the cpu power, so that + * the load can be moved away from the cpu that is potentially + * running at a lower capacity. + */ + wl = (wl * SCHED_LOAD_SCALE) / power; + if (wl > max_load) { max_load = wl; busiest = rq; @@ -4846,7 +5050,7 @@ out: */ static void run_rebalance_domains(struct softirq_action *h) { - int this_cpu = smp_processor_id(); + int this_cpu = raw_smp_processor_id(); struct rq *this_rq = cpu_rq(this_cpu); enum cpu_idle_type idle = this_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE; @@ -5120,9 +5324,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset, /* Add system time to cpustat. 
*/ tmp = cputime_to_cputime64(cputime); - if (hardirq_count() - hardirq_offset) + if ((hardirq_count() - hardirq_offset) || + (p->extra_flags & PFE_HARDIRQ)) cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) + else if (softirq_count() || (p->extra_flags & PFE_SOFTIRQ)) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); else cpustat->system = cputime64_add(cpustat->system, tmp); @@ -5303,10 +5508,13 @@ void scheduler_tick(void) sched_clock_tick(); + BUG_ON(!irqs_disabled()); + raw_spin_lock(&rq->lock); update_rq_clock(rq); update_cpu_load(rq); - curr->sched_class->task_tick(rq, curr, 0); + if (curr != rq->idle && curr->se.on_rq) + curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); perf_event_task_tick(curr, cpu); @@ -5327,6 +5535,19 @@ notrace unsigned long get_parent_ip(unsigned long addr) return addr; } +#ifdef CONFIG_DEBUG_PREEMPT +void notrace preempt_enable_no_resched(void) +{ + barrier(); + dec_preempt_count(); + + WARN_ONCE(!preempt_count(), + KERN_ERR "BUG: %s:%d task might have lost a preemption check!\n", + current->comm, current->pid); +} +EXPORT_SYMBOL(preempt_enable_no_resched); +#endif + #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) @@ -5383,8 +5604,8 @@ static noinline void __schedule_bug(struct task_struct *prev) { struct pt_regs *regs = get_irq_regs(); - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); + printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d, CPU#%d\n", + prev->comm, preempt_count(), prev->pid, smp_processor_id()); debug_show_held_locks(prev); print_modules(); @@ -5402,12 +5623,14 @@ static noinline void __schedule_bug(struct task_struct *prev) */ static inline void schedule_debug(struct task_struct *prev) { +// WARN_ON(system_state == SYSTEM_BOOTING); + /* * Test if we are atomic. 
Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) + if (unlikely(in_atomic() && !prev->exit_state)) __schedule_bug(prev); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -5478,15 +5701,13 @@ pick_next_task(struct rq *rq) /* * schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; -need_resched: - preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_sched_qs(cpu); @@ -5494,10 +5715,11 @@ need_resched: switch_count = &prev->nivcsw; release_kernel_lock(prev); -need_resched_nonpreemptible: schedule_debug(prev); + preempt_disable(); + if (sched_feat(HRTICK)) hrtick_clear(rq); @@ -5505,7 +5727,8 @@ need_resched_nonpreemptible: update_rq_clock(rq); clear_tsk_need_resched(prev); - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + if (!(prev->state & TASK_RUNNING_MUTEX) && prev->state && + !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) prev->state = TASK_RUNNING; else @@ -5536,24 +5759,29 @@ need_resched_nonpreemptible: */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else - raw_spin_unlock_irq(&rq->lock); + __preempt_enable_no_resched(); + } else { + __preempt_enable_no_resched(); + raw_spin_unlock(&rq->lock); + } post_schedule(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) { - prev = rq->curr; - switch_count = &prev->nivcsw; - goto need_resched_nonpreemptible; - } + reacquire_kernel_lock(current); +} - preempt_enable_no_resched(); +asmlinkage void __sched schedule(void) +{ +need_resched: + local_irq_disable(); + __schedule(); + local_irq_enable(); if (need_resched()) goto need_resched; } EXPORT_SYMBOL(schedule); -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER +#if 
defined(CONFIG_MUTEX_SPIN_ON_OWNER) && !defined(CONFIG_PREEMPT_RT) /* * Look out! "owner" is an entirely speculative pointer * access and not reliable. @@ -5615,6 +5843,35 @@ out: #endif #ifdef CONFIG_PREEMPT + +/* + * Global flag to turn preemption off on a CONFIG_PREEMPT kernel: + */ +int kernel_preemption = 1; + +static int __init preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) { + if (kernel_preemption) { + printk(KERN_INFO "turning off kernel preemption!\n"); + kernel_preemption = 0; + } + return 1; + } + if (!strncmp(str, "on", 2)) { + if (!kernel_preemption) { + printk(KERN_INFO "turning on kernel preemption!\n"); + kernel_preemption = 1; + } + return 1; + } + get_option(&str, &kernel_preemption); + + return 1; +} + +__setup("preempt=", preempt_setup); + /* * this is the entry point to schedule() from in-kernel preemption * off of preempt_enable. Kernel preemptions off return from interrupt @@ -5623,7 +5880,11 @@ out: asmlinkage void __sched preempt_schedule(void) { struct thread_info *ti = current_thread_info(); + struct task_struct *task = current; + int saved_lock_depth; + if (!kernel_preemption) + return; /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. @@ -5632,10 +5893,23 @@ asmlinkage void __sched preempt_schedule(void) return; do { + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); - schedule(); + + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; + __schedule(); + task->lock_depth = saved_lock_depth; + sub_preempt_count(PREEMPT_ACTIVE); + local_irq_enable(); + /* * Check again in case we missed a preemption opportunity * between schedule and now. 
@@ -5646,23 +5920,40 @@ asmlinkage void __sched preempt_schedule(void) EXPORT_SYMBOL(preempt_schedule); /* - * this is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. + * this is the entry point for the IRQ return path. Called with + * interrupts disabled. To avoid infinite irq-entry recursion problems + * with fast-paced IRQ sources we do all of this carefully to never + * enable interrupts again. */ asmlinkage void __sched preempt_schedule_irq(void) { struct thread_info *ti = current_thread_info(); + struct task_struct *task = current; + int saved_lock_depth; - /* Catch callers which need to be fixed */ - BUG_ON(ti->preempt_count || !irqs_disabled()); + if (!kernel_preemption) + return; + /* + * If there is a non-zero preempt_count then just return. + * (interrupts are disabled) + */ + if (unlikely(ti->preempt_count)) + return; do { - add_preempt_count(PREEMPT_ACTIVE); - local_irq_enable(); - schedule(); local_irq_disable(); + add_preempt_count(PREEMPT_ACTIVE); + + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; + __schedule(); + + task->lock_depth = saved_lock_depth; sub_preempt_count(PREEMPT_ACTIVE); /* @@ -5678,7 +5969,7 @@ asmlinkage void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { - return try_to_wake_up(curr->private, mode, wake_flags); + return try_to_wake_up(curr->private, mode, wake_flags, 0); } EXPORT_SYMBOL(default_wake_function); @@ -5721,7 +6012,7 @@ void __wake_up(wait_queue_head_t *q, unsigned int mode, unsigned long flags; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); + __wake_up_common(q, mode, nr_exclusive, 1, key); 
spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(__wake_up); @@ -5801,7 +6092,7 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done++; - __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); + __wake_up_common(&x->wait, TASK_NORMAL, 1, 1, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); @@ -5821,7 +6112,7 @@ void complete_all(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); + __wake_up_common(&x->wait, TASK_NORMAL, 0, 1, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); @@ -6037,31 +6328,51 @@ long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) } EXPORT_SYMBOL(sleep_on_timeout); -#ifdef CONFIG_RT_MUTEXES - /* - * rt_mutex_setprio - set the current priority of a task + * task_setprio - set the current priority of a task * @p: task * @prio: prio value (kernel-internal form) * * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). * - * Used by the rt_mutex code to implement priority inheritance logic. + * Used by the rt_mutex code to implement priority inheritance logic + * and by rcupreempt-boost to boost priorities of tasks sleeping + * with rcu locks. */ -void rt_mutex_setprio(struct task_struct *p, int prio) +void task_setprio(struct task_struct *p, int prio) { unsigned long flags; int oldprio, on_rq, running; struct rq *rq; - const struct sched_class *prev_class = p->sched_class; + const struct sched_class *prev_class; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); + + /* + * Idle task boosting is a nono in general. There is one + * exception, when NOHZ is active: + * + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). 
We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. + */ + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + update_rq_clock(rq); oldprio = p->prio; + prev_class = p->sched_class; on_rq = p->se.on_rq; running = task_current(rq, p); if (on_rq) @@ -6076,18 +6387,20 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; + trace_sched_task_setprio(rq, p, oldprio); + if (running) p->sched_class->set_curr_task(rq); if (on_rq) { - enqueue_task(rq, p, 0); + enqueue_task(rq, p, 0, oldprio < prio); check_class_changed(rq, p, prev_class, oldprio, running); } + +out_unlock: task_rq_unlock(rq, &flags); } -#endif - void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, on_rq; @@ -6123,7 +6436,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (on_rq) { - enqueue_task(rq, p, 0); + enqueue_task(rq, p, 0, false); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -6281,7 +6594,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy, { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; - const struct sched_class *prev_class = p->sched_class; + const struct sched_class *prev_class; struct rq *rq; int reset_on_fork; @@ -6395,12 +6708,31 @@ recheck: p->sched_reset_on_fork = reset_on_fork; oldprio = p->prio; + prev_class = p->sched_class; __setscheduler(rq, p, policy, param->sched_priority); if (running) p->sched_class->set_curr_task(rq); if (on_rq) { - activate_task(rq, p, 0); + /* + * Workaround to make prio ceiling work as expected: + * + * Queue task to head when task is running and task is + * lowering its priority. 
This works around the non- + * availability of a sched_setprio syscall which was + * tinkered into the posix spec to make prio ceiling + * work correctly. + * + * This workaround violates the posix scheduling + * semantics of tail queueing in the case that the + * priority was changed by anything else than + * sched_setprio, but there is no other breakage + * lurking than some specification fetishists going + * berserk on me. + * + * Fixing this in mainline needs more thoughts. + */ + activate_task(rq, p, 0, running && oldprio < p->prio); check_class_changed(rq, p, prev_class, oldprio, running); } @@ -6732,9 +7064,9 @@ SYSCALL_DEFINE0(sched_yield) __release(rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); do_raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); + local_irq_enable(); - schedule(); + preempt_enable_and_schedule(); return 0; } @@ -6746,9 +7078,18 @@ static inline int should_resched(void) static void __cond_resched(void) { - add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + do { + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. 
+ */ + barrier(); + + } while (need_resched()); } int __sched _cond_resched(void) @@ -6789,10 +7130,16 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); +/* + * Voluntarily preempt a process context that has softirqs disabled: + */ int __sched __cond_resched_softirq(void) { - BUG_ON(!in_softirq()); - +#ifndef CONFIG_PREEMPT_SOFTIRQS + WARN_ON_ONCE(!in_softirq()); + if (!in_softirq()) + return 0; +#endif if (should_resched()) { local_bh_enable(); __cond_resched(); @@ -6803,17 +7150,75 @@ int __sched __cond_resched_softirq(void) } EXPORT_SYMBOL(__cond_resched_softirq); +/* + * Voluntarily preempt a softirq context (possible with softirq threading): + */ +int __sched cond_resched_softirq_context(void) +{ + WARN_ON_ONCE(!in_softirq() && !(current->extra_flags & PFE_SOFTIRQ)); + + if (softirq_need_resched() && system_state == SYSTEM_RUNNING) { + raw_local_irq_disable(); + _local_bh_enable(); + raw_local_irq_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(cond_resched_softirq_context); + +#ifdef CONFIG_PREEMPT_VOLUNTARY +int voluntary_preemption = 1; +EXPORT_SYMBOL(voluntary_preemption); + +static int __init voluntary_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + voluntary_preemption = 0; + else + get_option(&str, &voluntary_preemption); + if (!voluntary_preemption) + printk("turning off voluntary preemption!\n"); + + return 1; +} + +__setup("voluntary-preempt=", voluntary_preempt_setup); + +#endif + /** * yield - yield the current processor to other threads. * * This is a shortcut for kernel-space yielding - it marks the * thread runnable and calls sys_sched_yield(). */ -void __sched yield(void) +void __sched __yield(void) { set_current_state(TASK_RUNNING); sys_sched_yield(); } + +void __sched yield(void) +{ + static int once = 1; + + /* + * it's a bug to rely on yield() with RT priorities. We print + * the first occurrence after bootup ... 
this will still give + * us an idea about the scope of the problem, without spamming + * the syslog: + */ + if (once && rt_task(current)) { + once = 0; + printk(KERN_ERR "BUG: %s:%d RT task yield()-ing!\n", + current->comm, current->pid); + dump_stack(); + } + __yield(); +} EXPORT_SYMBOL(yield); /* @@ -6977,6 +7382,7 @@ void sched_show_task(struct task_struct *p) void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; + int do_unlock = 1; #if BITS_PER_LONG == 32 printk(KERN_INFO @@ -6985,7 +7391,16 @@ void show_state_filter(unsigned long state_filter) printk(KERN_INFO " task PC stack pid father\n"); #endif +#ifdef CONFIG_PREEMPT_RT + if (!read_trylock(&tasklist_lock)) { + printk("hm, tasklist_lock write-locked.\n"); + printk("ignoring ...\n"); + do_unlock = 0; + } +#else read_lock(&tasklist_lock); +#endif + do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow @@ -7001,7 +7416,8 @@ void show_state_filter(unsigned long state_filter) #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); #endif - read_unlock(&tasklist_lock); + if (do_unlock) + read_unlock(&tasklist_lock); /* * Only show locks if all tasks are dumped: */ @@ -7037,17 +7453,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) __set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) idle->oncpu = 1; #endif raw_spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! 
*/ -#if defined(CONFIG_PREEMPT) - task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); -#else task_thread_info(idle)->preempt_count = 0; -#endif + /* * The idle tasks have their own, simple scheduling class: */ @@ -7145,27 +7558,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) struct rq *rq; int ret = 0; - /* - * Since we rely on wake-ups to migrate sleeping tasks, don't change - * the ->cpus_allowed mask from under waking tasks, which would be - * possible when we change rq->lock in ttwu(), so synchronize against - * TASK_WAKING to avoid that. - * - * Make an exception for freshly cloned tasks, since cpuset namespaces - * might move the task about, we have to validate the target in - * wake_up_new_task() anyway since the cpu might have gone away. - */ -again: - while (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) - cpu_relax(); - rq = task_rq_lock(p, &flags); - if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) { - task_rq_unlock(rq, &flags); - goto again; - } - if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; @@ -7221,11 +7615,18 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; + unsigned long flags; int ret = 0; if (unlikely(!cpu_active(dest_cpu))) return ret; + /* + * PREEMPT_RT: this relies on write_lock_irq(&tasklist_lock) + * disabling interrupts - which on PREEMPT_RT does not do: + */ + local_irq_save(flags); + rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); @@ -7244,13 +7645,15 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (p->se.on_rq) { deactivate_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); - activate_task(rq_dest, p, 0); + activate_task(rq_dest, p, 0, false); check_preempt_curr(rq_dest, p, 0); } done: ret = 1; fail: double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); + return ret; } @@ -7410,7 +7813,7 @@ 
void sched_idle_next(void) __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); update_rq_clock(rq); - activate_task(rq, p, 0); + activate_task(rq, p, 0, false); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -7427,7 +7830,11 @@ void idle_task_exit(void) if (mm != &init_mm) switch_mm(mm, &init_mm, current); +#ifdef CONFIG_PREEMPT_RT + mmdrop_delayed(mm); +#else mmdrop(mm); +#endif } /* called under rq->lock with disabled interrupts */ @@ -9672,6 +10079,9 @@ void __init sched_init(void) atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current); +#ifdef CONFIG_PREEMPT_RT + printk("Real-Time Preemption Support (C) 2004-2007 Ingo Molnar\n"); +#endif /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, @@ -9704,10 +10114,14 @@ void __init sched_init(void) scheduler_running = 1; } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) static inline int preempt_count_equals(int preempt_offset) { - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + int nested = (preempt_count() & ~PREEMPT_ACTIVE); + +#ifndef CONFIG_PREEMPT_RT + nested += rcu_preempt_depth(); +#endif return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); } @@ -9728,7 +10142,8 @@ void __might_sleep(char *file, int line, int preempt_offset) "BUG: sleeping function called from invalid context at %s:%d\n", file, line); printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + "pcnt: %x %d in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + preempt_count(), preempt_offset, in_atomic(), irqs_disabled(), current->pid, current->comm); @@ -9752,7 +10167,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) deactivate_task(rq, p, 0); __setscheduler(rq, p, SCHED_NORMAL, 0); if (on_rq) { - activate_task(rq, p, 0); + activate_task(rq, p, 0, false); resched_task(rq->curr); } } @@ -10128,7 +10543,7 @@ void 
sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (on_rq) - enqueue_task(rq, tsk, 0); + enqueue_task(rq, tsk, 0, false); task_rq_unlock(rq, &flags); } diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 67f95aa..966e925 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -280,6 +280,19 @@ static void print_cpu(struct seq_file *m, int cpu) P(cpu_load[2]); P(cpu_load[3]); P(cpu_load[4]); +#ifdef CONFIG_PREEMPT_RT + /* Print rt related rq stats */ + P(rt.rt_nr_running); + P(rt.rt_nr_uninterruptible); +# ifdef CONFIG_SCHEDSTATS + P(rto_schedule); + P(rto_schedule_tail); + P(rto_wakeup); + P(rto_pulled); + P(rto_pushed); +# endif +#endif + #undef P #undef PN diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8fe7ee8..cff45e4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -360,6 +360,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); + + cfs_rq->nr_enqueued++; } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -372,6 +374,8 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) } rb_erase(&se->run_node, &cfs_rq->tasks_timeline); + + cfs_rq->nr_enqueued--; } static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) @@ -1053,7 +1057,8 @@ static inline void hrtick_update(struct rq *rq) * increased. 
Here we update the fair scheduling stats and * then put the task into the rbtree: */ -static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) +static void +enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; @@ -1061,7 +1066,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) if (wakeup) flags |= ENQUEUE_WAKEUP; - if (p->state == TASK_WAKING) + if (p->state & TASK_WAKING) flags |= ENQUEUE_MIGRATE; for_each_sched_entity(se) { diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f48328a..9de5f18 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -194,7 +194,7 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return rt_se->my_q; } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se); +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) @@ -204,7 +204,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) if (rt_rq->rt_nr_running) { if (rt_se && !on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se); + enqueue_rt_entity(rt_se, false); if (rt_rq->highest_prio.curr < curr->prio) resched_task(curr); } @@ -783,7 +783,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { int prio = rt_se_prio(rt_se); - WARN_ON(!rt_prio(prio)); + BUG_ON(!rt_prio(prio)); rt_rq->rt_nr_running++; inc_rt_prio(rt_rq, prio); @@ -794,7 +794,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static inline void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { - WARN_ON(!rt_prio(rt_se_prio(rt_se))); + BUG_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); rt_rq->rt_nr_running--; @@ -803,7 +803,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) dec_rt_group(rt_se, rt_rq); } -static void 
__enqueue_rt_entity(struct sched_rt_entity *rt_se) +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -819,7 +819,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; - list_add_tail(&rt_se->run_list, queue); + if (head) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); __set_bit(rt_se_prio(rt_se), array->bitmap); inc_rt_tasks(rt_se, rt_rq); @@ -856,11 +859,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) } } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) { dequeue_rt_stack(rt_se); for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se); + __enqueue_rt_entity(rt_se, head); } static void dequeue_rt_entity(struct sched_rt_entity *rt_se) @@ -871,21 +874,74 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se); + __enqueue_rt_entity(rt_se, false); } } +static inline void incr_rt_nr_uninterruptible(struct task_struct *p, + struct rq *rq) +{ + rq->rt.rt_nr_uninterruptible++; +} + +static inline void decr_rt_nr_uninterruptible(struct task_struct *p, + struct rq *rq) +{ + rq->rt.rt_nr_uninterruptible--; +} + +unsigned long rt_nr_running(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->rt.rt_nr_running; + + return sum; +} + +unsigned long rt_nr_running_cpu(int cpu) +{ + return cpu_rq(cpu)->rt.rt_nr_running; +} + +unsigned long rt_nr_uninterruptible(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->rt.rt_nr_uninterruptible; + + /* + * Since we read the counters lockless, it might be slightly + * inaccurate. 
Do not allow it to go below zero though: + */ + if (unlikely((long)sum < 0)) + sum = 0; + + return sum; +} + +unsigned long rt_nr_uninterruptible_cpu(int cpu) +{ + return cpu_rq(cpu)->rt.rt_nr_uninterruptible; +} + /* * Adding/removing a task to/from a priority array: */ -static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) +static void +enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) { struct sched_rt_entity *rt_se = &p->rt; if (wakeup) rt_se->timeout = 0; - enqueue_rt_entity(rt_se); + enqueue_rt_entity(rt_se, head); + + if (p->state == TASK_UNINTERRUPTIBLE) + decr_rt_nr_uninterruptible(p, rq); if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -896,6 +952,10 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); + + if (p->state == TASK_UNINTERRUPTIBLE) + incr_rt_nr_uninterruptible(p, rq); + dequeue_rt_entity(rt_se); dequeue_pushable_task(rq, p); @@ -970,6 +1030,17 @@ static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) } /* + * If the new task is an RT task, current is not an RT task + * and the new one may run on the current CPU, run it here. + * This avoids sending reschedule IPIs across CPUs. 
+ */ + if (unlikely(rt_task(p)) && !rt_task(rq->curr)) { + int cpu = smp_processor_id(); + if (cpumask_test_cpu(cpu, &p->cpus_allowed)) + return cpu; + } + + /* * Otherwise, just let it ride on the affined RQ and the * post-schedule router will push the preempted task away */ @@ -1358,7 +1429,7 @@ static int push_rt_task(struct rq *rq) deactivate_task(rq, next_task, 0); set_task_cpu(next_task, lowest_rq->cpu); - activate_task(lowest_rq, next_task, 0); + activate_task(lowest_rq, next_task, 0, false); resched_task(lowest_rq->curr); @@ -1441,7 +1512,7 @@ static int pull_rt_task(struct rq *this_rq) deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + activate_task(this_rq, p, 0, false); /* * We continue with the search, just in * case there's an even higher prio task @@ -1459,8 +1530,10 @@ static int pull_rt_task(struct rq *this_rq) static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ - if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) + if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) { pull_rt_task(rq); + schedstat_inc(rq, rto_schedule); + } } static void post_schedule_rt(struct rq *rq) @@ -1528,7 +1601,6 @@ static void set_cpus_allowed_rt(struct task_struct *p, */ if (weight > 1) enqueue_pushable_task(rq, p); - } if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 32d2bd4..9ecca2f 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -306,10 +306,10 @@ static inline void account_group_user_time(struct task_struct *tsk, if (!cputimer->running) return; - spin_lock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); cputimer->cputime.utime = cputime_add(cputimer->cputime.utime, cputime); - spin_unlock(&cputimer->lock); + raw_spin_unlock(&cputimer->lock); } /** @@ -336,10 +336,10 @@ static inline void account_group_system_time(struct 
task_struct *tsk, if (!cputimer->running) return; - spin_lock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); cputimer->cputime.stime = cputime_add(cputimer->cputime.stime, cputime); - spin_unlock(&cputimer->lock); + raw_spin_unlock(&cputimer->lock); } /** @@ -369,7 +369,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (!cputimer->running) return; - spin_lock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); cputimer->cputime.sum_exec_runtime += ns; - spin_unlock(&cputimer->lock); + raw_spin_unlock(&cputimer->lock); } diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 94a62c0..d831841 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -54,12 +54,12 @@ void down(struct semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else __down(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); } EXPORT_SYMBOL(down); @@ -77,12 +77,12 @@ int down_interruptible(struct semaphore *sem) unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_interruptible(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem) unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_killable(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem) unsigned long flags; int count; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); count = sem->count - 1; if 
(likely(count >= 0)) sem->count = count; - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return (count < 0); } @@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, long jiffies) unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_timeout(sem, jiffies); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -179,12 +179,12 @@ void up(struct semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(list_empty(&sem->wait_list))) sem->count++; else __up(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); } EXPORT_SYMBOL(up); @@ -217,9 +217,9 @@ static inline int __sched __down_common(struct semaphore *sem, long state, if (timeout <= 0) goto timed_out; __set_task_state(task, state); - spin_unlock_irq(&sem->lock); + raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); - spin_lock_irq(&sem->lock); + raw_spin_lock_irq(&sem->lock); if (waiter.up) return 0; } diff --git a/kernel/signal.c b/kernel/signal.c index 934ae5e..9dda83b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -206,13 +206,47 @@ static inline void print_dropped_signal(int sig) current->comm, current->pid, sig); } +#ifdef __HAVE_ARCH_CMPXCHG +static inline struct sigqueue *get_task_cache(struct task_struct *t) +{ + struct sigqueue *q = t->sigqueue_cache; + + if (cmpxchg(&t->sigqueue_cache, q, NULL) != q) + return NULL; + + return q; +} + +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) +{ + if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL) + return 0; + + return 1; +} + +#else + +static inline struct sigqueue *get_task_cache(struct task_struct *t) +{ + return NULL; +} + +static inline int 
put_task_cache(struct task_struct *t, struct sigqueue *q) +{ + return 1; +} + +#endif + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an * appopriate lock must be held to stop the target task from exiting */ static struct sigqueue * -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags, + int override_rlimit, int fromslab) { struct sigqueue *q = NULL; struct user_struct *user; @@ -229,7 +263,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi if (override_rlimit || atomic_read(&user->sigpending) <= t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { - q = kmem_cache_alloc(sigqueue_cachep, flags); + if (!fromslab) + q = get_task_cache(t); + if (!q) + q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } @@ -246,6 +283,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi return q; } +static struct sigqueue * +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, + int override_rlimit) +{ + return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0); +} + static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) @@ -255,6 +299,21 @@ static void __sigqueue_free(struct sigqueue *q) kmem_cache_free(sigqueue_cachep, q); } +static void sigqueue_free_current(struct sigqueue *q) +{ + struct user_struct *up; + + if (q->flags & SIGQUEUE_PREALLOC) + return; + + up = q->user; + if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) { + atomic_dec(&up->sigpending); + free_uid(up); + } else + __sigqueue_free(q); +} + void flush_sigqueue(struct sigpending *queue) { struct sigqueue *q; @@ -268,6 +327,21 @@ void flush_sigqueue(struct sigpending *queue) } /* + * Called from __exit_signal. 
Flush tsk->pending and + * tsk->sigqueue_cache + */ +void flush_task_sigqueue(struct task_struct *tsk) +{ + struct sigqueue *q; + + flush_sigqueue(&tsk->pending); + + q = get_task_cache(tsk); + if (q) + kmem_cache_free(sigqueue_cachep, q); +} + +/* * Flush all pending signals for a task. */ void __flush_signals(struct task_struct *t) @@ -415,7 +489,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); - __sigqueue_free(first); + sigqueue_free_current(first); } else { /* Ok, it wasn't in the queue. This must be a fast-pathed signal or we must have been @@ -460,6 +534,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) { int signr; + WARN_ON_ONCE(tsk != current); + /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ @@ -542,6 +618,9 @@ void signal_wake_up(struct task_struct *t, int resume) set_tsk_thread_flag(t, TIF_SIGPENDING); + if (unlikely(t == current)) + return; + /* * For SIGKILL, we want to wake it up in the stopped/traced/killable * case. We don't check t->state here because there is a race with it @@ -870,7 +949,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, trace_signal_generate(sig, info, t); +#ifdef CONFIG_SMP assert_spin_locked(&t->sighand->siglock); +#endif if (!prepare_signal(sig, t, from_ancestor_ns)) return 0; @@ -1337,7 +1418,8 @@ EXPORT_SYMBOL(kill_pid); */ struct sigqueue *sigqueue_alloc(void) { - struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0); + /* Preallocated sigqueue objects always from the slabcache ! 
*/ + struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1); if (q) q->flags |= SIGQUEUE_PREALLOC; @@ -1630,15 +1712,7 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) read_lock(&tasklist_lock); if (may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); - /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. - * - * XXX: implement read_unlock_no_resched(). - */ - preempt_disable(); read_unlock(&tasklist_lock); - preempt_enable_no_resched(); schedule(); } else { /* diff --git a/kernel/softirq.c b/kernel/softirq.c index a09502e..c35f72b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -8,15 +8,23 @@ * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) * * Remote softirq infrastructure is by Jens Axboe. + * + * Softirq-split implementation by + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar */ #include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/syscalls.h> +#include <linux/wait.h> #include <linux/kernel_stat.h> #include <linux/interrupt.h> #include <linux/init.h> +#include <linux/delay.h> #include <linux/mm.h> #include <linux/notifier.h> #include <linux/percpu.h> +#include <linux/delay.h> #include <linux/cpu.h> #include <linux/freezer.h> #include <linux/kthread.h> @@ -54,29 +62,122 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +struct softirqdata { + int nr; + unsigned long cpu; + struct task_struct *tsk; + int running; +}; + +static DEFINE_PER_CPU(struct softirqdata [NR_SOFTIRQS], ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", "TASKLET", "SCHED", "HRTIMER", "RCU" }; +#ifdef CONFIG_PREEMPT_RT +/* + * On preempt-rt a softirq might be blocked on a lock.
There might be + * no other runnable task on this CPU because the lock owner runs on + * some other CPU. So we have to go into idle with the pending bit + * set. Therefore we need to check this otherwise we warn about false + * positives which confuses users and defeats the whole purpose of + * this test. + * + * This code is called with interrupts disabled. + */ +void softirq_check_pending_idle(void) +{ + static int rate_limit; + u32 warnpending = 0, pending = local_softirq_pending(); + int curr = 0; + + if (rate_limit >= 10) + return; + + while (pending) { + if (pending & 1) { + struct task_struct *tsk; + + tsk = __get_cpu_var(ksoftirqd)[curr].tsk; + /* + * The wakeup code in rtmutex.c wakes up the + * task _before_ it sets pi_blocked_on to NULL + * under tsk->pi_lock. So we need to check for + * both: state and pi_blocked_on. + */ + raw_spin_lock(&tsk->pi_lock); + + if (!tsk->pi_blocked_on && + !(tsk->state == TASK_RUNNING) && + !(tsk->state & TASK_RUNNING_MUTEX)) + warnpending |= 1 << curr; + + raw_spin_unlock(&tsk->pi_lock); + } + pending >>= 1; + curr++; + } + + if (warnpending) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + warnpending); + rate_limit++; + } +} + +#else +/* + * On !PREEMPT_RT we just printk rate limited: + */ +void softirq_check_pending_idle(void) +{ + static int rate_limit; + + if (rate_limit < 10) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + local_softirq_pending()); + rate_limit++; + } +} + +#endif + /* * we cannot loop indefinitely here to avoid userspace starvation, * but we also don't want to introduce a worst case 1/HZ latency * to the pending events, so lets the scheduler to balance * the softirq load for us.
*/ -void wakeup_softirqd(void) +static void wakeup_softirqd(int softirq) { /* Interrupts are disabled: no need to stop preemption */ - struct task_struct *tsk = __get_cpu_var(ksoftirqd); + struct task_struct *tsk = __get_cpu_var(ksoftirqd)[softirq].tsk; if (tsk && tsk->state != TASK_RUNNING) wake_up_process(tsk); } /* + * Wake up the softirq threads which have work + */ +static void trigger_softirqs(void) +{ + u32 pending = local_softirq_pending(); + int curr = 0; + + while (pending) { + if (pending & 1) + wakeup_softirqd(curr); + pending >>= 1; + curr++; + } +} + +#ifndef CONFIG_PREEMPT_HARDIRQS + +/* * This one is for softirq.c-internal use, * where hardirqs are disabled legitimately: */ @@ -128,7 +229,6 @@ EXPORT_SYMBOL(local_bh_disable); */ void _local_bh_enable(void) { - WARN_ON_ONCE(in_irq()); WARN_ON_ONCE(!irqs_disabled()); if (softirq_count() == SOFTIRQ_OFFSET) @@ -138,45 +238,72 @@ void _local_bh_enable(void) EXPORT_SYMBOL(_local_bh_enable); -static inline void _local_bh_enable_ip(unsigned long ip) +void local_bh_enable(void) { - WARN_ON_ONCE(in_irq() || irqs_disabled()); #ifdef CONFIG_TRACE_IRQFLAGS - local_irq_disable(); + unsigned long flags; + + WARN_ON_ONCE(in_irq()); +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS + local_irq_save(flags); #endif /* * Are softirqs going to be turned on now: */ if (softirq_count() == SOFTIRQ_OFFSET) - trace_softirqs_on(ip); + trace_softirqs_on((unsigned long)__builtin_return_address(0)); /* * Keep preemption disabled until we are done with * softirq processing: - */ - sub_preempt_count(SOFTIRQ_OFFSET - 1); + */ + sub_preempt_count(SOFTIRQ_OFFSET - 1); if (unlikely(!in_interrupt() && local_softirq_pending())) do_softirq(); dec_preempt_count(); #ifdef CONFIG_TRACE_IRQFLAGS - local_irq_enable(); + local_irq_restore(flags); #endif preempt_check_resched(); } - -void local_bh_enable(void) -{ - _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); -} EXPORT_SYMBOL(local_bh_enable); void local_bh_enable_ip(unsigned long ip) 
{ - _local_bh_enable_ip(ip); +#ifdef CONFIG_TRACE_IRQFLAGS + unsigned long flags; + + WARN_ON_ONCE(in_irq()); + + local_irq_save(flags); +#endif + /* + * Are softirqs going to be turned on now: + */ + if (softirq_count() == SOFTIRQ_OFFSET) + trace_softirqs_on(ip); + /* + * Keep preemption disabled until we are done with + * softirq processing: + */ + sub_preempt_count(SOFTIRQ_OFFSET - 1); + + if (unlikely(!in_interrupt() && local_softirq_pending())) + do_softirq(); + + dec_preempt_count(); +#ifdef CONFIG_TRACE_IRQFLAGS + local_irq_restore(flags); +#endif + preempt_check_resched(); } EXPORT_SYMBOL(local_bh_enable_ip); +#endif + /* * We restart softirq processing MAX_SOFTIRQ_RESTART times, * and we fall back to softirqd after that. @@ -186,53 +313,119 @@ EXPORT_SYMBOL(local_bh_enable_ip); * we want to handle softirqs as soon as possible, but they * should not be able to lock up the box. */ -#define MAX_SOFTIRQ_RESTART 10 +#define MAX_SOFTIRQ_RESTART 20 -asmlinkage void __do_softirq(void) +static DEFINE_PER_CPU(u32, softirq_running); + +/* + * Debug check for leaking preempt counts in h->action handlers: + */ + +static inline void debug_check_preempt_count_start(__u32 *preempt_count) { - struct softirq_action *h; - __u32 pending; +#ifdef CONFIG_DEBUG_PREEMPT + *preempt_count = preempt_count(); +#endif +} + +static inline void +debug_check_preempt_count_stop(__u32 *preempt_count, struct softirq_action *h) +{ +#ifdef CONFIG_DEBUG_PREEMPT + if (*preempt_count == preempt_count()) + return; + + print_symbol("BUG: %Ps exited with wrong preemption count!\n", + (unsigned long)h->action); + printk("=> enter: %08x, exit: %08x.\n", *preempt_count, preempt_count()); + preempt_count() = *preempt_count; +#endif +} + +/* + * Execute softirq handlers: + */ +static void ___do_softirq(const int same_prio_only) +{ + __u32 pending, available_mask, same_prio_skipped, preempt_count; int max_restart = MAX_SOFTIRQ_RESTART; - int cpu; + struct softirq_action *h; + int cpu, softirq; pending = 
local_softirq_pending(); account_system_vtime(current); - __local_bh_disable((unsigned long)__builtin_return_address(0)); - lockdep_softirq_enter(); - cpu = smp_processor_id(); restart: + available_mask = -1; + softirq = 0; + same_prio_skipped = 0; + /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); - local_irq_enable(); - h = softirq_vec; do { - if (pending & 1) { - int prev_count = preempt_count(); - kstat_incr_softirqs_this_cpu(h - softirq_vec); - - trace_softirq_entry(h, softirq_vec); - h->action(h); - trace_softirq_exit(h, softirq_vec); - if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %td %s %p" - "with preempt_count %08x," - " exited with %08x?\n", h - softirq_vec, - softirq_to_name[h - softirq_vec], - h->action, prev_count, preempt_count()); - preempt_count() = prev_count; + u32 softirq_mask = 1 << softirq; + + if (!(pending & 1)) + goto next; + + debug_check_preempt_count_start(&preempt_count); + +#if defined(CONFIG_PREEMPT_SOFTIRQS) && defined(CONFIG_PREEMPT_HARDIRQS) + /* + * If executed by a same-prio hardirq thread + * then skip pending softirqs that belong + * to softirq threads with different priority: + */ + if (same_prio_only) { + struct task_struct *tsk; + + tsk = __get_cpu_var(ksoftirqd)[softirq].tsk; + if (tsk && tsk->normal_prio != current->normal_prio) { + same_prio_skipped |= softirq_mask; + available_mask &= ~softirq_mask; + goto next; } - - rcu_bh_qs(cpu); } +#endif + /* + * Is this softirq already being processed? 
+ */ + if (per_cpu(softirq_running, cpu) & softirq_mask) { + available_mask &= ~softirq_mask; + goto next; + } + per_cpu(softirq_running, cpu) |= softirq_mask; + kstat_incr_softirqs_this_cpu(h - softirq_vec); + local_irq_enable(); + + trace_softirq_entry(h, softirq_vec); + h->action(h); + trace_softirq_exit(h, softirq_vec); + + debug_check_preempt_count_stop(&preempt_count, h); + + rcu_bh_qs(cpu); + cond_resched_softirq_context(); + local_irq_disable(); + per_cpu(softirq_running, cpu) &= ~softirq_mask; + +next: h++; + softirq++; pending >>= 1; } while (pending); + or_softirq_pending(same_prio_skipped); + pending = local_softirq_pending(); + if (pending & available_mask) { + if (--max_restart) + goto restart; + } + local_irq_disable(); pending = local_softirq_pending(); @@ -240,12 +433,34 @@ restart: goto restart; if (pending) - wakeup_softirqd(); + trigger_softirqs(); +} + +asmlinkage void __do_softirq(void) +{ +#ifdef CONFIG_PREEMPT_SOFTIRQS + /* + * 'preempt harder'. Push all softirq processing off to ksoftirqd. + */ + if (softirq_preemption) { + if (local_softirq_pending()) + trigger_softirqs(); + return; + } +#endif + /* + * 'immediate' softirq execution: + */ + __local_bh_disable((unsigned long)__builtin_return_address(0)); + lockdep_softirq_enter(); + + ___do_softirq(0); lockdep_softirq_exit(); account_system_vtime(current); _local_bh_enable(); + } #ifndef __ARCH_HAS_DO_SOFTIRQ @@ -308,7 +523,7 @@ void irq_exit(void) if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) tick_nohz_stop_sched_tick(0); #endif - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } /* @@ -316,19 +531,11 @@ void irq_exit(void) */ inline void raise_softirq_irqoff(unsigned int nr) { - __raise_softirq_irqoff(nr); + __do_raise_softirq_irqoff(nr); - /* - * If we're in an interrupt or softirq, we're done - * (this also catches softirq-disabled code). We will - * actually run the softirq once we return from - * the irq or softirq. 
- * - * Otherwise we wake up ksoftirqd to make sure we - * schedule the softirq soon. - */ - if (!in_interrupt()) - wakeup_softirqd(); +#ifdef CONFIG_PREEMPT_SOFTIRQS + wakeup_softirqd(nr); +#endif } void raise_softirq(unsigned int nr) @@ -357,15 +564,45 @@ struct tasklet_head static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); +static void inline +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr) +{ + if (tasklet_trylock(t)) { +again: + /* We may have been preempted before tasklet_trylock + * and __tasklet_action may have already run. + * So double check the sched bit while the tasklet + * is locked before adding it to the list. + */ + if (test_bit(TASKLET_STATE_SCHED, &t->state)) { + t->next = NULL; + *head->tail = t; + head->tail = &(t->next); + raise_softirq_irqoff(nr); + tasklet_unlock(t); + } else { + /* This is subtle. If we hit the corner case above + * It is possible that we get preempted right here, + * and another task has successfully called + * tasklet_schedule(), then this function, and + * failed on the trylock. Thus we must be sure + * before releasing the tasklet lock, that the + * SCHED_BIT is clear.
Otherwise the tasklet + * may get its SCHED_BIT set, but not added to the + * list + */ + if (!tasklet_tryunlock(t)) + goto again; + } + } +} + void __tasklet_schedule(struct tasklet_struct *t) { unsigned long flags; local_irq_save(flags); - t->next = NULL; - *__get_cpu_var(tasklet_vec).tail = t; - __get_cpu_var(tasklet_vec).tail = &(t->next); - raise_softirq_irqoff(TASKLET_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_vec), TASKLET_SOFTIRQ); local_irq_restore(flags); } @@ -376,10 +613,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) unsigned long flags; local_irq_save(flags); - t->next = NULL; - *__get_cpu_var(tasklet_hi_vec).tail = t; - __get_cpu_var(tasklet_hi_vec).tail = &(t->next); - raise_softirq_irqoff(HI_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_hi_vec), HI_SOFTIRQ); local_irq_restore(flags); } @@ -387,50 +621,119 @@ EXPORT_SYMBOL(__tasklet_hi_schedule); void __tasklet_hi_schedule_first(struct tasklet_struct *t) { - BUG_ON(!irqs_disabled()); - - t->next = __get_cpu_var(tasklet_hi_vec).head; - __get_cpu_var(tasklet_hi_vec).head = t; - __raise_softirq_irqoff(HI_SOFTIRQ); + __tasklet_hi_schedule(t); } EXPORT_SYMBOL(__tasklet_hi_schedule_first); -static void tasklet_action(struct softirq_action *a) +void tasklet_enable(struct tasklet_struct *t) { - struct tasklet_struct *list; + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_schedule(t); +} - local_irq_disable(); - list = __get_cpu_var(tasklet_vec).head; - __get_cpu_var(tasklet_vec).head = NULL; - __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; - local_irq_enable(); +EXPORT_SYMBOL(tasklet_enable); + +void tasklet_hi_enable(struct tasklet_struct *t) +{ + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_hi_schedule(t); +} + +EXPORT_SYMBOL(tasklet_hi_enable); + +static void +__tasklet_action(struct 
softirq_action *a, struct tasklet_struct *list) +{ + int loops = 1000000; while (list) { struct tasklet_struct *t = list; list = list->next; - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); + /* + * Should always succeed - after a tasklet got on the + * list (after getting the SCHED bit set from 0 to 1), + * nothing but the tasklet softirq it got queued to can + * lock it: + */ + if (!tasklet_trylock(t)) { + WARN_ON(1); + continue; } - local_irq_disable(); t->next = NULL; - *__get_cpu_var(tasklet_vec).tail = t; - __get_cpu_var(tasklet_vec).tail = &(t->next); - __raise_softirq_irqoff(TASKLET_SOFTIRQ); - local_irq_enable(); + + /* + * If we cannot handle the tasklet because it's disabled, + * mark it as pending. tasklet_enable() will later + * re-schedule the tasklet. + */ + if (unlikely(atomic_read(&t->count))) { +out_disabled: + /* implicit unlock: */ + wmb(); + t->state = TASKLET_STATEF_PENDING; + continue; + } + + /* + * After this point on the tasklet might be rescheduled + * on another CPU, but it can only be added to another + * CPU's tasklet list if we unlock the tasklet (which we + * don't do yet). + */ + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + WARN_ON(1); + +again: + t->func(t->data); + + /* + * Try to unlock the tasklet. We must use cmpxchg, because + * another CPU might have scheduled or disabled the tasklet. + * We only allow the STATE_RUN -> 0 transition here.
+ */ + while (!tasklet_tryunlock(t)) { + /* + * If it got disabled meanwhile, bail out: + */ + if (atomic_read(&t->count)) + goto out_disabled; + /* + * If it got scheduled meanwhile, re-execute + * the tasklet function: + */ + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + goto again; + if (!--loops) { + printk("hm, tasklet state: %08lx\n", t->state); + WARN_ON(1); + tasklet_unlock(t); + break; + } + } } } +static void tasklet_action(struct softirq_action *a) +{ + struct tasklet_struct *list; + + local_irq_disable(); + list = __get_cpu_var(tasklet_vec).head; + __get_cpu_var(tasklet_vec).head = NULL; + __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; + local_irq_enable(); + + __tasklet_action(a, list); +} + static void tasklet_hi_action(struct softirq_action *a) { struct tasklet_struct *list; @@ -441,29 +744,7 @@ static void tasklet_hi_action(struct softirq_action *a) __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; local_irq_enable(); - while (list) { - struct tasklet_struct *t = list; - - list = list->next; - - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } - - local_irq_disable(); - t->next = NULL; - *__get_cpu_var(tasklet_hi_vec).tail = t; - __get_cpu_var(tasklet_hi_vec).tail = &(t->next); - __raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_enable(); - } + __tasklet_action(a, list); } @@ -486,7 +767,7 @@ void tasklet_kill(struct tasklet_struct *t) while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { do { - yield(); + msleep(1); } while (test_bit(TASKLET_STATE_SCHED, &t->state)); } tasklet_unlock_wait(t); @@ -500,22 +781,17 @@ EXPORT_SYMBOL(tasklet_kill); */ /* - * The trampoline is called when the hrtimer expires. 
If this is - * called from the hrtimer interrupt then we schedule the tasklet as - * the timer callback function expects to run in softirq context. If - * it's called in softirq context anyway (i.e. high resolution timers - * disabled) then the hrtimer callback is called right away. + * The trampoline is called when the hrtimer expires. It schedules a tasklet + * to run __tasklet_hrtimer_trampoline() which in turn will call the intended + * hrtimer callback, but from softirq context. */ static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) { struct tasklet_hrtimer *ttimer = container_of(timer, struct tasklet_hrtimer, timer); - if (hrtimer_is_hres_active(timer)) { - tasklet_hi_schedule(&ttimer->tasklet); - return HRTIMER_NORESTART; - } - return ttimer->function(timer); + tasklet_hi_schedule(&ttimer->tasklet); + return HRTIMER_NORESTART; } /* @@ -697,34 +973,86 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action); } -static int run_ksoftirqd(void * __bind_cpu) +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) +void tasklet_unlock_wait(struct tasklet_struct *t) { + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { + /* + * Hack for now to avoid this busy-loop: + */ +#ifdef CONFIG_PREEMPT_RT + msleep(1); +#else + barrier(); +#endif + } +} +EXPORT_SYMBOL(tasklet_unlock_wait); +#endif + +static int run_ksoftirqd(void * __data) +{ + /* Priority needs to be below hardirqs */ + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2 - 1}; + struct softirqdata *data = __data; + u32 softirq_mask = (1 << data->nr); + struct softirq_action *h; + int cpu = data->cpu; + + sys_sched_setscheduler(current->pid, SCHED_FIFO, &param); + current->extra_flags |= PFE_SOFTIRQ; set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { preempt_disable(); - if (!local_softirq_pending()) { - preempt_enable_no_resched(); - schedule(); + if (!(local_softirq_pending() & softirq_mask)) { +sleep_more:
preempt_enable_and_schedule(); preempt_disable(); } __set_current_state(TASK_RUNNING); + data->running = 1; - while (local_softirq_pending()) { + while (local_softirq_pending() & softirq_mask) { /* Preempt disable stops cpu going offline. If already offline, we'll be on wrong CPU: don't process */ - if (cpu_is_offline((long)__bind_cpu)) + if (cpu_is_offline(cpu)) goto wait_to_die; - do_softirq(); - preempt_enable_no_resched(); + + /* + * Is the softirq already being executed by + * a hardirq context? + */ + local_irq_disable(); + if (per_cpu(softirq_running, cpu) & softirq_mask) { + local_irq_enable(); + set_current_state(TASK_INTERRUPTIBLE); + goto sleep_more; + } + per_cpu(softirq_running, cpu) |= softirq_mask; + set_softirq_pending(local_softirq_pending() & ~softirq_mask); + local_bh_disable(); + local_irq_enable(); + preempt_enable(); + + h = &softirq_vec[data->nr]; + if (h) + h->action(h); + rcu_bh_qs(data->cpu); + + local_irq_disable(); + per_cpu(softirq_running, cpu) &= ~softirq_mask; + _local_bh_enable(); + local_irq_enable(); + cond_resched(); preempt_disable(); - rcu_sched_qs((long)__bind_cpu); } preempt_enable(); set_current_state(TASK_INTERRUPTIBLE); + data->running = 0; } __set_current_state(TASK_RUNNING); return 0; @@ -774,7 +1102,7 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) BUG(); } -static void takeover_tasklets(unsigned int cpu) +void takeover_tasklets(unsigned int cpu) { /* CPU is dead, so no lock needed. 
*/ local_irq_disable(); @@ -800,49 +1128,76 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ +static const char *softirq_names [] = +{ + [HI_SOFTIRQ] = "high", + [SCHED_SOFTIRQ] = "sched", + [TIMER_SOFTIRQ] = "timer", + [NET_TX_SOFTIRQ] = "net-tx", + [NET_RX_SOFTIRQ] = "net-rx", + [BLOCK_SOFTIRQ] = "block", + [BLOCK_IOPOLL_SOFTIRQ]= "block-iopoll", + [TASKLET_SOFTIRQ] = "tasklet", + [HRTIMER_SOFTIRQ] = "hrtimer", + [RCU_SOFTIRQ] = "rcu", +}; + static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int hotcpu = (unsigned long)hcpu; + int hotcpu = (unsigned long)hcpu, i; struct task_struct *p; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); - if (IS_ERR(p)) { - printk("ksoftirqd for %i failed\n", hotcpu); - return NOTIFY_BAD; + for (i = 0; i < NR_SOFTIRQS; i++) { + per_cpu(ksoftirqd, hotcpu)[i].nr = i; + per_cpu(ksoftirqd, hotcpu)[i].cpu = hotcpu; + per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; + } + for (i = 0; i < NR_SOFTIRQS; i++) { + p = kthread_create(run_ksoftirqd, + &per_cpu(ksoftirqd, hotcpu)[i], + "sirq-%s/%d", softirq_names[i], + hotcpu); + if (IS_ERR(p)) { + printk("ksoftirqd %d for %i failed\n", i, + hotcpu); + return NOTIFY_BAD; + } + kthread_bind(p, hotcpu); + per_cpu(ksoftirqd, hotcpu)[i].tsk = p; } - kthread_bind(p, hotcpu); - per_cpu(ksoftirqd, hotcpu) = p; - break; + break; + break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(ksoftirqd, hotcpu)); + for (i = 0; i < NR_SOFTIRQS; i++) + wake_up_process(per_cpu(ksoftirqd, hotcpu)[i].tsk); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(ksoftirqd, hotcpu)) - break; - /* Unbind so it can run. Fall thru. 
*/ - kthread_bind(per_cpu(ksoftirqd, hotcpu), - cpumask_any(cpu_online_mask)); + /* Fall through */ + case CPU_DEAD: case CPU_DEAD_FROZEN: { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct sched_param param; - p = per_cpu(ksoftirqd, hotcpu); - per_cpu(ksoftirqd, hotcpu) = NULL; - sched_setscheduler_nocheck(p, SCHED_FIFO, &param); - kthread_stop(p); + for (i = 0; i < NR_SOFTIRQS; i++) { + param.sched_priority = MAX_RT_PRIO-1; + p = per_cpu(ksoftirqd, hotcpu)[i].tsk; + sched_setscheduler(p, SCHED_FIFO, &param); + per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; + kthread_stop(p); + } takeover_tasklets(hotcpu); break; } #endif /* CONFIG_HOTPLUG_CPU */ - } + } return NOTIFY_OK; } @@ -862,6 +1217,34 @@ static __init int spawn_ksoftirqd(void) } early_initcall(spawn_ksoftirqd); + +#ifdef CONFIG_PREEMPT_SOFTIRQS + +int softirq_preemption = 1; + +EXPORT_SYMBOL(softirq_preemption); + +/* + * Real-Time Preemption depends on softirq threading: + */ +#ifndef CONFIG_PREEMPT_RT + +static int __init softirq_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + softirq_preemption = 0; + else + get_option(&str, &softirq_preemption); + if (!softirq_preemption) + printk("turning off softirq preemption!\n"); + + return 1; +} + +__setup("softirq-preempt=", softirq_preempt_setup); +#endif +#endif + #ifdef CONFIG_SMP /* * Call a function on all processors diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 0d4c789..bbaeac5 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -20,7 +20,7 @@ #include <asm/irq_regs.h> -static DEFINE_SPINLOCK(print_lock); +static DEFINE_RAW_SPINLOCK(print_lock); static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ @@ -164,7 +164,7 @@ void softlockup_tick(void) per_cpu(softlockup_print_ts, this_cpu) = touch_ts; - spin_lock(&print_lock); + raw_spin_lock(&print_lock); printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck
for %lus! [%s:%d]\n", this_cpu, now - touch_ts, current->comm, task_pid_nr(current)); @@ -174,7 +174,7 @@ void softlockup_tick(void) show_regs(regs); else dump_stack(); - spin_unlock(&print_lock); + raw_spin_unlock(&print_lock); if (softlockup_panic) panic("softlockup: hung tasks"); diff --git a/kernel/spinlock.c b/kernel/spinlock.c index be6517f..e1d9b78 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -110,8 +110,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ * __[spin|read|write]_lock_bh() */ BUILD_LOCK_OPS(spin, raw_spinlock); + +#ifndef CONFIG_PREEMPT_RT BUILD_LOCK_OPS(read, rwlock); BUILD_LOCK_OPS(write, rwlock); +#endif #endif @@ -195,6 +198,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) EXPORT_SYMBOL(_raw_spin_unlock_bh); #endif +#ifndef CONFIG_PREEMPT_RT + #ifndef CONFIG_INLINE_READ_TRYLOCK int __lockfunc _raw_read_trylock(rwlock_t *lock) { @@ -339,6 +344,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) EXPORT_SYMBOL(_raw_write_unlock_bh); #endif +#endif /* !PREEMPT_RT */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 912823e..22d1d77 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -40,6 +40,8 @@ static atomic_t thread_ack; static DEFINE_MUTEX(lock); /* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */ static DEFINE_MUTEX(setup_lock); +/* do not start up until all worklets have been placed: */ +static DEFINE_MUTEX(startup_lock); /* Users of stop_machine. 
*/ static int refcount; static struct workqueue_struct *stop_machine_wq; @@ -71,6 +73,15 @@ static void stop_cpu(struct work_struct *unused) int cpu = smp_processor_id(); int err; + /* + * Wait for the startup loop to finish: + */ + mutex_lock(&startup_lock); + /* + * Let other threads continue too: + */ + mutex_unlock(&startup_lock); + if (!active_cpus) { if (cpu == cpumask_first(cpu_online_mask)) smdata = &active; @@ -166,16 +177,21 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) set_state(STOPMACHINE_PREPARE); - /* Schedule the stop_cpu work on all cpus: hold this CPU so one - * doesn't hit this CPU until we're ready. */ - get_cpu(); + /* + * Schedule the stop_cpu work on all cpus before allowing any + * of the CPUs to execute it: + */ + mutex_lock(&startup_lock); + for_each_online_cpu(i) { sm_work = per_cpu_ptr(stop_machine_work, i); INIT_WORK(sm_work, stop_cpu); queue_work_on(i, stop_machine_wq, sm_work); } - /* This will release the thread on our CPU. 
*/ - put_cpu(); + + /* This will release the thread on all CPUs: */ + mutex_unlock(&startup_lock); + flush_workqueue(stop_machine_wq); ret = active.fnret; mutex_unlock(&lock); diff --git a/kernel/sys.c b/kernel/sys.c index 26a6b73..1d96580 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -32,6 +32,7 @@ #include <linux/getcpu.h> #include <linux/task_io_accounting_ops.h> #include <linux/seccomp.h> +#include <linux/hardirq.h> #include <linux/cpu.h> #include <linux/ptrace.h> #include <linux/fs_struct.h> @@ -222,6 +223,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; + rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: @@ -267,6 +269,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) } out_unlock: read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } @@ -281,6 +284,15 @@ out_unlock: */ void emergency_restart(void) { + /* + * Call the notifier chain if we are not in an + * atomic context: + */ +#ifdef CONFIG_PREEMPT + if (!in_atomic() && !irqs_disabled()) + blocking_notifier_call_chain(&reboot_notifier_list, + SYS_RESTART, NULL); +#endif machine_emergency_restart(); } EXPORT_SYMBOL_GPL(emergency_restart); diff --git a/kernel/time.c b/kernel/time.c index 8047980..adbe583 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -133,11 +133,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, */ static inline void warp_clock(void) { - write_seqlock_irq(&xtime_lock); + write_raw_seqlock_irq(&xtime_lock); wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; xtime.tv_sec += sys_tz.tz_minuteswest * 60; update_xtime_cache(0); - write_sequnlock_irq(&xtime_lock); + write_raw_sequnlock_irq(&xtime_lock); clock_was_set(); } @@ -699,9 +699,9 @@ u64 get_jiffies_64(void) u64 ret; do { - seq = read_seqbegin(&xtime_lock); + seq = read_raw_seqbegin(&xtime_lock); ret = jiffies_64; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_raw_seqretry(&xtime_lock, 
seq)); return ret; } EXPORT_SYMBOL(get_jiffies_64); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4800f93..ed2aec1 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -188,7 +188,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) { enum hrtimer_restart res = HRTIMER_NORESTART; - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); switch (time_state) { case TIME_OK: @@ -218,7 +218,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) break; } - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); return res; } @@ -476,7 +476,7 @@ int do_adjtimex(struct timex *txc) getnstimeofday(&ts); - write_seqlock_irq(&xtime_lock); + write_raw_seqlock_irq(&xtime_lock); if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -524,7 +524,7 @@ int do_adjtimex(struct timex *txc) txc->errcnt = 0; txc->stbcnt = 0; - write_sequnlock_irq(&xtime_lock); + write_raw_sequnlock_irq(&xtime_lock); txc->time.tv_sec = ts.tv_sec; txc->time.tv_usec = ts.tv_nsec; diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b6b898d..01165a7 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -60,13 +60,13 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); /* Keep track of the next tick event */ tick_next_period = ktime_add(tick_next_period, tick_period); do_timer(1); - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); } update_process_times(user_mode(get_irq_regs())); @@ -127,9 +127,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) ktime_t next; do { - seq = read_seqbegin(&xtime_lock); + seq = read_raw_seqbegin(&xtime_lock); next = tick_next_period; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_raw_seqretry(&xtime_lock, seq)); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); diff --git 
a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f992762..a521150 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -57,7 +57,7 @@ static void tick_do_update_jiffies64(ktime_t now) return; /* Reevalute with xtime_lock held */ - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); delta = ktime_sub(now, last_jiffies_update); if (delta.tv64 >= tick_period.tv64) { @@ -80,7 +80,7 @@ static void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); } - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); } /* @@ -90,12 +90,12 @@ static ktime_t tick_init_jiffy_update(void) { ktime_t period; - write_seqlock(&xtime_lock); + write_raw_seqlock(&xtime_lock); /* Did we start the jiffies update yet ? */ if (last_jiffies_update.tv64 == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; - write_sequnlock(&xtime_lock); + write_raw_sequnlock(&xtime_lock); return period; } @@ -252,24 +252,18 @@ void tick_nohz_stop_sched_tick(int inidle) goto end; if (unlikely(local_softirq_pending() && cpu_online(cpu))) { - static int ratelimit; - - if (ratelimit < 10) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); - ratelimit++; - } + softirq_check_pending_idle(); goto end; } ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ do { - seq = read_seqbegin(&xtime_lock); + seq = read_raw_seqbegin(&xtime_lock); last_update = last_jiffies_update; last_jiffies = jiffies; time_delta = timekeeping_max_deferment(); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_raw_seqretry(&xtime_lock, seq)); if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || arch_needs_cpu(cpu)) { @@ -733,6 +727,7 @@ void tick_setup_sched_timer(void) * Emulate tick processing via per-CPU hrtimers: */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + 
ts->sched_timer.irqsafe = 1; ts->sched_timer.function = tick_sched_timer; /* Get the next period (per cpu) */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e2ab064..ee2b90f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -135,8 +135,7 @@ static inline s64 timekeeping_get_ns_raw(void) * This read-write spinlock protects us from races in SMP while * playing with xtime. */ -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - +__cacheline_aligned_in_smp DEFINE_RAW_SEQLOCK(xtime_lock); /* * The current time @@ -226,7 +225,7 @@ void getnstimeofday(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&xtime_lock); + seq = read_raw_seqbegin(&xtime_lock); *ts = xtime; nsecs = timekeeping_get_ns(); @@ -234,7 +233,7 @@ void getnstimeofday(struct timespec *ts) /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_raw_seqretry(&xtime_lock, seq)); timespec_add_ns(ts, nsecs); } @@ -249,12 +248,12 @@ ktime_t ktime_get(void) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&xtime_lock); + seq = read_raw_seqbegin(&xtime_lock); secs = xtime.tv_sec + wall_to_monotonic.tv_sec; nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; nsecs += timekeeping_get_ns(); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_raw_seqretry(&xtime_lock, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. 
@@ -280,12 +279,12 @@ void ktime_get_ts(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&xtime_lock); + seq = read_raw_seqbegin(&xtime_lock); *ts = xtime; tomono = wall_to_monotonic; nsecs = timekeeping_get_ns(); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_raw_seqretry(&xtime_lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, ts->tv_nsec + tomono.tv_nsec + nsecs); @@ -322,7 +321,7 @@ int do_settimeofday(struct timespec *tv) if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&xtime_lock, flags); + write_raw_seqlock_irqsave(&xtime_lock, flags); timekeeping_forward_now(); @@ -339,7 +338,7 @@ int do_settimeofday(struct timespec *tv) update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); - write_sequnlock_irqrestore(&xtime_lock, flags); + write_raw_sequnlock_irqrestore(&xtime_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -418,11 +417,11 @@ void ktime_get_ts(struct timespec *ts) unsigned long seq; do { - seq = read_seqbegin(&xtime_lock); + seq = read_raw_seqbegin(&xtime_lock); getnstimeofday(ts); tomono = wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_raw_seqretry(&xtime_lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, ts->tv_nsec + tomono.tv_nsec); @@ -458,11 +457,11 @@ void getrawmonotonic(struct timespec *ts) s64 nsecs; do { - seq = read_seqbegin(&xtime_lock); + seq = read_raw_seqbegin(&xtime_lock); nsecs = timekeeping_get_ns_raw(); *ts = raw_time; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_raw_seqretry(&xtime_lock, seq)); timespec_add_ns(ts, nsecs); } @@ -478,11 +477,11 @@ int timekeeping_valid_for_hres(void) int ret; do { - seq = read_seqbegin(&xtime_lock); + seq = read_raw_seqbegin(&xtime_lock); ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_raw_seqretry(&xtime_lock, seq)); return ret; } 
@@ -540,7 +539,7 @@ void __init timekeeping_init(void) read_persistent_clock(&now); read_boot_clock(&boot); - write_seqlock_irqsave(&xtime_lock, flags); + write_raw_seqlock_irqsave(&xtime_lock, flags); ntp_init(); @@ -562,7 +561,7 @@ void __init timekeeping_init(void) update_xtime_cache(0); total_sleep_time.tv_sec = 0; total_sleep_time.tv_nsec = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); + write_raw_sequnlock_irqrestore(&xtime_lock, flags); } /* time in seconds when suspend began */ @@ -585,7 +584,7 @@ static int timekeeping_resume(struct sys_device *dev) clocksource_resume(); - write_seqlock_irqsave(&xtime_lock, flags); + write_raw_seqlock_irqsave(&xtime_lock, flags); if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); @@ -598,7 +597,7 @@ static int timekeeping_resume(struct sys_device *dev) timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; timekeeping_suspended = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); + write_raw_sequnlock_irqrestore(&xtime_lock, flags); touch_softlockup_watchdog(); @@ -616,10 +615,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&xtime_lock, flags); + write_raw_seqlock_irqsave(&xtime_lock, flags); timekeeping_forward_now(); timekeeping_suspended = 1; - write_sequnlock_irqrestore(&xtime_lock, flags); + write_raw_sequnlock_irqrestore(&xtime_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); @@ -909,10 +908,10 @@ struct timespec current_kernel_time(void) unsigned long seq; do { - seq = read_seqbegin(&xtime_lock); + seq = read_raw_seqbegin(&xtime_lock); now = xtime_cache; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_raw_seqretry(&xtime_lock, seq)); return now; } @@ -924,11 +923,11 @@ struct timespec get_monotonic_coarse(void) unsigned long seq; do { - seq = read_seqbegin(&xtime_lock); + 
seq = read_raw_seqbegin(&xtime_lock); now = xtime_cache; mono = wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_raw_seqretry(&xtime_lock, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); diff --git a/kernel/timer.c b/kernel/timer.c index c61a794..c850d06 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -34,6 +34,7 @@ #include <linux/posix-timers.h> #include <linux/cpu.h> #include <linux/syscalls.h> +#include <linux/kallsyms.h> #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> @@ -74,6 +75,7 @@ struct tvec_root { struct tvec_base { spinlock_t lock; struct timer_list *running_timer; + wait_queue_head_t wait_for_running_timer; unsigned long timer_jiffies; unsigned long next_timer; struct tvec_root tv1; @@ -322,9 +324,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_up_relative); static inline void set_running_timer(struct tvec_base *base, struct timer_list *timer) { -#ifdef CONFIG_SMP base->running_timer = timer; -#endif } static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) @@ -656,6 +656,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); + preempt_disable(); cpu = smp_processor_id(); #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) @@ -666,6 +667,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, cpu = preferred_cpu; } #endif + preempt_enable(); + new_base = per_cpu(tvec_bases, cpu); if (base != new_base) { @@ -825,6 +828,18 @@ void add_timer_on(struct timer_list *timer, int cpu) } EXPORT_SYMBOL_GPL(add_timer_on); +/* + * Wait for a running timer + */ +void wait_for_running_timer(struct timer_list *timer) +{ + struct tvec_base *base = timer->base; + + if (base->running_timer == timer) + wait_event(base->wait_for_running_timer, + base->running_timer != timer); +} + /** * del_timer - deactive a timer. 
* @timer: the timer to be deactivated @@ -859,7 +874,34 @@ int del_timer(struct timer_list *timer) } EXPORT_SYMBOL(del_timer); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) +/* + * This function checks whether a timer is active and not running on any + * CPU. Upon successful (ret >= 0) exit the timer is not queued and the + * handler is not running on any CPU. + * + * It must not be called from interrupt contexts. + */ +int timer_pending_sync(struct timer_list *timer) +{ + struct tvec_base *base; + unsigned long flags; + int ret = -1; + + base = lock_timer_base(timer, &flags); + + if (base->running_timer == timer) + goto out; + + ret = 0; + if (timer_pending(timer)) + ret = 1; +out: + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + /** * try_to_del_timer_sync - Try to deactivate a timer * @timer: timer do del @@ -927,7 +969,7 @@ int del_timer_sync(struct timer_list *timer) int ret = try_to_del_timer_sync(timer); if (ret >= 0) return ret; - cpu_relax(); + wait_for_running_timer(timer); } } EXPORT_SYMBOL(del_timer_sync); @@ -972,6 +1014,20 @@ static inline void __run_timers(struct tvec_base *base) struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; + if (softirq_need_resched()) { + spin_unlock_irq(&base->lock); + wake_up(&base->wait_for_running_timer); + cond_resched_softirq_context(); + cpu_relax(); + spin_lock_irq(&base->lock); + /* + * We can simply continue after preemption, nobody + * else can touch timer_jiffies so 'index' is still + * valid. 
Any new jiffy will be taken care of in + * subsequent loops: + */ + } + /* * Cascade timers: */ @@ -1027,18 +1083,17 @@ static inline void __run_timers(struct tvec_base *base) lock_map_release(&lockdep_map); if (preempt_count != preempt_count()) { - printk(KERN_ERR "huh, entered %p " - "with preempt_count %08x, exited" - " with %08x?\n", - fn, preempt_count, - preempt_count()); - BUG(); + print_symbol("BUG: unbalanced timer-handler preempt count in %s!\n", (unsigned long) fn); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + preempt_count() = preempt_count; } } + set_running_timer(base, NULL); + cond_resched_softirq_context(); spin_lock_irq(&base->lock); } } - set_running_timer(base, NULL); + wake_up(&base->wait_for_running_timer); spin_unlock_irq(&base->lock); } @@ -1171,6 +1226,18 @@ unsigned long get_next_timer_interrupt(unsigned long now) struct tvec_base *base = __get_cpu_var(tvec_bases); unsigned long expires; +#ifdef CONFIG_PREEMPT_RT + /* + * On PREEMPT_RT we cannot sleep here. If the trylock does not + * succeed then we return the worst-case 'expires in 1 tick' + * value: + */ + if (spin_trylock(&base->lock)) { + expires = __next_timer_interrupt(base); + spin_unlock(&base->lock); + } else + expires = now + 1; +#else spin_lock(&base->lock); if (time_before_eq(base->next_timer, base->timer_jiffies)) base->next_timer = __next_timer_interrupt(base); @@ -1179,7 +1246,7 @@ unsigned long get_next_timer_interrupt(unsigned long now) if (time_before_eq(expires, now)) return now; - +#endif return cmp_next_hrtimer_event(now, expires); } #endif @@ -1195,11 +1262,10 @@ void update_process_times(int user_tick) /* Note: this timer irq context must be accounted for as well. 
*/ account_process_tick(p, user_tick); + scheduler_tick(); run_local_timers(); rcu_check_callbacks(cpu, user_tick); - printk_tick(); perf_event_do_pending(); - scheduler_tick(); run_posix_cpu_timers(p); } @@ -1208,9 +1274,11 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct tvec_base *base = __get_cpu_var(tvec_bases); + struct tvec_base *base = per_cpu(tvec_bases, raw_smp_processor_id()); + printk_tick(); hrtimer_run_pending(); + perf_event_do_pending_softirq(); if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); @@ -1550,6 +1618,7 @@ static int __cpuinit init_timers_cpu(int cpu) } spin_lock_init(&base->lock); + init_waitqueue_head(&base->wait_for_running_timer); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); @@ -1585,6 +1654,7 @@ static void __cpuinit migrate_timers(int cpu) { struct tvec_base *old_base; struct tvec_base *new_base; + unsigned long flags; int i; BUG_ON(cpu_online(cpu)); @@ -1594,8 +1664,11 @@ static void __cpuinit migrate_timers(int cpu) * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - spin_lock_irq(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + local_irq_save(flags); + while (!spin_trylock(&new_base->lock)) + cpu_relax(); + while (!spin_trylock(&old_base->lock)) + cpu_relax(); BUG_ON(old_base->running_timer); @@ -1609,7 +1682,9 @@ static void __cpuinit migrate_timers(int cpu) } spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); + spin_unlock(&new_base->lock); + local_irq_restore(flags); + put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 60e2ce0..7d57890 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -175,6 +175,24 @@ config IRQSOFF_TRACER enabled. This option and the preempt-off timing option can be used together or separately.) 
+config INTERRUPT_OFF_HIST + bool "Interrupts-off Latency Histogram" + depends on IRQSOFF_TRACER + help + This option generates continuously updated histograms (one per cpu) + of the duration of time periods with interrupts disabled. The + histograms are disabled by default. To enable them, write a non-zero + number to + + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff + + If PREEMPT_OFF_HIST is also selected, additional histograms (one + per cpu) are generated that accumulate the duration of time periods + when both interrupts and preemption are disabled. The histogram data + will be located in the debug file system at + + /sys/kernel/debug/tracing/latency_hist/irqsoff + config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n @@ -197,14 +215,23 @@ config PREEMPT_TRACER enabled. This option and the irqs-off timing option can be used together or separately.) -config SYSPROF_TRACER - bool "Sysprof Tracer" - depends on X86 - select GENERIC_TRACER - select CONTEXT_SWITCH_TRACER +config PREEMPT_OFF_HIST + bool "Preemption-off Latency Histogram" + depends on PREEMPT_TRACER help - This tracer provides the trace needed by the 'Sysprof' userspace - tool. + This option generates continuously updated histograms (one per cpu) + of the duration of time periods with preemption disabled. The + histograms are disabled by default. To enable them, write a non-zero + number to + + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff + + If INTERRUPT_OFF_HIST is also selected, additional histograms (one + per cpu) are generated that accumulate the duration of time periods + when both interrupts and preemption are disabled. The histogram data + will be located in the debug file system at + + /sys/kernel/debug/tracing/latency_hist/preemptoff config SCHED_TRACER bool "Scheduling Latency Tracer" @@ -215,6 +242,55 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. 
+config WAKEUP_LATENCY_HIST + bool "Scheduling Latency Histogram" + depends on SCHED_TRACER + help + This option generates continuously updated histograms (one per cpu) + of the scheduling latency of the highest priority task. + The histograms are disabled by default. To enable them, write a + non-zero number to + + /sys/kernel/debug/tracing/latency_hist/enable/wakeup + + Two different algorithms are used, one to determine the latency of + processes that exclusively use the highest priority of the system and + another one to determine the latency of processes that share the + highest system priority with other processes. The former is used to + improve hardware and system software, the latter to optimize the + priority design of a given system. The histogram data will be + located in the debug file system at + + /sys/kernel/debug/tracing/latency_hist/wakeup + + and + + /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio + +config MISSED_TIMER_OFFSETS_HIST + depends on GENERIC_TIME + select GENERIC_TRACER + bool "Missed timer offsets histogram" + help + Generate a histogram of missed timer offsets in microseconds. The + histograms are disabled by default. To enable them, write a non-zero + number to + + /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets + + The histogram data will be located in the debug file system at + + /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets + +config SYSPROF_TRACER + bool "Sysprof Tracer" + depends on X86 + select GENERIC_TRACER + select CONTEXT_SWITCH_TRACER + help + This tracer provides the trace needed by the 'Sysprof' userspace + tool. + config ENABLE_DEFAULT_TRACERS bool "Trace process context switches and events" depends on !GENERIC_TRACER @@ -385,6 +461,7 @@ config STACK_TRACER config HW_BRANCH_TRACER depends on HAVE_HW_BRANCH_TRACER + depends on !PREEMPT_RT bool "Trace hw branches" select GENERIC_TRACER help @@ -412,7 +489,7 @@ config KMEMTRACE If unsure, say N. 
config WORKQUEUE_TRACER - bool "Trace workqueues" + bool "Trace workqueues" if !PREEMPT_RT select GENERIC_TRACER help The workqueue tracer provides some statistical information diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index cd9ecd8..5dddc4d 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -35,6 +35,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1e6640f..a6d0ebe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -392,7 +392,8 @@ static int function_stat_show(struct seq_file *m, void *v) #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_printf(m, " "); avg = rec->time; - do_div(avg, rec->counter); + if (rec->counter) + do_div(avg, rec->counter); mutex_lock(&mutex); trace_seq_init(&s); diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c new file mode 100644 index 0000000..ce67060 --- /dev/null +++ b/kernel/trace/latency_hist.c @@ -0,0 +1,1040 @@ +/* + * kernel/trace/latency_hist.c + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. + * Yi Yang <yyang@ch.mvista.com> + * + * Converted to work with the new latency tracer. + * Copyright (C) 2008 Red Hat, Inc. 
+ * Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/percpu.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <asm/div64.h>
+
+#include "trace.h"
+#include <trace/events/sched.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/hist.h>
+
+enum {	/* histogram selectors: the latency_type given to latency_hist() */
+	IRQSOFF_LATENCY = 0,
+	PREEMPTOFF_LATENCY,
+	PREEMPTIRQSOFF_LATENCY,
+	WAKEUP_LATENCY,
+	WAKEUP_LATENCY_SHAREDPRIO,
+	MISSED_TIMER_OFFSETS,
+	MAX_LATENCY_TYPE,	/* count of selectors, not a histogram */
+};
+
+#define MAX_ENTRY_NUM 10240	/* number of bins in hist_array */
+
+struct hist_data {
+	atomic_t hist_mode; /* 0 don't log, 1 log */
+	long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
+	unsigned long min_lat;	/* smallest sample, offset included */
+	unsigned long max_lat;	/* largest sample, offset included */
+	unsigned long long below_hist_bound_samples;	/* below bin 0 */
+	unsigned long long above_hist_bound_samples;	/* >= MAX_ENTRY_NUM */
+	unsigned long long accumulate_lat;	/* sum of samples, for the average */
+	unsigned long long total_samples;
+	unsigned long long hist_array[MAX_ENTRY_NUM];	/* per-bin counters */
+};
+
+struct enable_data {
+	int latency_type;	/* selector handled by do_enable() */
+	int enabled;		/* non-zero once the probes are registered */
+};
+
+static char *latency_hist_dir_root = "latency_hist";
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
+static char *irqsoff_hist_dir = "irqsoff";
+static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);	/* section start */
+static DEFINE_PER_CPU(int, hist_irqsoff_counting);	/* section is timed */
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
+static char *preemptoff_hist_dir = "preemptoff";
+static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
+static DEFINE_PER_CPU(int, hist_preemptoff_counting);
+#endif
+
+#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
+static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
+static char *preemptirqsoff_hist_dir = "preemptirqsoff";
+static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
+static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
+#endif
+
+#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
+static notrace void probe_preemptirqsoff_hist(int reason, int start);
+static struct enable_data preemptirqsoff_enabled_data = {
+	.latency_type = PREEMPTIRQSOFF_LATENCY,
+	.enabled = 0,
+};
+#endif
+
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
+    defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+struct maxlatproc_data {	/* task recorded with a histogram's max_lat */
+	char comm[FIELD_SIZEOF(struct task_struct, comm)];
+	int pid;
+	int prio;
+	long latency;
+};
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
+static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
+static char *wakeup_latency_hist_dir = "wakeup";
+static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
+static notrace void probe_wakeup_latency_hist_start(struct rq *rq,
+	struct task_struct *p, int success);
+static notrace void probe_wakeup_latency_hist_stop(struct rq *rq,
+	struct task_struct *prev, struct task_struct *next);
+static notrace void probe_sched_migrate_task(struct task_struct *task,
+	int cpu);
+static struct enable_data wakeup_latency_enabled_data = {
+	.latency_type = WAKEUP_LATENCY,
+	.enabled = 0,
+};
+static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
+static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
+static DEFINE_PER_CPU(struct task_struct *, wakeup_task); /* task being timed */
+static DEFINE_PER_CPU(int, wakeup_sharedprio);
+static unsigned long wakeup_pid;	/* if non-zero, only time this pid */
+#endif
+
+#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
+static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
+static char *missed_timer_offsets_dir = "missed_timer_offsets";
+static notrace void probe_hrtimer_interrupt(int cpu,
+	long long offset, struct task_struct *curr, struct task_struct *task);
+static struct enable_data missed_timer_offsets_enabled_data = {
+	.latency_type = MISSED_TIMER_OFFSETS,
+	.enabled = 0,
+};
+static DEFINE_PER_CPU(struct maxlatproc_data,
missed_timer_offsets_maxlatproc);
+static unsigned long missed_timer_offsets_pid;
+#endif
+/* latency_hist - add one sample (usecs) to the histogram latency_type/cpu */
+void notrace latency_hist(int latency_type, int cpu, unsigned long latency,
+			  struct task_struct *p)
+{
+	struct hist_data *my_hist;
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
+    defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+	struct maxlatproc_data *mp = NULL;	/* set for task-tracking types */
+#endif
+
+	if (cpu < 0 || cpu >= NR_CPUS || latency_type < 0 ||
+	    latency_type >= MAX_LATENCY_TYPE)
+		return;
+
+	switch (latency_type) {
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+	case IRQSOFF_LATENCY:
+		my_hist = &per_cpu(irqsoff_hist, cpu);
+		break;
+#endif
+#ifdef CONFIG_PREEMPT_OFF_HIST
+	case PREEMPTOFF_LATENCY:
+		my_hist = &per_cpu(preemptoff_hist, cpu);
+		break;
+#endif
+#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
+	case PREEMPTIRQSOFF_LATENCY:
+		my_hist = &per_cpu(preemptirqsoff_hist, cpu);
+		break;
+#endif
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+	case WAKEUP_LATENCY:
+		my_hist = &per_cpu(wakeup_latency_hist, cpu);
+		mp = &per_cpu(wakeup_maxlatproc, cpu);
+		break;
+	case WAKEUP_LATENCY_SHAREDPRIO:
+		my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
+		mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
+		break;
+#endif
+#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
+	case MISSED_TIMER_OFFSETS:
+		my_hist = &per_cpu(missed_timer_offsets, cpu);
+		mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
+		break;
+#endif
+	default:
+		return;	/* selector not compiled in */
+	}
+
+	if (atomic_read(&my_hist->hist_mode) == 0)
+		return;	/* logging is paused */
+
+	latency += my_hist->offset;
+	/* Test the sign as long: "unsigned long < 0" can never be true. */
+	if ((long) latency < 0 || latency >= MAX_ENTRY_NUM) {
+		if ((long) latency < 0)
+			my_hist->below_hist_bound_samples++;
+		else
+			my_hist->above_hist_bound_samples++;
+	} else
+		my_hist->hist_array[latency]++;
+
+	if (latency < my_hist->min_lat)
+		my_hist->min_lat = latency;
+	if (latency > my_hist->max_lat) {
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
+    defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+		if (latency_type == WAKEUP_LATENCY ||
+		    latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
+		    latency_type == MISSED_TIMER_OFFSETS) {
+			/* p is dereferenced here, so it must not be NULL */
+			strncpy(mp->comm, p->comm, sizeof(mp->comm));
+			mp->pid = task_pid_nr(p);
+			mp->prio = p->prio;
+			mp->latency = latency;
+		}
+#endif
+		my_hist->max_lat = latency;
+	}
+	my_hist->total_samples++;
+	my_hist->accumulate_lat += latency;
+}
+/* seq_file start: print the summary at offset 0, then hand out a cursor. */
+static void *l_start(struct seq_file *m, loff_t *pos)
+{
+	loff_t *index_ptr = NULL;
+	loff_t index = *pos;
+	struct hist_data *my_hist = m->private;
+
+	if (index == 0) {
+		char minstr[32], avgstr[32], maxstr[32];
+
+		atomic_dec(&my_hist->hist_mode);	/* undone in l_next() */
+		/* NOTE(review): looks unbalanced if the reader stops early. */
+		if (likely(my_hist->total_samples)) {
+			unsigned long avg = (unsigned long)
+			    div64_u64(my_hist->accumulate_lat,
+			    my_hist->total_samples);
+			snprintf(minstr, sizeof(minstr), "%ld",
+			    (long) my_hist->min_lat - my_hist->offset);
+			snprintf(avgstr, sizeof(avgstr), "%ld",
+			    (long) avg - my_hist->offset);
+			snprintf(maxstr, sizeof(maxstr), "%ld",
+			    (long) my_hist->max_lat - my_hist->offset);
+		} else {
+			strcpy(minstr, "<undef>");
+			strcpy(avgstr, minstr);
+			strcpy(maxstr, minstr);
+		}
+
+		seq_printf(m, "#Minimum latency: %s microseconds\n"
+			   "#Average latency: %s microseconds\n"
+			   "#Maximum latency: %s microseconds\n"
+			   "#Total samples: %llu\n"
+			   "#There are %llu samples lower than %ld"
+			   " microseconds.\n"
+			   "#There are %llu samples greater or equal"
+			   " than %ld microseconds.\n"
+			   "#usecs\t%16s\n",
+			   minstr, avgstr, maxstr,
+			   my_hist->total_samples,
+			   my_hist->below_hist_bound_samples,
+			   -my_hist->offset,
+			   my_hist->above_hist_bound_samples,
+			   MAX_ENTRY_NUM - my_hist->offset,
+			   "samples");
+	}
+	if (index < MAX_ENTRY_NUM) {
+		index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
+		if (index_ptr)
+			*index_ptr = index;	/* cursor = current bin number */
+	}
+
+	return index_ptr;	/* NULL past the last bin or on allocation failure */
+}
+/* seq_file next: step to the following bin; re-enable logging at the end. */
+static void *l_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	loff_t *index_ptr = p;
+	struct hist_data *my_hist = m->private;
+
+	if (++*pos >= MAX_ENTRY_NUM) {
+		atomic_inc(&my_hist->hist_mode);	/* pairs with l_start() */
+		return NULL;
+	}
+	*index_ptr = *pos;
+	return index_ptr;
+}
+
+static void l_stop(struct
seq_file *m, void *p) +{ + kfree(p); +} + +static int l_show(struct seq_file *m, void *p) +{ + int index = *(loff_t *) p; + struct hist_data *my_hist = m->private; + + seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset, + my_hist->hist_array[index]); + return 0; +} + +static struct seq_operations latency_hist_seq_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; + +static int latency_hist_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &latency_hist_seq_op); + if (!ret) { + struct seq_file *seq = file->private_data; + seq->private = inode->i_private; + } + return ret; +} + +static struct file_operations latency_hist_fops = { + .open = latency_hist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void hist_reset(struct hist_data *hist) +{ + atomic_dec(&hist->hist_mode); + + memset(hist->hist_array, 0, sizeof(hist->hist_array)); + hist->below_hist_bound_samples = 0ULL; + hist->above_hist_bound_samples = 0ULL; + hist->min_lat = 0xFFFFFFFFUL; + hist->max_lat = 0UL; + hist->total_samples = 0ULL; + hist->accumulate_lat = 0ULL; + + atomic_inc(&hist->hist_mode); +} + +static ssize_t +latency_hist_reset(struct file *file, const char __user *a, + size_t size, loff_t *off) +{ + int cpu; + struct hist_data *hist = NULL; +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + struct maxlatproc_data *mp = NULL; +#endif + off_t latency_type = (off_t) file->private_data; + + for_each_online_cpu(cpu) { + + switch (latency_type) { +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPTOFF_LATENCY: + hist = &per_cpu(preemptoff_hist, cpu); + break; +#endif +#ifdef CONFIG_INTERRUPT_OFF_HIST + case IRQSOFF_LATENCY: + hist = &per_cpu(irqsoff_hist, cpu); + break; +#endif +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + case PREEMPTIRQSOFF_LATENCY: + hist = &per_cpu(preemptirqsoff_hist, cpu); + break; +#endif +#ifdef 
CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + hist = &per_cpu(wakeup_latency_hist, cpu); + mp = &per_cpu(wakeup_maxlatproc, cpu); + break; + case WAKEUP_LATENCY_SHAREDPRIO: + hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu); + mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu); + break; +#endif +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + case MISSED_TIMER_OFFSETS: + hist = &per_cpu(missed_timer_offsets, cpu); + mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu); + break; +#endif + } + + hist_reset(hist); +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + if (latency_type == WAKEUP_LATENCY || + latency_type == WAKEUP_LATENCY_SHAREDPRIO || + latency_type == MISSED_TIMER_OFFSETS) { + mp->comm[0] = '\0'; + mp->prio = mp->pid = mp->latency = -1; + } +#endif + } + + return size; +} + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static ssize_t +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + unsigned long *this_pid = file->private_data; + + r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t do_pid(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + unsigned long pid; + unsigned long *this_pid = file->private_data; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = '\0'; + + if (strict_strtoul(buf, 10, &pid)) + return(-EINVAL); + + *this_pid = pid; + + return cnt; +} +#endif + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +static ssize_t +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int r; + struct maxlatproc_data *mp = file->private_data; + int strmaxlen = TASK_COMM_LEN + 32; + char *buf = kmalloc(strmaxlen, GFP_KERNEL); + + if (buf == NULL) + return -ENOMEM; + + r = snprintf(buf, strmaxlen, "%d %d %ld %s\n", 
+	    mp->pid, MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->comm);
+	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+	kfree(buf);
+	return r;
+}
+#endif
+/* Read handler of the "enable" files: prints the current enabled value. */
+static ssize_t
+show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	struct enable_data *ed = file->private_data;
+	int r;
+
+	r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+/* Write handler of the "enable" files: (un)register the tracepoint probes. */
+static ssize_t
+do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	long enable;
+	struct enable_data *ed = file->private_data;
+
+	if (cnt >= sizeof(buf))	/* leave room for the terminating NUL */
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	if (strict_strtol(buf, 10, &enable))
+		return(-EINVAL);
+
+	if ((enable && ed->enabled) || (!enable && !ed->enabled))
+		return cnt;	/* already in the requested state */
+
+	if (enable) {
+		int ret;
+
+		switch (ed->latency_type) {
+#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
+		case PREEMPTIRQSOFF_LATENCY:
+			ret = register_trace_preemptirqsoff_hist(
+			    probe_preemptirqsoff_hist);
+			if (ret) {
+				pr_info("wakeup trace: Couldn't assign "
+				    "probe_preemptirqsoff_hist "
+				    "to trace_preemptirqsoff_hist\n");
+				return ret;
+			}
+			break;
+#endif
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+		case WAKEUP_LATENCY:	/* four probes; undo earlier ones on failure */
+			ret = register_trace_sched_wakeup(
+			    probe_wakeup_latency_hist_start);
+			if (ret) {
+				pr_info("wakeup trace: Couldn't assign "
+				    "probe_wakeup_latency_hist_start "
+				    "to trace_sched_wakeup\n");
+				return ret;
+			}
+			ret = register_trace_sched_wakeup_new(
+			    probe_wakeup_latency_hist_start);
+			if (ret) {
+				pr_info("wakeup trace: Couldn't assign "
+				    "probe_wakeup_latency_hist_start "
+				    "to trace_sched_wakeup_new\n");
+				unregister_trace_sched_wakeup(
+				    probe_wakeup_latency_hist_start);
+				return ret;
+			}
+			ret = register_trace_sched_switch(
+			    probe_wakeup_latency_hist_stop);
+			if (ret) {
+				pr_info("wakeup trace: Couldn't assign "
+				    "probe_wakeup_latency_hist_stop "
+				    "to trace_sched_switch\n");
+				unregister_trace_sched_wakeup(
+				    probe_wakeup_latency_hist_start);
+				unregister_trace_sched_wakeup_new(
+				    probe_wakeup_latency_hist_start);
+				return ret;
+			}
+			ret = register_trace_sched_migrate_task(
+			    probe_sched_migrate_task);
+			if (ret) {
+				pr_info("wakeup trace: Couldn't assign "
+				    "probe_sched_migrate_task "
+				    "to trace_sched_migrate_task\n");
+				unregister_trace_sched_wakeup(
+				    probe_wakeup_latency_hist_start);
+				unregister_trace_sched_wakeup_new(
+				    probe_wakeup_latency_hist_start);
+				unregister_trace_sched_switch(
+				    probe_wakeup_latency_hist_stop);
+				return ret;
+			}
+			break;
+#endif
+#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
+		case MISSED_TIMER_OFFSETS:
+			ret = register_trace_hrtimer_interrupt(
+			    probe_hrtimer_interrupt);
+			if (ret) {
+				pr_info("wakeup trace: Couldn't assign "
+				    "probe_hrtimer_interrupt "
+				    "to trace_hrtimer_interrupt\n");
+				return ret;
+			}
+			break;
+#endif
+		default:
+			break;
+		}
+	} else {
+		switch (ed->latency_type) {
+#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
+		case PREEMPTIRQSOFF_LATENCY:
+			{
+				int cpu;
+
+				unregister_trace_preemptirqsoff_hist(
+				    probe_preemptirqsoff_hist);
+				for_each_online_cpu(cpu) {	/* drop open sections */
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+					per_cpu(hist_irqsoff_counting,
+					    cpu) = 0;
+#endif
+#ifdef CONFIG_PREEMPT_OFF_HIST
+					per_cpu(hist_preemptoff_counting,
+					    cpu) = 0;
+#endif
+#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
+					per_cpu(hist_preemptirqsoff_counting,
+					    cpu) = 0;
+#endif
+				}
+			}
+			break;
+#endif
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+		case WAKEUP_LATENCY:
+			{
+				int cpu;
+
+				unregister_trace_sched_wakeup(
+				    probe_wakeup_latency_hist_start);
+				unregister_trace_sched_wakeup_new(
+				    probe_wakeup_latency_hist_start);
+				unregister_trace_sched_switch(
+				    probe_wakeup_latency_hist_stop);
+				unregister_trace_sched_migrate_task(
+				    probe_sched_migrate_task);
+
+				for_each_online_cpu(cpu) {
+					per_cpu(wakeup_task, cpu) =
NULL; /* NOTE(review): looks like the stored task ref is not put - confirm */
+					per_cpu(wakeup_sharedprio, cpu) = 0;
+				}
+			}
+			break;
+#endif
+#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
+		case MISSED_TIMER_OFFSETS:
+			unregister_trace_hrtimer_interrupt(
+			    probe_hrtimer_interrupt);
+			break;
+#endif
+		default:
+			break;
+		}
+	}
+	ed->enabled = enable;	/* stores the written value, not just 0/1 */
+	return cnt;
+}
+
+static const struct file_operations latency_hist_reset_fops = {
+	.open = tracing_open_generic,
+	.write = latency_hist_reset,
+};
+
+static const struct file_operations enable_fops = {
+	.open = tracing_open_generic,
+	.read = show_enable,
+	.write = do_enable,
+};
+
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
+    defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+static const struct file_operations pid_fops = {
+	.open = tracing_open_generic,
+	.read = show_pid,
+	.write = do_pid,
+};
+
+static const struct file_operations maxlatproc_fops = {
+	.open = tracing_open_generic,
+	.read = show_maxlatproc,
+};
+#endif
+
+#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
+static notrace void probe_preemptirqsoff_hist(int reason, int starthist)
+{	/* starthist != 0: a section begins; 0: it ends and is logged */
+	int cpu = raw_smp_processor_id();
+	int time_set = 0;	/* non-zero once start/stop was read this call */
+
+	if (starthist) {
+		cycle_t uninitialized_var(start);
+
+		if (!preempt_count() && !irqs_disabled())
+			return;	/* neither preemption nor irqs are off */
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+		if ((reason == IRQS_OFF || reason == TRACE_START) &&
+		    !per_cpu(hist_irqsoff_counting, cpu)) {
+			per_cpu(hist_irqsoff_counting, cpu) = 1;
+			start = ftrace_now(cpu);
+			time_set++;
+			per_cpu(hist_irqsoff_start, cpu) = start;
+		}
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+		if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
+		    !per_cpu(hist_preemptoff_counting, cpu)) {
+			per_cpu(hist_preemptoff_counting, cpu) = 1;
+			if (!(time_set++))	/* reuse a timestamp taken above */
+				start = ftrace_now(cpu);
+			per_cpu(hist_preemptoff_start, cpu) = start;
+		}
+#endif
+
+#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
+		if (per_cpu(hist_irqsoff_counting, cpu) &&
+		    per_cpu(hist_preemptoff_counting, cpu) &&
+		    !per_cpu(hist_preemptirqsoff_counting, cpu)) {
+			per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
+			if (!time_set)
+				start = ftrace_now(cpu);
+			per_cpu(hist_preemptirqsoff_start, cpu) = start;
+		}
+#endif
+	} else {
+		cycle_t uninitialized_var(stop);
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+		if ((reason == IRQS_ON || reason == TRACE_STOP) &&
+		    per_cpu(hist_irqsoff_counting, cpu)) {
+			cycle_t start = per_cpu(hist_irqsoff_start, cpu);
+
+			stop = ftrace_now(cpu);
+			time_set++;
+			if (start && stop >= start) {	/* skip unset/backward times */
+				unsigned long latency =
+				    nsecs_to_usecs(stop - start);
+
+				latency_hist(IRQSOFF_LATENCY, cpu, latency,
+				    NULL);
+			}
+			per_cpu(hist_irqsoff_counting, cpu) = 0;
+		}
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+		if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
+		    per_cpu(hist_preemptoff_counting, cpu)) {
+			cycle_t start = per_cpu(hist_preemptoff_start, cpu);
+
+			if (!(time_set++))
+				stop = ftrace_now(cpu);
+			if (start && stop >= start) {
+				unsigned long latency =
+				    nsecs_to_usecs(stop - start);
+
+				latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
+				    NULL);
+			}
+			per_cpu(hist_preemptoff_counting, cpu) = 0;
+		}
+#endif
+
+#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
+		if ((!per_cpu(hist_irqsoff_counting, cpu) ||
+		     !per_cpu(hist_preemptoff_counting, cpu)) &&
+		    per_cpu(hist_preemptirqsoff_counting, cpu)) {
+			cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
+
+			if (!time_set)
+				stop = ftrace_now(cpu);
+			if (start && stop >= start) {
+				unsigned long latency =
+				    nsecs_to_usecs(stop - start);
+				latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
+				    latency, NULL);
+			}
+			per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
+		}
+#endif
+	}
+}
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+static DEFINE_RAW_SPINLOCK(wakeup_lock);	/* taken around wakeup_task updates */
+static notrace void probe_sched_migrate_task(struct task_struct *task, int cpu)
+{
+	int old_cpu = task_cpu(task);
+
+	if (cpu != old_cpu) {
+		unsigned long flags;
+		struct task_struct *cpu_wakeup_task;
+
+		
raw_spin_lock_irqsave(&wakeup_lock, flags);
+
+		cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
+		if (task == cpu_wakeup_task) {	/* the timed task is moving */
+			put_task_struct(cpu_wakeup_task);
+			per_cpu(wakeup_task, old_cpu) = NULL;
+			cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
+			get_task_struct(cpu_wakeup_task);
+		}	/* NOTE(review): a task already stored for cpu is not put */
+
+		raw_spin_unlock_irqrestore(&wakeup_lock, flags);
+	}
+}
+/* Wakeup probe: decide whether p becomes the task timed on its CPU. */
+static notrace void probe_wakeup_latency_hist_start(struct rq *rq,
+	struct task_struct *p, int success)
+{
+	unsigned long flags;
+	struct task_struct *curr = rq_curr(rq);
+	int cpu = task_cpu(p);
+	struct task_struct *cpu_wakeup_task;
+
+	raw_spin_lock_irqsave(&wakeup_lock, flags);
+
+	cpu_wakeup_task = per_cpu(wakeup_task, cpu);
+
+	if (wakeup_pid) {	/* pid filter set: only that pid is timed */
+		if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
+		    p->prio == curr->prio)
+			per_cpu(wakeup_sharedprio, cpu) = 1;
+		if (likely(wakeup_pid != task_pid_nr(p)))
+			goto out;
+	} else {	/* no filter: p must be RT with prio value <= both others */
+		if (likely(!rt_task(p)) ||
+		    (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
+		    p->prio > curr->prio)
+			goto out;
+		if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
+		    p->prio == curr->prio)
+			per_cpu(wakeup_sharedprio, cpu) = 1;
+	}
+
+	if (cpu_wakeup_task)
+		put_task_struct(cpu_wakeup_task);	/* replace the old entry */
+	cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
+	get_task_struct(cpu_wakeup_task);
+	cpu_wakeup_task->preempt_timestamp_hist =
+	    ftrace_now(raw_smp_processor_id());	/* wakeup time */
+out:
+	raw_spin_unlock_irqrestore(&wakeup_lock, flags);
+}
+/* Switch probe: log the wakeup latency when the timed task is switched to. */
+static notrace void probe_wakeup_latency_hist_stop(struct rq *rq,
+	struct task_struct *prev, struct task_struct *next)
+{
+	unsigned long flags;
+	int cpu = task_cpu(next);
+	unsigned long latency;
+	cycle_t stop;
+	struct task_struct *cpu_wakeup_task;
+
+	raw_spin_lock_irqsave(&wakeup_lock, flags);
+
+	cpu_wakeup_task = per_cpu(wakeup_task, cpu);
+
+	if (cpu_wakeup_task == NULL)
+		goto out;	/* nothing is being timed on this CPU */
+
+	/* Already running? */
+	if (unlikely(current == cpu_wakeup_task))
+		goto out_reset;
+
+	if (next != cpu_wakeup_task) {
+		if (next->prio < cpu_wakeup_task->prio)
+			goto out_reset;	/* stop timing without logging */
+
+		if (next->prio == cpu_wakeup_task->prio)
+			per_cpu(wakeup_sharedprio, cpu) = 1;
+
+		goto out;
+	}
+
+	/*
+	 * The task we are waiting for is about to be switched to.
+	 * Calculate latency and store it in histogram.
+	 */
+	stop = ftrace_now(raw_smp_processor_id());
+
+	latency = nsecs_to_usecs(stop - next->preempt_timestamp_hist);
+
+	if (per_cpu(wakeup_sharedprio, cpu)) {
+		latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, next);
+		per_cpu(wakeup_sharedprio, cpu) = 0;
+	} else
+		latency_hist(WAKEUP_LATENCY, cpu, latency, next);
+
+out_reset:
+	put_task_struct(cpu_wakeup_task);	/* drop the ref taken at wakeup */
+	per_cpu(wakeup_task, cpu) = NULL;
+out:
+	raw_spin_unlock_irqrestore(&wakeup_lock, flags);
+}
+#endif
+
+#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
+static notrace void probe_hrtimer_interrupt(int cpu, long long latency_ns,
+	struct task_struct *curr, struct task_struct *task)
+{	/* logs -latency_ns in usecs; only values <= 0 are accepted below */
+	if (latency_ns <= 0 && task != NULL && rt_task(task) &&
+	    task->prio < curr->prio) {
+		unsigned long latency;
+
+		if (missed_timer_offsets_pid) {	/* optional pid filter */
+			if (likely(missed_timer_offsets_pid !=
+			    task_pid_nr(task)))
+				return;
+		}
+
+		latency = (unsigned long) div_s64(-latency_ns, 1000);
+		latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, task);
+	}
+}
+#endif
+
+static __init int latency_hist_init(void)
+{
+	struct dentry *latency_hist_root = NULL;
+	struct dentry *dentry;
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+	struct dentry *dentry_sharedprio;
+#endif
+	struct dentry *entry;
+	struct dentry *enable_root;
+	int i = 0;
+	struct hist_data *my_hist;
+	char name[64];
+	char *cpufmt = "CPU%d";	/* name of each per-CPU histogram file */
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
+    defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+	char *cpufmt_maxlatproc = "max_latency-CPU%d";
+	struct maxlatproc_data *mp = NULL;
+#endif
+
+	dentry = tracing_init_dentry();
+	latency_hist_root =
debugfs_create_dir(latency_hist_dir_root, dentry); + enable_root = debugfs_create_dir("enable", latency_hist_root); + +#ifdef CONFIG_INTERRUPT_OFF_HIST + dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(irqsoff_hist, i), &latency_hist_fops); + my_hist = &per_cpu(irqsoff_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops); +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + dentry = debugfs_create_dir(preemptoff_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(preemptoff_hist, i), &latency_hist_fops); + my_hist = &per_cpu(preemptoff_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops); +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + dentry = debugfs_create_dir(preemptirqsoff_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops); + my_hist = &per_cpu(preemptirqsoff_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops); +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) + entry = debugfs_create_file("preemptirqsoff", 0644, + enable_root, (void *)&preemptirqsoff_enabled_data, + &enable_fops); +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + dentry = debugfs_create_dir(wakeup_latency_hist_dir, + latency_hist_root); 
+ dentry_sharedprio = debugfs_create_dir( + wakeup_latency_hist_dir_sharedprio, dentry); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(wakeup_latency_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(wakeup_latency_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + + entry = debugfs_create_file(name, 0444, dentry_sharedprio, + &per_cpu(wakeup_latency_hist_sharedprio, i), + &latency_hist_fops); + my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + + sprintf(name, cpufmt_maxlatproc, i); + + mp = &per_cpu(wakeup_maxlatproc, i); + entry = debugfs_create_file(name, 0444, dentry, mp, + &maxlatproc_fops); + mp->prio = mp->pid = mp->latency = -1; + + mp = &per_cpu(wakeup_maxlatproc_sharedprio, i); + entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp, + &maxlatproc_fops); + mp->prio = mp->pid = mp->latency = -1; + } + entry = debugfs_create_file("pid", 0644, dentry, + (void *)&wakeup_pid, &pid_fops); + entry = debugfs_create_file("reset", 0644, dentry, + (void *)WAKEUP_LATENCY, &latency_hist_reset_fops); + entry = debugfs_create_file("reset", 0644, dentry_sharedprio, + (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops); + entry = debugfs_create_file("wakeup", 0644, + enable_root, (void *)&wakeup_latency_enabled_data, + &enable_fops); +#endif + +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + dentry = debugfs_create_dir(missed_timer_offsets_dir, + latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(missed_timer_offsets, i), &latency_hist_fops); + my_hist = &per_cpu(missed_timer_offsets, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + + sprintf(name, cpufmt_maxlatproc, i); + mp = &per_cpu(missed_timer_offsets_maxlatproc, i); + entry = debugfs_create_file(name, 
0444, dentry, mp, + &maxlatproc_fops); + mp->prio = mp->pid = mp->latency = -1; + } + entry = debugfs_create_file("pid", 0644, dentry, + (void *)&missed_timer_offsets_pid, &pid_fops); + entry = debugfs_create_file("reset", 0644, dentry, + (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops); + entry = debugfs_create_file("missed_timer_offsets", 0644, + enable_root, (void *)&missed_timer_offsets_enabled_data, + &enable_fops); +#endif + return 0; +} + +__initcall(latency_hist_init); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8c1b2d2..9e095ef 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -422,7 +422,7 @@ int ring_buffer_print_page_header(struct trace_seq *s) struct ring_buffer_per_cpu { int cpu; struct ring_buffer *buffer; - spinlock_t reader_lock; /* serialize readers */ + raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; struct list_head *pages; @@ -998,7 +998,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; - spin_lock_init(&cpu_buffer->reader_lock); + raw_spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -1191,11 +1191,12 @@ static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static void rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { - struct buffer_page *bpage; + struct buffer_page *bpage, *tmp; struct list_head *p; + LIST_HEAD(tofree); unsigned i; - spin_lock_irq(&cpu_buffer->reader_lock); + raw_spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1203,8 +1204,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) return; p = cpu_buffer->pages->next; bpage = list_entry(p, struct buffer_page, list); - 
list_del_init(&bpage->list); - free_buffer_page(bpage); + list_del(&bpage->list); + list_add(&bpage->list, &tofree); } if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) return; @@ -1212,7 +1213,13 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) rb_reset_cpu(cpu_buffer); rb_check_pages(cpu_buffer); - spin_unlock_irq(&cpu_buffer->reader_lock); + raw_spin_unlock_irq(&cpu_buffer->reader_lock); + + list_for_each_entry_safe(bpage, tmp, &tofree, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + } static void @@ -1223,7 +1230,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *p; unsigned i; - spin_lock_irq(&cpu_buffer->reader_lock); + raw_spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1237,7 +1244,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_reset_cpu(cpu_buffer); rb_check_pages(cpu_buffer); - spin_unlock_irq(&cpu_buffer->reader_lock); + raw_spin_unlock_irq(&cpu_buffer->reader_lock); } /** @@ -2739,9 +2746,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) cpu_buffer = iter->cpu_buffer; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_iter_reset(iter); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); @@ -3175,12 +3182,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) again: local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(cpu_buffer, ts); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) @@ 
-3205,9 +3212,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) unsigned long flags; again: - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; @@ -3243,14 +3250,14 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(cpu_buffer, ts); if (event) rb_advance_reader(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); out: @@ -3296,11 +3303,11 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return iter; } @@ -3337,7 +3344,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); again: event = rb_iter_peek(iter, ts); if (!event) @@ -3348,7 +3355,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) rb_advance_iter(iter); out: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return event; } @@ -3414,7 +3421,7 @@ void 
ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) goto out; @@ -3426,7 +3433,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) arch_spin_unlock(&cpu_buffer->lock); out: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); } @@ -3464,10 +3471,10 @@ int ring_buffer_empty(struct ring_buffer *buffer) cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); if (!ret) @@ -3498,10 +3505,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); return ret; @@ -3696,7 +3703,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (!bpage) goto out; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); reader = rb_get_reader_page(cpu_buffer); if (!reader) @@ -3771,7 +3778,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, ret = read; out_unlock: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); out: return ret; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index eac6875..4af5b21 100644 --- 
a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -258,7 +258,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_GRAPH_TIME; static int trace_stop_count; -static DEFINE_SPINLOCK(tracing_start_lock); +static DEFINE_RAW_SPINLOCK(tracing_start_lock); /** * trace_wake_up - wake up tasks waiting for trace input @@ -272,6 +272,11 @@ void trace_wake_up(void) if (trace_flags & TRACE_ITER_BLOCK) return; + +#ifdef CONFIG_PREEMPT_RT + if (in_atomic() || irqs_disabled()) + return; +#endif /* * The runqueue_is_locked() can fail, but this is the best we * have for now: @@ -847,7 +852,7 @@ void tracing_start(void) if (tracing_disabled) return; - spin_lock_irqsave(&tracing_start_lock, flags); + raw_spin_lock_irqsave(&tracing_start_lock, flags); if (--trace_stop_count) { if (trace_stop_count < 0) { /* Someone screwed up their debugging */ @@ -868,7 +873,7 @@ void tracing_start(void) ftrace_start(); out: - spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&tracing_start_lock, flags); } /** @@ -883,7 +888,7 @@ void tracing_stop(void) unsigned long flags; ftrace_stop(); - spin_lock_irqsave(&tracing_start_lock, flags); + raw_spin_lock_irqsave(&tracing_start_lock, flags); if (trace_stop_count++) goto out; @@ -896,7 +901,7 @@ void tracing_stop(void) ring_buffer_record_disable(buffer); out: - spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&tracing_start_lock, flags); } void trace_stop_cmdline_recording(void); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b1342c5..c2eef67 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -608,7 +608,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) int ret, len; int i; - sprintf(msecs_str, "%lu", (unsigned long) duration); + snprintf(msecs_str, sizeof(msecs_str), "%lu", (unsigned long) duration); /* Print msecs */ ret = 
trace_seq_printf(s, "%s", msecs_str); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2974bc7..4ab5d9a 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -17,6 +17,7 @@ #include <linux/fs.h> #include "trace.h" +#include <trace/events/hist.h> static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -249,11 +250,13 @@ void start_critical_timings(void) { if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + trace_preemptirqsoff_hist(TRACE_START, 1); } EXPORT_SYMBOL_GPL(start_critical_timings); void stop_critical_timings(void) { + trace_preemptirqsoff_hist(TRACE_STOP, 0); if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -263,6 +266,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings); #ifdef CONFIG_PROVE_LOCKING void time_hardirqs_on(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } @@ -271,6 +275,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) { if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } #else /* !CONFIG_PROVE_LOCKING */ @@ -304,6 +309,7 @@ inline void print_irqtrace_events(struct task_struct *curr) */ void trace_hardirqs_on(void) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -313,11 +319,13 @@ void trace_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } EXPORT_SYMBOL(trace_hardirqs_off); void trace_hardirqs_on_caller(unsigned long caller_addr) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } @@ -327,6 +335,7 @@ void 
trace_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -336,12 +345,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(PREEMPT_ON, 0); if (preempt_trace()) stop_critical_timing(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(PREEMPT_OFF, 1); if (preempt_trace()) start_critical_timing(a0, a1); } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6ea90c0..50b1b82 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -689,7 +689,7 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } /* an address specified */ - ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); + ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); if (ret) { pr_info("Failed to parse address.\n"); return ret; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 678a512..f4bc9b2 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, unsigned long val, flags; char buf[64]; int ret; + int cpu; if (count >= sizeof(buf)) return -EINVAL; @@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, return ret; local_irq_save(flags); + + /* + * In case we trace inside arch_spin_lock() or after (NMI), + * we will cause circular lock, so we also need to increase + * the percpu trace_active here. 
+ */ + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + arch_spin_lock(&max_stack_lock); *ptr = val; arch_spin_unlock(&max_stack_lock); + + per_cpu(trace_active, cpu)--; local_irq_restore(flags); return count; @@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { + int cpu; + local_irq_disable(); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + arch_spin_lock(&max_stack_lock); if (*pos == 0) @@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void t_stop(struct seq_file *m, void *p) { + int cpu; + arch_spin_unlock(&max_stack_lock); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)--; + local_irq_enable(); } diff --git a/kernel/user.c b/kernel/user.c index 46d0165..0c81912 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -416,11 +416,11 @@ void free_uid(struct user_struct *up) if (!up) return; - local_irq_save(flags); + local_irq_save_nort(flags); if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) free_user(up, flags); else - local_irq_restore(flags); + local_irq_restore_nort(flags); } struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dee4865..26b8839 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -26,6 +26,7 @@ #include <linux/slab.h> #include <linux/cpu.h> #include <linux/notifier.h> +#include <linux/syscalls.h> #include <linux/kthread.h> #include <linux/hardirq.h> #include <linux/mempolicy.h> @@ -36,6 +37,8 @@ #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> +#include <asm/uaccess.h> + /* * The per-CPU workqueue (if single thread, we always use the first * possible cpu). @@ -270,13 +273,14 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, * * We queue the work to the CPU on which it was submitted, but if the CPU dies * it can be processed by another CPU. + * + * Especially no such guarantee on PREEMPT_RT. 
*/ int queue_work(struct workqueue_struct *wq, struct work_struct *work) { - int ret; + int ret = 0, cpu = raw_smp_processor_id(); - ret = queue_work_on(get_cpu(), wq, work); - put_cpu(); + ret = queue_work_on(cpu, wq, work); return ret; } @@ -313,7 +317,7 @@ static void delayed_work_timer_fn(unsigned long __data) struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); struct workqueue_struct *wq = cwq->wq; - __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); + __queue_work(wq_per_cpu(wq, raw_smp_processor_id()), &dwork->work); } /** @@ -774,9 +778,9 @@ void flush_delayed_work(struct delayed_work *dwork) { if (del_timer_sync(&dwork->timer)) { struct cpu_workqueue_struct *cwq; - cwq = wq_per_cpu(keventd_wq, get_cpu()); + int cpu = raw_smp_processor_id(); + cwq = wq_per_cpu(keventd_wq, cpu); __queue_work(cwq, &dwork->work); - put_cpu(); } flush_work(&dwork->work); } @@ -1044,6 +1048,49 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) cwq->thread = NULL; } +void set_workqueue_thread_prio(struct workqueue_struct *wq, int cpu, + int policy, int rt_priority, int nice) +{ + struct sched_param param = { .sched_priority = rt_priority }; + struct cpu_workqueue_struct *cwq; + mm_segment_t oldfs = get_fs(); + struct task_struct *p; + unsigned long flags; + int ret; + + cwq = per_cpu_ptr(wq->cpu_wq, cpu); + spin_lock_irqsave(&cwq->lock, flags); + p = cwq->thread; + spin_unlock_irqrestore(&cwq->lock, flags); + + set_user_nice(p, nice); + + set_fs(KERNEL_DS); + ret = sys_sched_setscheduler(p->pid, policy, &param); + set_fs(oldfs); + + WARN_ON(ret); +} + +void set_workqueue_prio(struct workqueue_struct *wq, int policy, + int rt_priority, int nice) +{ + int cpu; + + /* We don't need the distraction of CPUs appearing and vanishing.
*/ + get_online_cpus(); + spin_lock(&workqueue_lock); + if (is_wq_single_threaded(wq)) + set_workqueue_thread_prio(wq, 0, policy, rt_priority, nice); + else { + for_each_online_cpu(cpu) + set_workqueue_thread_prio(wq, cpu, policy, + rt_priority, nice); + } + spin_unlock(&workqueue_lock); + put_online_cpus(); +} + /** * destroy_workqueue - safely terminate a workqueue * @wq: target workqueue @@ -1176,4 +1223,5 @@ void __init init_workqueues(void) hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); + set_workqueue_prio(keventd_wq, SCHED_FIFO, 1, -20); } diff --git a/lib/Kconfig b/lib/Kconfig index 97b136f..38dc31e 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -183,6 +183,7 @@ config HAVE_LMB config CPUMASK_OFFSTACK bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS + depends on !PREEMPT_RT && BROKEN help Use dynamic allocation for cpumask_var_t, instead of putting them on the stack. This is a bit more expensive, but avoids diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 25c3ed5..cbf6e02 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -144,7 +144,7 @@ config DEBUG_KERNEL config DEBUG_SHIRQ bool "Debug shared IRQ handlers" - depends on DEBUG_KERNEL && GENERIC_HARDIRQS + depends on DEBUG_KERNEL && GENERIC_HARDIRQS && !PREEMPT_RT help Enable this to generate a spurious interrupt as soon as a shared interrupt handler is registered, and just before one is deregistered. @@ -415,6 +415,8 @@ config DEBUG_RT_MUTEXES help This allows rt mutex semantics violations and rt mutex related deadlocks (lockups) to be detected and reported automatically. 
+ When realtime preemption is enabled this includes spinlocks, + rwlocks, mutexes and (rw)semaphores config DEBUG_PI_LIST bool @@ -438,7 +440,7 @@ config DEBUG_SPINLOCK config DEBUG_MUTEXES bool "Mutex debugging: basic checks" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && !PREEMPT_RT help This feature allows mutex semantics violations to be detected and reported. diff --git a/lib/Makefile b/lib/Makefile index 3b0b4a6..2d21722 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -34,7 +34,8 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o +obj-$(CONFIG_PREEMPT_RT) += plist.o +obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o diff --git a/lib/idr.c b/lib/idr.c index 1cac726..0dc7822 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -156,10 +156,12 @@ static int sub_alloc(struct idr *idp, int *starting_id, struct idr_layer **pa) id = (id | ((1 << (IDR_BITS * l)) - 1)) + 1; /* if already at the top layer, we need to grow */ - if (!(p = pa[l])) { + if (id >= 1 << (idp->layers * IDR_BITS)) { *starting_id = id; return IDR_NEED_TO_GROW; } + p = pa[l]; + BUG_ON(!p); /* If we need to go up one layer, continue the * loop; otherwise, restart from the top. 
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c index b135d04..bc62ed8 100644 --- a/lib/kernel_lock.c +++ b/lib/kernel_lock.c @@ -7,114 +7,64 @@ */ #include <linux/module.h> #include <linux/kallsyms.h> -#include <linux/semaphore.h> +#include <linux/mutex.h> #include <linux/smp_lock.h> #define CREATE_TRACE_POINTS #include <trace/events/bkl.h> /* - * The 'big kernel lock' + * The 'big kernel semaphore' * - * This spinlock is taken and released recursively by lock_kernel() + * This mutex is taken and released recursively by lock_kernel() * and unlock_kernel(). It is transparently dropped and reacquired * over schedule(). It is used to protect legacy code that hasn't * been migrated to a proper locking design yet. * + * Note: code locked by this semaphore will only be serialized against + * other code using the same locking facility. The code guarantees that + * the task remains on the same CPU. + * * Don't use in new code. */ -static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(kernel_flag); - +DEFINE_MUTEX(kernel_sem); /* - * Acquire/release the underlying lock from the scheduler. + * Re-acquire the kernel semaphore. * - * This is called with preemption disabled, and should - * return an error value if it cannot get the lock and - * TIF_NEED_RESCHED gets set. + * This function is called with preemption off. * - * If it successfully gets the lock, it should increment - * the preemption count like any spinlock does. + * We are executing in schedule() so the code must be extremely careful + * about recursion, both due to the down() and due to the enabling of + * preemption. schedule() will re-check the preemption flag after + * reacquiring the semaphore. * - * (This works on UP too - do_raw_spin_trylock will never - * return false in that case) + * Called with interrupts disabled. 
*/ int __lockfunc __reacquire_kernel_lock(void) { - while (!do_raw_spin_trylock(&kernel_flag)) { - if (need_resched()) - return -EAGAIN; - cpu_relax(); - } - preempt_disable(); - return 0; -} + int saved_lock_depth = current->lock_depth; -void __lockfunc __release_kernel_lock(void) -{ - do_raw_spin_unlock(&kernel_flag); - preempt_enable_no_resched(); -} + BUG_ON(saved_lock_depth < 0); -/* - * These are the BKL spinlocks - we try to be polite about preemption. - * If SMP is not on (ie UP preemption), this all goes away because the - * do_raw_spin_trylock() will always succeed. - */ -#ifdef CONFIG_PREEMPT -static inline void __lock_kernel(void) -{ - preempt_disable(); - if (unlikely(!do_raw_spin_trylock(&kernel_flag))) { - /* - * If preemption was disabled even before this - * was called, there's nothing we can be polite - * about - just spin. - */ - if (preempt_count() > 1) { - do_raw_spin_lock(&kernel_flag); - return; - } + current->lock_depth = -1; + local_irq_enable(); - /* - * Otherwise, let's wait for the kernel lock - * with preemption enabled.. - */ - do { - preempt_enable(); - while (raw_spin_is_locked(&kernel_flag)) - cpu_relax(); - preempt_disable(); - } while (!do_raw_spin_trylock(&kernel_flag)); - } -} + mutex_lock(&kernel_sem); -#else + local_irq_disable(); + current->lock_depth = saved_lock_depth; -/* - * Non-preemption case - just get the spinlock - */ -static inline void __lock_kernel(void) -{ - do_raw_spin_lock(&kernel_flag); + return 0; } -#endif -static inline void __unlock_kernel(void) +void __lockfunc __release_kernel_lock(void) { - /* - * the BKL is not covered by lockdep, so we open-code the - * unlocking sequence (and thus avoid the dep-chain ops): - */ - do_raw_spin_unlock(&kernel_flag); - preempt_enable(); + mutex_unlock(&kernel_sem); } /* - * Getting the big kernel lock. - * - * This cannot happen asynchronously, so we only need to - * worry about other CPU's. + * Getting the big kernel semaphore. 
*/ void __lockfunc _lock_kernel(const char *func, const char *file, int line) { @@ -124,17 +74,28 @@ void __lockfunc _lock_kernel(const char *func, const char *file, int line) if (likely(!depth)) { might_sleep(); - __lock_kernel(); + /* + * No recursion worries - we set up lock_depth _after_ + */ + mutex_lock(&kernel_sem); +#ifdef CONFIG_DEBUG_RT_MUTEXES + current->last_kernel_lock = __builtin_return_address(0); +#endif } + current->lock_depth = depth; } void __lockfunc _unlock_kernel(const char *func, const char *file, int line) { BUG_ON(current->lock_depth < 0); - if (likely(--current->lock_depth < 0)) - __unlock_kernel(); + if (likely(--current->lock_depth < 0)) { +#ifdef CONFIG_DEBUG_RT_MUTEXES + current->last_kernel_lock = NULL; +#endif + mutex_unlock(&kernel_sem); + } trace_unlock_kernel(func, file, line); } diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 619313e..65e7eab 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -158,7 +158,7 @@ static void init_shared_classes(void) local_bh_disable(); \ local_irq_disable(); \ lockdep_softirq_enter(); \ - WARN_ON(!in_softirq()); + /* FIXME: preemptible softirqs. 
WARN_ON(!in_softirq()); */ #define SOFTIRQ_EXIT() \ lockdep_softirq_exit(); \ @@ -550,6 +550,11 @@ GENERATE_TESTCASE(init_held_rsem) #undef E /* + * FIXME: turn these into raw-spinlock tests on -rt + */ +#ifndef CONFIG_PREEMPT_RT + +/* * locking an irq-safe lock with irqs enabled: */ #define E1() \ @@ -890,6 +895,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft) #include "locking-selftest-softirq.h" // GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft) +#endif /* !CONFIG_PREEMPT_RT */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC # define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map) # define I_RWLOCK(x) lockdep_reset_lock(&rwlock_##x.dep_map) @@ -998,7 +1005,7 @@ static inline void print_testname(const char *testname) #define DO_TESTCASE_1(desc, name, nr) \ print_testname(desc"/"#nr); \ - dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ + dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ printk("\n"); #define DO_TESTCASE_1B(desc, name, nr) \ @@ -1006,17 +1013,17 @@ static inline void print_testname(const char *testname) dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK); \ printk("\n"); -#define DO_TESTCASE_3(desc, name, nr) \ - print_testname(desc"/"#nr); \ - dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \ - dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ +#define DO_TESTCASE_3(desc, name, nr) \ + print_testname(desc"/"#nr); \ + dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \ + dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ printk("\n"); -#define DO_TESTCASE_3RW(desc, name, nr) \ - print_testname(desc"/"#nr); \ +#define DO_TESTCASE_3RW(desc, name, nr) \ + print_testname(desc"/"#nr); \ dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN|LOCKTYPE_RWLOCK);\ - dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ + dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ printk("\n"); @@ -1047,7 +1054,7 @@ static inline void
print_testname(const char *testname) print_testname(desc); \ dotest(name##_spin, FAILURE, LOCKTYPE_SPIN); \ dotest(name##_wlock, FAILURE, LOCKTYPE_RWLOCK); \ - dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ + dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \ dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ @@ -1179,6 +1186,7 @@ void locking_selftest(void) /* * irq-context testcases: */ +#ifndef CONFIG_PREEMPT_RT DO_TESTCASE_2x6("irqs-on + irq-safe-A", irqsafe1); DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A); DO_TESTCASE_2x6("safe-A + irqs-on", irqsafe2B); @@ -1188,6 +1196,7 @@ void locking_selftest(void) DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion); // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2); +#endif if (unexpected_testcase_failures) { printk("-----------------------------------------------------------------\n"); diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index aeaa6d7..10fb740 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -16,13 +16,13 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { int cpu; - spin_lock(&fbc->lock); + raw_spin_lock(&fbc->lock); for_each_possible_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); *pcount = 0; } fbc->count = amount; - spin_unlock(&fbc->lock); + raw_spin_unlock(&fbc->lock); } EXPORT_SYMBOL(percpu_counter_set); @@ -35,10 +35,10 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) pcount = per_cpu_ptr(fbc->counters, cpu); count = *pcount + amount; if (count >= batch || count <= -batch) { - spin_lock(&fbc->lock); + raw_spin_lock(&fbc->lock); fbc->count += count; *pcount = 0; - spin_unlock(&fbc->lock); + raw_spin_unlock(&fbc->lock); } else { *pcount = count; } @@ -55,13 +55,13 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) s64 ret; int cpu; - spin_lock(&fbc->lock); + raw_spin_lock(&fbc->lock); ret = 
fbc->count; for_each_online_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } - spin_unlock(&fbc->lock); + raw_spin_unlock(&fbc->lock); return ret; } EXPORT_SYMBOL(__percpu_counter_sum); @@ -69,7 +69,7 @@ EXPORT_SYMBOL(__percpu_counter_sum); int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, struct lock_class_key *key) { - spin_lock_init(&fbc->lock); + raw_spin_lock_init(&fbc->lock); lockdep_set_class(&fbc->lock, key); fbc->count = amount; fbc->counters = alloc_percpu(s32); @@ -126,11 +126,11 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb, s32 *pcount; unsigned long flags; - spin_lock_irqsave(&fbc->lock, flags); + raw_spin_lock_irqsave(&fbc->lock, flags); pcount = per_cpu_ptr(fbc->counters, cpu); fbc->count += *pcount; *pcount = 0; - spin_unlock_irqrestore(&fbc->lock, flags); + raw_spin_unlock_irqrestore(&fbc->lock, flags); } mutex_unlock(&percpu_counters_lock); #endif diff --git a/lib/proportions.c b/lib/proportions.c index d50746a..05df848 100644 --- a/lib/proportions.c +++ b/lib/proportions.c @@ -190,7 +190,7 @@ prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift) int prop_local_init_percpu(struct prop_local_percpu *pl) { - spin_lock_init(&pl->lock); + raw_spin_lock_init(&pl->lock); pl->shift = 0; pl->period = 0; return percpu_counter_init(&pl->events, 0); @@ -226,7 +226,7 @@ void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl) if (pl->period == global_period) return; - spin_lock_irqsave(&pl->lock, flags); + raw_spin_lock_irqsave(&pl->lock, flags); prop_adjust_shift(&pl->shift, &pl->period, pg->shift); /* @@ -247,7 +247,7 @@ void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl) percpu_counter_set(&pl->events, 0); pl->period = global_period; - spin_unlock_irqrestore(&pl->lock, flags); + raw_spin_unlock_irqrestore(&pl->lock, flags); } /* @@ -324,7 +324,7 @@ void prop_fraction_percpu(struct prop_descriptor *pd, int 
prop_local_init_single(struct prop_local_single *pl) { - spin_lock_init(&pl->lock); + raw_spin_lock_init(&pl->lock); pl->shift = 0; pl->period = 0; pl->events = 0; @@ -356,7 +356,7 @@ void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl) if (pl->period == global_period) return; - spin_lock_irqsave(&pl->lock, flags); + raw_spin_lock_irqsave(&pl->lock, flags); prop_adjust_shift(&pl->shift, &pl->period, pg->shift); /* * For each missed period, we half the local counter. @@ -367,7 +367,7 @@ void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl) else pl->events = 0; pl->period = global_period; - spin_unlock_irqrestore(&pl->lock, flags); + raw_spin_unlock_irqrestore(&pl->lock, flags); } /* diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 92cdd99..bc1e61b 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -157,12 +157,14 @@ radix_tree_node_alloc(struct radix_tree_root *root) * succeed in getting a node here (and never reach * kmem_cache_alloc) */ + rtp = &get_cpu_var(radix_tree_preloads); rtp = &__get_cpu_var(radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; rtp->nodes[rtp->nr - 1] = NULL; rtp->nr--; } + put_cpu_var(radix_tree_preloads); } if (ret == NULL) ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); @@ -195,6 +197,8 @@ radix_tree_node_free(struct radix_tree_node *node) call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } +#ifndef CONFIG_PREEMPT_RT + /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. On @@ -230,6 +234,8 @@ out: } EXPORT_SYMBOL(radix_tree_preload); +#endif + /* * Return the maximum key which can be store into a * radix tree with height HEIGHT. 
diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 09f5ce1..39588b3 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -34,7 +34,7 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) * in addition to the one that will be printed by * the entity that is holding the lock already: */ - if (!spin_trylock_irqsave(&rs->lock, flags)) + if (!raw_spin_trylock_irqsave(&rs->lock, flags)) return 1; if (!rs->begin) @@ -55,7 +55,7 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) rs->missed++; ret = 0; } - spin_unlock_irqrestore(&rs->lock, flags); + raw_spin_unlock_irqrestore(&rs->lock, flags); return ret; } diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c index ccf95bf..4010d32 100644 --- a/lib/rwsem-spinlock.c +++ b/lib/rwsem-spinlock.c @@ -17,24 +17,24 @@ struct rwsem_waiter { #define RWSEM_WAITING_FOR_WRITE 0x00000002 }; -int rwsem_is_locked(struct rw_semaphore *sem) +int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) { int ret = 1; unsigned long flags; - if (spin_trylock_irqsave(&sem->wait_lock, flags)) { + if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { ret = (sem->activity != 0); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } return ret; } -EXPORT_SYMBOL(rwsem_is_locked); +EXPORT_SYMBOL(anon_rwsem_is_locked); /* * initialise the semaphore */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) +void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -44,10 +44,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, lockdep_init_map(&sem->dep_map, name, key, 0); #endif sem->activity = 0; - spin_lock_init(&sem->wait_lock); + raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } -EXPORT_SYMBOL(__init_rwsem); +EXPORT_SYMBOL(__init_anon_rwsem); /* * handle the lock release when processes blocked on it that can 
now run @@ -58,8 +58,8 @@ EXPORT_SYMBOL(__init_rwsem); * - woken process blocks are discarded from the list after having task zeroed * - writers are only woken if wakewrite is non-zero */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) +static inline struct rw_anon_semaphore * +__rwsem_do_wake(struct rw_anon_semaphore *sem, int wakewrite) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -117,8 +117,8 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) /* * wake a single writer */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) +static inline struct rw_anon_semaphore * +__rwsem_wake_one_writer(struct rw_anon_semaphore *sem) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -139,17 +139,17 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem) /* * get a read lock on the semaphore */ -void __sched __down_read(struct rw_semaphore *sem) +void __sched __down_read(struct rw_anon_semaphore *sem) { struct rwsem_waiter waiter; struct task_struct *tsk; - spin_lock_irq(&sem->wait_lock); + raw_spin_lock_irq(&sem->wait_lock); if (sem->activity >= 0 && list_empty(&sem->wait_list)) { /* granted */ sem->activity++; - spin_unlock_irq(&sem->wait_lock); + raw_spin_unlock_irq(&sem->wait_lock); goto out; } @@ -164,7 +164,7 @@ void __sched __down_read(struct rw_semaphore *sem) list_add_tail(&waiter.list, &sem->wait_list); /* we don't need to touch the semaphore struct anymore */ - spin_unlock_irq(&sem->wait_lock); + raw_spin_unlock_irq(&sem->wait_lock); /* wait to be given the lock */ for (;;) { @@ -182,13 +182,13 @@ void __sched __down_read(struct rw_semaphore *sem) /* * trylock for reading -- returns 1 if successful, 0 if contention */ -int __down_read_trylock(struct rw_semaphore *sem) +int __down_read_trylock(struct rw_anon_semaphore *sem) { unsigned long flags; int ret = 0; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); if 
(sem->activity >= 0 && list_empty(&sem->wait_list)) { /* granted */ @@ -196,7 +196,7 @@ int __down_read_trylock(struct rw_semaphore *sem) ret = 1; } - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return ret; } @@ -205,17 +205,17 @@ int __down_read_trylock(struct rw_semaphore *sem) * get a write lock on the semaphore * - we increment the waiting count anyway to indicate an exclusive lock */ -void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +void __sched __down_write_nested(struct rw_anon_semaphore *sem, int subclass) { struct rwsem_waiter waiter; struct task_struct *tsk; - spin_lock_irq(&sem->wait_lock); + raw_spin_lock_irq(&sem->wait_lock); if (sem->activity == 0 && list_empty(&sem->wait_list)) { /* granted */ sem->activity = -1; - spin_unlock_irq(&sem->wait_lock); + raw_spin_unlock_irq(&sem->wait_lock); goto out; } @@ -230,7 +230,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) list_add_tail(&waiter.list, &sem->wait_list); /* we don't need to touch the semaphore struct anymore */ - spin_unlock_irq(&sem->wait_lock); + raw_spin_unlock_irq(&sem->wait_lock); /* wait to be given the lock */ for (;;) { @@ -245,7 +245,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) ; } -void __sched __down_write(struct rw_semaphore *sem) +void __sched __down_write(struct rw_anon_semaphore *sem) { __down_write_nested(sem, 0); } @@ -253,12 +253,12 @@ void __sched __down_write(struct rw_semaphore *sem) /* * trylock for writing -- returns 1 if successful, 0 if contention */ -int __down_write_trylock(struct rw_semaphore *sem) +int __down_write_trylock(struct rw_anon_semaphore *sem) { unsigned long flags; int ret = 0; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); if (sem->activity == 0 && list_empty(&sem->wait_list)) { /* granted */ @@ -266,7 +266,7 @@ int __down_write_trylock(struct rw_semaphore *sem) ret = 
1; } - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return ret; } @@ -274,48 +274,47 @@ int __down_write_trylock(struct rw_semaphore *sem) /* * release a read lock on the semaphore */ -void __up_read(struct rw_semaphore *sem) +void __up_read(struct rw_anon_semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); if (--sem->activity == 0 && !list_empty(&sem->wait_list)) sem = __rwsem_wake_one_writer(sem); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } /* * release a write lock on the semaphore */ -void __up_write(struct rw_semaphore *sem) +void __up_write(struct rw_anon_semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); sem->activity = 0; if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, 1); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } /* * downgrade a write lock into a read lock * - just wake up any readers at the front of the queue */ -void __downgrade_write(struct rw_semaphore *sem) +void __downgrade_write(struct rw_anon_semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); sem->activity = 1; if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, 0); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } - diff --git a/lib/rwsem.c b/lib/rwsem.c index 3e3365e..47f5a75 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -11,8 +11,8 @@ /* * Initialize an rwsem: */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) +void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ 
-22,11 +22,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, lockdep_init_map(&sem->dep_map, name, key, 0); #endif sem->count = RWSEM_UNLOCKED_VALUE; - spin_lock_init(&sem->wait_lock); + raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } - -EXPORT_SYMBOL(__init_rwsem); +EXPORT_SYMBOL(__init_anon_rwsem); struct rwsem_waiter { struct list_head list; @@ -46,8 +45,8 @@ struct rwsem_waiter { * - woken process blocks are discarded from the list after having task zeroed * - writers are only woken if downgrading is false */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int downgrading) +static inline struct rw_anon_semaphore * +__rwsem_do_wake(struct rw_anon_semaphore *sem, int downgrading) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -146,9 +145,9 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading) /* * wait for a lock to be granted */ -static struct rw_semaphore __sched * -rwsem_down_failed_common(struct rw_semaphore *sem, - struct rwsem_waiter *waiter, signed long adjustment) +static struct rw_anon_semaphore __sched * +rwsem_down_failed_common(struct rw_anon_semaphore *sem, + struct rwsem_waiter *waiter, signed long adjustment) { struct task_struct *tsk = current; signed long count; @@ -156,7 +155,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem, set_task_state(tsk, TASK_UNINTERRUPTIBLE); /* set up my own style of waitqueue */ - spin_lock_irq(&sem->wait_lock); + raw_spin_lock_irq(&sem->wait_lock); waiter->task = tsk; get_task_struct(tsk); @@ -169,7 +168,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem, if (!(count & RWSEM_ACTIVE_MASK)) sem = __rwsem_do_wake(sem, 0); - spin_unlock_irq(&sem->wait_lock); + raw_spin_unlock_irq(&sem->wait_lock); /* wait to be given the lock */ for (;;) { @@ -187,8 +186,8 @@ rwsem_down_failed_common(struct rw_semaphore *sem, /* * wait for the read lock to be granted */ -asmregparm struct rw_semaphore __sched * 
-rwsem_down_read_failed(struct rw_semaphore *sem) +asmregparm struct rw_anon_semaphore __sched * +rwsem_down_read_failed(struct rw_anon_semaphore *sem) { struct rwsem_waiter waiter; @@ -201,8 +200,8 @@ rwsem_down_read_failed(struct rw_semaphore *sem) /* * wait for the write lock to be granted */ -asmregparm struct rw_semaphore __sched * -rwsem_down_write_failed(struct rw_semaphore *sem) +asmregparm struct rw_anon_semaphore __sched * +rwsem_down_write_failed(struct rw_anon_semaphore *sem) { struct rwsem_waiter waiter; @@ -216,17 +215,17 @@ rwsem_down_write_failed(struct rw_semaphore *sem) * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ -asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +asmregparm struct rw_anon_semaphore *rwsem_wake(struct rw_anon_semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, 0); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return sem; } @@ -236,17 +235,18 @@ asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) * - caller incremented waiting part of count and discovered it still negative * - just wake up any readers at the front of the queue */ -asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +asmregparm struct rw_anon_semaphore * +rwsem_downgrade_wake(struct rw_anon_semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, 1); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return sem; } diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 
0d475d8..e6dcd3b 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/scatterlist.h> #include <linux/highmem.h> +#include <linux/interrupt.h> /** * sg_next - return the next scatterlist entry in a list @@ -399,7 +400,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter) flush_kernel_dcache_page(miter->page); if (miter->__flags & SG_MITER_ATOMIC) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); kunmap_atomic(miter->addr, KM_BIO_SRC_IRQ); } else kunmap(miter->page); @@ -439,7 +440,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, sg_miter_start(&miter, sgl, nents, sg_flags); - local_irq_save(flags); + local_irq_save_nort(flags); while (sg_miter_next(&miter) && offset < buflen) { unsigned int len; @@ -456,7 +457,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, sg_miter_stop(&miter); - local_irq_restore(flags); + local_irq_restore_nort(flags); return offset; } diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c index 4755b98..f65f7cd 100644 --- a/lib/spinlock_debug.c +++ b/lib/spinlock_debug.c @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, EXPORT_SYMBOL(__raw_spin_lock_init); +#ifndef CONFIG_PREEMPT_RT void __rwlock_init(rwlock_t *lock, const char *name, struct lock_class_key *key) { @@ -46,8 +47,8 @@ void __rwlock_init(rwlock_t *lock, const char *name, lock->owner = SPINLOCK_OWNER_INIT; lock->owner_cpu = -1; } - EXPORT_SYMBOL(__rwlock_init); +#endif static void spin_bug(raw_spinlock_t *lock, const char *msg) { @@ -154,6 +155,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock) arch_spin_unlock(&lock->raw_lock); } +#ifndef CONFIG_PREEMPT_RT static void rwlock_bug(rwlock_t *lock, const char *msg) { if (!debug_locks_off()) @@ -295,3 +297,5 @@ void do_raw_write_unlock(rwlock_t *lock) debug_write_unlock(lock); arch_write_unlock(&lock->raw_lock); } + +#endif diff --git a/mm/bounce.c b/mm/bounce.c 
index a2b76a5..4a91eed 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -13,6 +13,7 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> +#include <linux/interrupt.h> #include <asm/tlbflush.h> #include <trace/events/block.h> @@ -49,11 +50,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) unsigned long flags; unsigned char *vto; - local_irq_save(flags); + local_irq_save_nort(flags); vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); memcpy(vto + to->bv_offset, vfrom, to->bv_len); kunmap_atomic(vto, KM_BOUNCE_READ); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #else /* CONFIG_HIGHMEM */ diff --git a/mm/filemap.c b/mm/filemap.c index 698ea80..01c2711 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1868,7 +1868,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, char *kaddr; size_t copied; - BUG_ON(!in_atomic()); +// BUG_ON(!in_atomic()); kaddr = kmap_atomic(page, KM_USER0); if (likely(i->nr_segs == 1)) { int left; diff --git a/mm/highmem.c b/mm/highmem.c index 9c1e627..446b75c 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -14,6 +14,11 @@ * based on Linus' idea. * * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + * + * Largely rewritten to get rid of all global locks + * + * Copyright (C) 2006 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * */ #include <linux/mm.h> @@ -26,18 +31,15 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> +#include <linux/hardirq.h> + #include <asm/tlbflush.h> +#include <asm/pgtable.h> -/* - * Virtual_count is not a pure "count". - * 0 means that it is not mapped, and has not been mapped - * since a TLB flush - it is usable. - * 1 means that there are no users, but it has been mapped - * since the last TLB flush - so we can't use it. - * n means that there are (n-1) current users of it. 
- */ #ifdef CONFIG_HIGHMEM +static int __set_page_address(struct page *page, void *virtual, int pos); + unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); @@ -58,13 +60,21 @@ unsigned int nr_free_highpages (void) return pages; } -static int pkmap_count[LAST_PKMAP]; -static unsigned int last_pkmap_nr; -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); +/* + * count is not a pure "count". + * 0 means its owned exclusively by someone + * 1 means its free for use - either mapped or not. + * n means that there are (n-1) current users of it. + */ +static atomic_t pkmap_count[LAST_PKMAP]; +static atomic_t pkmap_hand; +static atomic_t pkmap_free; +static atomic_t pkmap_users; pte_t * pkmap_page_table; -static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); +static DECLARE_WAIT_QUEUE_HEAD(pkmap_wait); + /* * Most architectures have no use for kmap_high_get(), so let's abstract @@ -85,131 +95,261 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); do { spin_unlock(&kmap_lock); (void)(flags); } while (0) #endif -static void flush_all_zero_pkmaps(void) +/* + * Try to free a given kmap slot. 
+ * + * Returns: + * -1 - in use + * 0 - free, no TLB flush needed + * 1 - free, needs TLB flush + */ +static int pkmap_try_free(int pos) { - int i; - int need_flush = 0; + if (atomic_cmpxchg(&pkmap_count[pos], 1, 0) != 1) + return -1; + atomic_dec(&pkmap_free); + /* + * TODO: add a young bit to make it CLOCK + */ + if (!pte_none(pkmap_page_table[pos])) { + struct page *page = pte_page(pkmap_page_table[pos]); + unsigned long addr = PKMAP_ADDR(pos); + pte_t *ptep = &pkmap_page_table[pos]; + + VM_BUG_ON(addr != (unsigned long)page_address(page)); - flush_cache_kmaps(); + if (!__set_page_address(page, NULL, pos)) + BUG(); + flush_kernel_dcache_page(page); + pte_clear(&init_mm, addr, ptep); + + return 1; + } + + return 0; +} + +static inline void pkmap_put(atomic_t *counter) +{ + switch (atomic_dec_return(counter)) { + case 0: + BUG(); + + case 1: + atomic_inc(&pkmap_free); + wake_up(&pkmap_wait); + } +} + +#define TLB_BATCH 32 + +static int pkmap_get_free(void) +{ + int i, pos, flush; + +restart: for (i = 0; i < LAST_PKMAP; i++) { - struct page *page; + pos = atomic_inc_return(&pkmap_hand) & LAST_PKMAP_MASK; + flush = pkmap_try_free(pos); + if (flush >= 0) + goto got_one; + } + + atomic_dec(&pkmap_free); + /* + * wait for somebody else to unmap their entries + */ + if (likely(!in_interrupt())) + wait_event(pkmap_wait, atomic_read(&pkmap_free) != 0); + + goto restart; + +got_one: + if (flush) { +#if 0 + flush_tlb_kernel_range(PKMAP_ADDR(pos), PKMAP_ADDR(pos+1)); +#else + int pos2 = (pos + 1) & LAST_PKMAP_MASK; + int nr; + int entries[TLB_BATCH]; /* - * zero means we don't have anything to do, - * >1 means that it is still in use. Only - * a count of 1 means that it is free but - * needs to be unmapped + * For those architectures that cannot help but flush the + * whole TLB, flush some more entries to make it worthwhile. + * Scan ahead of the hand to minimise search distances. 
*/ - if (pkmap_count[i] != 1) - continue; - pkmap_count[i] = 0; + for (i = 0, nr = 0; i < LAST_PKMAP && nr < TLB_BATCH; + i++, pos2 = (pos2 + 1) & LAST_PKMAP_MASK) { + + flush = pkmap_try_free(pos2); + if (flush < 0) + continue; + + if (!flush) { + atomic_t *counter = &pkmap_count[pos2]; + VM_BUG_ON(atomic_read(counter) != 0); + atomic_set(counter, 2); + pkmap_put(counter); + } else + entries[nr++] = pos2; + } + flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); - /* sanity check */ - BUG_ON(pte_none(pkmap_page_table[i])); + for (i = 0; i < nr; i++) { + atomic_t *counter = &pkmap_count[entries[i]]; + VM_BUG_ON(atomic_read(counter) != 0); + atomic_set(counter, 2); + pkmap_put(counter); + } +#endif + } + return pos; +} + +static unsigned long pkmap_insert(struct page *page) +{ + int pos = pkmap_get_free(); + unsigned long vaddr = PKMAP_ADDR(pos); + pte_t *ptep = &pkmap_page_table[pos]; + pte_t entry = mk_pte(page, kmap_prot); + atomic_t *counter = &pkmap_count[pos]; + VM_BUG_ON(atomic_read(counter) != 0); + + set_pte_at(&init_mm, vaddr, ptep, entry); + if (unlikely(!__set_page_address(page, (void *)vaddr, pos))) { /* - * Don't need an atomic fetch-and-clear op here; - * no-one has the page mapped, and cannot get at - * its virtual address (and hence PTE) without first - * getting the kmap_lock (which is held here). - * So no dangers, even with speculative execution. + * concurrent pkmap_inserts for this page - + * the other won the race, release this entry. + * + * we can still clear the pte without a tlb flush since + * it couldn't have been used yet. 
*/ - page = pte_page(pkmap_page_table[i]); - pte_clear(&init_mm, (unsigned long)page_address(page), - &pkmap_page_table[i]); + pte_clear(&init_mm, vaddr, ptep); + VM_BUG_ON(atomic_read(counter) != 0); + atomic_set(counter, 2); + pkmap_put(counter); + vaddr = 0; + } else + atomic_set(counter, 2); - set_page_address(page, NULL); - need_flush = 1; - } - if (need_flush) - flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); + return vaddr; } -/** - * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings +/* + * Flush all unused kmap mappings in order to remove stray mappings. */ void kmap_flush_unused(void) { - lock_kmap(); - flush_all_zero_pkmaps(); - unlock_kmap(); + WARN_ON_ONCE(1); } -static inline unsigned long map_new_virtual(struct page *page) +/* + * Avoid starvation deadlock by limiting the number of tasks that can obtain a + * kmap to (LAST_PKMAP - KM_TYPE_NR*NR_CPUS)/2. + */ +static void kmap_account(void) { - unsigned long vaddr; - int count; - -start: - count = LAST_PKMAP; - /* Find an empty entry */ - for (;;) { - last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; - if (!last_pkmap_nr) { - flush_all_zero_pkmaps(); - count = LAST_PKMAP; - } - if (!pkmap_count[last_pkmap_nr]) - break; /* Found a usable entry */ - if (--count) - continue; + int weight; +#ifndef CONFIG_PREEMPT_RT + if (in_interrupt()) { + /* irqs can always get them */ + weight = -1; + } else +#endif + if (current->flags & PF_KMAP) { + current->flags &= ~PF_KMAP; + /* we already accounted the second */ + weight = 0; + } else { + /* mark 1, account 2 */ + current->flags |= PF_KMAP; + weight = 2; + } + + if (weight > 0) { /* - * Sleep for somebody else to unmap their entries + * reserve KM_TYPE_NR maps per CPU for interrupt context */ - { - DECLARE_WAITQUEUE(wait, current); - - __set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&pkmap_map_wait, &wait); - unlock_kmap(); - schedule(); - remove_wait_queue(&pkmap_map_wait, &wait); - lock_kmap(); 
- - /* Somebody else might have mapped it while we slept */ - if (page_address(page)) - return (unsigned long)page_address(page); - - /* Re-start */ - goto start; + const int target = LAST_PKMAP +#ifndef CONFIG_PREEMPT_RT + - KM_TYPE_NR*NR_CPUS +#endif + ; + +again: + wait_event(pkmap_wait, + atomic_read(&pkmap_users) + weight <= target); + + if (atomic_add_return(weight, &pkmap_users) > target) { + atomic_sub(weight, &pkmap_users); + goto again; } } - vaddr = PKMAP_ADDR(last_pkmap_nr); - set_pte_at(&init_mm, vaddr, - &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); +} - pkmap_count[last_pkmap_nr] = 1; - set_page_address(page, (void *)vaddr); +static void kunmap_account(void) +{ + int weight; - return vaddr; +#ifndef CONFIG_PREEMPT_RT + if (in_irq()) { + weight = -1; + } else +#endif + if (current->flags & PF_KMAP) { + /* there was only 1 kmap, un-account both */ + current->flags &= ~PF_KMAP; + weight = 2; + } else { + /* there were two kmaps, un-account per kunmap */ + weight = 1; + } + + if (weight > 0) + atomic_sub(weight, &pkmap_users); + wake_up(&pkmap_wait); } -/** - * kmap_high - map a highmem page into memory - * @page: &struct page to map - * - * Returns the page's virtual memory address. - * - * We cannot call this from interrupts, as it may block. - */ void *kmap_high(struct page *page) { unsigned long vaddr; - /* - * For highmem pages, we can't trust "virtual" until - * after we have the lock. - */ - lock_kmap(); + + kmap_account(); +again: vaddr = (unsigned long)page_address(page); + if (vaddr) { + atomic_t *counter = &pkmap_count[PKMAP_NR(vaddr)]; + if (atomic_inc_not_zero(counter)) { + /* + * atomic_inc_not_zero implies a (memory) barrier on success + * so page address will be reloaded. + */ + unsigned long vaddr2 = (unsigned long)page_address(page); + if (likely(vaddr == vaddr2)) + return (void *)vaddr; + + /* + * Oops, we got someone else. 
+ * + * This can happen if we get preempted after + * page_address() and before atomic_inc_not_zero() + * and during that preemption this slot is freed and + * reused. + */ + pkmap_put(counter); + goto again; + } + } + + vaddr = pkmap_insert(page); if (!vaddr) - vaddr = map_new_virtual(page); - pkmap_count[PKMAP_NR(vaddr)]++; - BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2); - unlock_kmap(); - return (void*) vaddr; + goto again; + + return (void *)vaddr; } EXPORT_SYMBOL(kmap_high); @@ -240,51 +380,12 @@ void *kmap_high_get(struct page *page) } #endif -/** - * kunmap_high - map a highmem page into memory - * @page: &struct page to unmap - * - * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called - * only from user context. - */ -void kunmap_high(struct page *page) + void kunmap_high(struct page *page) { - unsigned long vaddr; - unsigned long nr; - unsigned long flags; - int need_wakeup; - - lock_kmap_any(flags); - vaddr = (unsigned long)page_address(page); + unsigned long vaddr = (unsigned long)page_address(page); BUG_ON(!vaddr); - nr = PKMAP_NR(vaddr); - - /* - * A count must never go down to zero - * without a TLB flush! - */ - need_wakeup = 0; - switch (--pkmap_count[nr]) { - case 0: - BUG(); - case 1: - /* - * Avoid an unnecessary wake_up() function call. - * The common case is pkmap_count[] == 1, but - * no waiters. - * The tasks queued in the wait-queue are guarded - * by both the lock in the wait-queue-head and by - * the kmap_lock. As the kmap_lock is held here, - * no need for the wait-queue-head's lock. Simply - * test if the queue is empty. 
- */ - need_wakeup = waitqueue_active(&pkmap_map_wait); - } - unlock_kmap_any(flags); - - /* do wake-up, if needed, race-free outside of the spin lock */ - if (need_wakeup) - wake_up(&pkmap_map_wait); + pkmap_put(&pkmap_count[PKMAP_NR(vaddr)]); + kunmap_account(); } EXPORT_SYMBOL(kunmap_high); @@ -295,19 +396,13 @@ EXPORT_SYMBOL(kunmap_high); #define PA_HASH_ORDER 7 /* - * Describes one page->virtual association + * Describes one page->virtual address association. */ -struct page_address_map { +static struct page_address_map { struct page *page; void *virtual; struct list_head list; -}; - -/* - * page_address_map freelist, allocated from page_address_maps. - */ -static struct list_head page_address_pool; /* freelist */ -static spinlock_t pool_lock; /* protects page_address_pool */ +} page_address_maps[LAST_PKMAP]; /* * Hash table bucket @@ -328,29 +423,37 @@ static struct page_address_slot *page_slot(struct page *page) * * Returns the page's virtual address. */ -void *page_address(struct page *page) -{ - unsigned long flags; - void *ret; - struct page_address_slot *pas; - if (!PageHighMem(page)) - return lowmem_page_address(page); +static void *__page_address(struct page_address_slot *pas, struct page *page) +{ + void *ret = NULL; - pas = page_slot(page); - ret = NULL; - spin_lock_irqsave(&pas->lock, flags); if (!list_empty(&pas->lh)) { struct page_address_map *pam; list_for_each_entry(pam, &pas->lh, list) { if (pam->page == page) { ret = pam->virtual; - goto done; + break; } } } -done: + + return ret; +} + +void *page_address(struct page *page) +{ + unsigned long flags; + void *ret; + struct page_address_slot *pas; + + if (!PageHighMem(page)) + return lowmem_page_address(page); + + pas = page_slot(page); + spin_lock_irqsave(&pas->lock, flags); + ret = __page_address(pas, page); spin_unlock_irqrestore(&pas->lock, flags); return ret; } @@ -362,62 +465,90 @@ EXPORT_SYMBOL(page_address); * @page: &struct page to set * @virtual: virtual address to use */ -void 
set_page_address(struct page *page, void *virtual) +static int __set_page_address(struct page *page, void *virtual, int pos) { + int ret = 0; unsigned long flags; struct page_address_slot *pas; struct page_address_map *pam; - BUG_ON(!PageHighMem(page)); + VM_BUG_ON(!PageHighMem(page)); + VM_BUG_ON(atomic_read(&pkmap_count[pos]) != 0); + VM_BUG_ON(pos < 0 || pos >= LAST_PKMAP); pas = page_slot(page); - if (virtual) { /* Add */ - BUG_ON(list_empty(&page_address_pool)); - - spin_lock_irqsave(&pool_lock, flags); - pam = list_entry(page_address_pool.next, - struct page_address_map, list); - list_del(&pam->list); - spin_unlock_irqrestore(&pool_lock, flags); - - pam->page = page; - pam->virtual = virtual; - - spin_lock_irqsave(&pas->lock, flags); - list_add_tail(&pam->list, &pas->lh); - spin_unlock_irqrestore(&pas->lock, flags); - } else { /* Remove */ - spin_lock_irqsave(&pas->lock, flags); - list_for_each_entry(pam, &pas->lh, list) { - if (pam->page == page) { - list_del(&pam->list); - spin_unlock_irqrestore(&pas->lock, flags); - spin_lock_irqsave(&pool_lock, flags); - list_add_tail(&pam->list, &page_address_pool); - spin_unlock_irqrestore(&pool_lock, flags); - goto done; - } + pam = &page_address_maps[pos]; + + spin_lock_irqsave(&pas->lock, flags); + if (virtual) { /* add */ + VM_BUG_ON(!list_empty(&pam->list)); + + if (!__page_address(pas, page)) { + pam->page = page; + pam->virtual = virtual; + list_add_tail(&pam->list, &pas->lh); + ret = 1; + } + } else { /* remove */ + if (!list_empty(&pam->list)) { + list_del_init(&pam->list); + ret = 1; } - spin_unlock_irqrestore(&pas->lock, flags); } -done: - return; + spin_unlock_irqrestore(&pas->lock, flags); + + return ret; } -static struct page_address_map page_address_maps[LAST_PKMAP]; +int set_page_address(struct page *page, void *virtual) +{ + /* + * set_page_address is not supposed to be called when using + * hashed virtual addresses. 
+ */ + BUG(); + return 0; +} -void __init page_address_init(void) +void __init __page_address_init(void) { int i; - INIT_LIST_HEAD(&page_address_pool); for (i = 0; i < ARRAY_SIZE(page_address_maps); i++) - list_add(&page_address_maps[i].list, &page_address_pool); + INIT_LIST_HEAD(&page_address_maps[i].list); + for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { INIT_LIST_HEAD(&page_address_htable[i].lh); spin_lock_init(&page_address_htable[i].lock); } - spin_lock_init(&pool_lock); +} + +#elif defined (CONFIG_HIGHMEM) /* HASHED_PAGE_VIRTUAL */ + +static int __set_page_address(struct page *page, void *virtual, int pos) +{ + return set_page_address(page, virtual); +} + +#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ + +#if defined(CONFIG_HIGHMEM) || defined(HASHED_PAGE_VIRTUAL) + +void __init page_address_init(void) +{ +#ifdef CONFIG_HIGHMEM + int i; + + for (i = 0; i < ARRAY_SIZE(pkmap_count); i++) + atomic_set(&pkmap_count[i], 1); + atomic_set(&pkmap_hand, 0); + atomic_set(&pkmap_free, LAST_PKMAP); + atomic_set(&pkmap_users, 0); +#endif + +#ifdef HASHED_PAGE_VIRTUAL + __page_address_init(); +#endif } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 954032b..d3e716f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1258,13 +1258,15 @@ void mem_cgroup_update_file_mapped(struct page *page, int val) goto done; /* - * Preemption is already disabled, we don't need get_cpu() + * Preemption is already disabled, we don't need get_cpu(), + * but that's not true for RT ! 
*/ - cpu = smp_processor_id(); + cpu = get_cpu(); stat = &mem->stat; cpustat = &stat->cpustat[cpu]; __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val); + put_cpu(); done: unlock_page_cgroup(pc); } diff --git a/mm/memory.c b/mm/memory.c index 09e4b1b..aecb745 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -974,10 +974,13 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, return addr; } -#ifdef CONFIG_PREEMPT +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RT) # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) #else -/* No preempt: go for improved straight-line efficiency */ +/* + * No preempt: go for improved straight-line efficiency + * on PREEMPT_RT this is not a critical latency-path. + */ # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) #endif @@ -1007,17 +1010,14 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -unsigned long unmap_vmas(struct mmu_gather **tlbp, +unsigned long unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) { long zap_work = ZAP_BLOCK_SIZE; - unsigned long tlb_start = 0; /* For tlb_finish_mmu */ - int tlb_start_valid = 0; unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? 
details->i_mmap_lock: NULL; - int fullmm = (*tlbp)->fullmm; struct mm_struct *mm = vma->vm_mm; mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); @@ -1038,11 +1038,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, untrack_pfn_vma(vma, 0, 0); while (start != end) { - if (!tlb_start_valid) { - tlb_start = start; - tlb_start_valid = 1; - } - if (unlikely(is_vm_hugetlb_page(vma))) { /* * It is undesirable to test vma->vm_file as it @@ -1063,7 +1058,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, start = end; } else - start = unmap_page_range(*tlbp, vma, + start = unmap_page_range(tlb, vma, start, end, &zap_work, details); if (zap_work > 0) { @@ -1071,19 +1066,13 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, break; } - tlb_finish_mmu(*tlbp, tlb_start, start); - if (need_resched() || (i_mmap_lock && spin_needbreak(i_mmap_lock))) { - if (i_mmap_lock) { - *tlbp = NULL; + if (i_mmap_lock) goto out; - } cond_resched(); } - *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); - tlb_start_valid = 0; zap_work = ZAP_BLOCK_SIZE; } } @@ -1103,16 +1092,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { struct mm_struct *mm = vma->vm_mm; - struct mmu_gather *tlb; + struct mmu_gather tlb; unsigned long end = address + size; unsigned long nr_accounted = 0; lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); + tlb_gather_mmu(&tlb, mm, 0); update_hiwater_rss(mm); end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); - if (tlb) - tlb_finish_mmu(tlb, address, end); + tlb_finish_mmu(&tlb, address, end); return end; } @@ -2491,12 +2479,12 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) return -ENOSYS; mutex_lock(&inode->i_mutex); - down_write(&inode->i_alloc_sem); + anon_down_write(&inode->i_alloc_sem); unmap_mapping_range(mapping, offset, (end - offset), 1); truncate_inode_pages_range(mapping, offset, end); unmap_mapping_range(mapping, offset, (end 
- offset), 1); inode->i_op->truncate_range(inode, offset, end); - up_write(&inode->i_alloc_sem); + anon_up_write(&inode->i_alloc_sem); mutex_unlock(&inode->i_mutex); return 0; @@ -3008,6 +2996,28 @@ unlock: return 0; } +void pagefault_disable(void) +{ + current->pagefault_disabled++; + /* + * make sure to have issued the store before a pagefault + * can hit. + */ + barrier(); +} +EXPORT_SYMBOL(pagefault_disable); + +void pagefault_enable(void) +{ + /* + * make sure to issue those last loads/stores before enabling + * the pagefault handler again. + */ + barrier(); + current->pagefault_disabled--; +} +EXPORT_SYMBOL(pagefault_enable); + /* * By the time we get here, we already hold the mm semaphore */ diff --git a/mm/migrate.c b/mm/migrate.c index 9a0db5b..880bd59 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1002,33 +1002,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, #define DO_PAGES_STAT_CHUNK_NR 16 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; int chunk_status[DO_PAGES_STAT_CHUNK_NR]; - unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR; - int err; - for (i = 0; i < nr_pages; i += chunk_nr) { - if (chunk_nr > nr_pages - i) - chunk_nr = nr_pages - i; + while (nr_pages) { + unsigned long chunk_nr; - err = copy_from_user(chunk_pages, &pages[i], - chunk_nr * sizeof(*chunk_pages)); - if (err) { - err = -EFAULT; - goto out; - } + chunk_nr = nr_pages; + if (chunk_nr > DO_PAGES_STAT_CHUNK_NR) + chunk_nr = DO_PAGES_STAT_CHUNK_NR; + + if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages))) + break; do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); - err = copy_to_user(&status[i], chunk_status, - chunk_nr * sizeof(*chunk_status)); - if (err) { - err = -EFAULT; - goto out; - } - } - err = 0; + if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status))) + break; -out: - return err; + pages += chunk_nr; + status += chunk_nr; + nr_pages -= chunk_nr; + } + return nr_pages ? 
-EFAULT : 0; } /* diff --git a/mm/mmap.c b/mm/mmap.c index ee22989..73ab63e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1820,17 +1820,17 @@ static void unmap_region(struct mm_struct *mm, unsigned long start, unsigned long end) { struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; - struct mmu_gather *tlb; + struct mmu_gather tlb; unsigned long nr_accounted = 0; lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); + tlb_gather_mmu(&tlb, mm, 0); update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, + free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? next->vm_start: 0); - tlb_finish_mmu(tlb, start, end); + tlb_finish_mmu(&tlb, start, end); } /* @@ -2032,10 +2032,16 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) static inline void verify_mm_writelocked(struct mm_struct *mm) { #ifdef CONFIG_DEBUG_VM - if (unlikely(down_read_trylock(&mm->mmap_sem))) { +# ifdef CONFIG_PREEMPT_RT + if (unlikely(!rwsem_is_locked(&mm->mmap_sem))) { WARN_ON(1); - up_read(&mm->mmap_sem); } +# else + if (unlikely(down_read_trylock(&mm->mmap_sem))) { + WARN_ON(1); + up_read(&mm->mmap_sem); + } +# endif #endif } @@ -2143,7 +2149,7 @@ EXPORT_SYMBOL(do_brk); /* Release all mmaps. */ void exit_mmap(struct mm_struct *mm) { - struct mmu_gather *tlb; + struct mmu_gather tlb; struct vm_area_struct *vma; unsigned long nr_accounted = 0; unsigned long end; @@ -2168,14 +2174,14 @@ void exit_mmap(struct mm_struct *mm) lru_add_drain(); flush_cache_mm(mm); - tlb = tlb_gather_mmu(mm, 1); + tlb_gather_mmu(&tlb, mm, 1); /* update_hiwater_rss(mm) here? 
but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); - tlb_finish_mmu(tlb, 0, end); + free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); + tlb_finish_mmu(&tlb, 0, end); /* * Walk the list again, actually closing and freeing it, diff --git a/mm/mmu_context.c b/mm/mmu_context.c index ded9081..fa3d693 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -25,6 +25,7 @@ void use_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); + preempt_disable(); active_mm = tsk->active_mm; if (active_mm != mm) { atomic_inc(&mm->mm_count); @@ -32,6 +33,7 @@ void use_mm(struct mm_struct *mm) } tsk->mm = mm; switch_mm(active_mm, mm, tsk); + preempt_enable(); task_unlock(tsk); if (active_mm != mm) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f52481b..9210595 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -349,7 +349,7 @@ static void dump_tasks(const struct mem_cgroup *mem) continue; } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", - p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, + p->pid, task_uid(p), p->tgid, mm->total_vm, get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj, p->comm); task_unlock(p); @@ -459,6 +459,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, list_for_each_entry(c, &p->children, sibling) { if (c->mm == p->mm) continue; + if (mem && !task_in_mem_cgroup(c, mem)) + continue; if (!oom_kill_task(c)) return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8deb9d0..bde9ea1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -162,6 +162,53 @@ static unsigned long __meminitdata dma_reserve; EXPORT_SYMBOL(movable_zone); #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#ifdef CONFIG_PREEMPT_RT +static DEFINE_PER_CPU_LOCKED(int, pcp_locks); +#endif + +static inline void __lock_cpu_pcp(unsigned long *flags, int 
cpu) +{ +#ifdef CONFIG_PREEMPT_RT + spin_lock(&__get_cpu_lock(pcp_locks, cpu)); + *flags = 0; +#else + local_irq_save(*flags); +#endif +} + +static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu) +{ +#ifdef CONFIG_PREEMPT_RT + (void)get_cpu_var_locked(pcp_locks, this_cpu); + *flags = 0; /* no IRQ state is saved on RT; store through the pointer so the caller's flags is defined */ +#else + local_irq_save(*flags); + *this_cpu = smp_processor_id(); +#endif +} + +static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu) +{ +#ifdef CONFIG_PREEMPT_RT + put_cpu_var_locked(pcp_locks, this_cpu); +#else + local_irq_restore(flags); +#endif +} + +static struct per_cpu_pageset * +get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu) +{ + lock_cpu_pcp(flags, this_cpu); + return zone_pcp(zone, *this_cpu); +} + +static void +put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu) +{ + unlock_cpu_pcp(flags, this_cpu); +} + #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; int nr_online_nodes __read_mostly = 1; @@ -524,16 +571,48 @@ static inline int free_pages_check(struct page *page) * pinned" detection logic.
*/ static void free_pcppages_bulk(struct zone *zone, int count, - struct per_cpu_pages *pcp) + struct per_cpu_pages *pcp) { int migratetype = 0; - int batch_free = 0; + unsigned long flags; - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; __mod_zone_page_state(zone, NR_FREE_PAGES, count); + + for (migratetype =0; migratetype < MIGRATE_PCPTYPES; migratetype++) { + struct list_head *list = &pcp->lists[migratetype]; + + while (!list_empty(list)) { + struct page *page; + + page = list_first_entry(list, struct page, lru); + /* must delete as __free_one_page list manipulates */ + list_del(&page->lru); + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ + __free_one_page(page, zone, 0, page_private(page)); + trace_mm_page_pcpu_drain(page, 0, page_private(page)); +#ifdef CONFIG_PREEMPT_RT + cond_resched_lock(&zone->lock); +#endif + count--; + } + } + WARN_ON(count != 0); + spin_unlock_irqrestore(&zone->lock, flags); +} + +static void isolate_pcp_pages(int count, struct per_cpu_pages *src, + struct per_cpu_pages *dst) +{ + int migratetype, batch_free = 0; + + for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) + INIT_LIST_HEAD(&dst->lists[migratetype]); + migratetype = 0; + while (count) { struct page *page; struct list_head *list; @@ -549,38 +628,36 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free++; if (++migratetype == MIGRATE_PCPTYPES) migratetype = 0; - list = &pcp->lists[migratetype]; + list = &src->lists[migratetype]; } while (list_empty(list)); do { - page = list_entry(list->prev, struct page, lru); + page = list_last_entry(list, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); - /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ - __free_one_page(page, zone, 0, page_private(page)); - trace_mm_page_pcpu_drain(page, 0, page_private(page)); + list_add(&page->lru, 
&dst->lists[migratetype]); } while (--count && --batch_free && !list_empty(list)); } - spin_unlock(&zone->lock); } static void free_one_page(struct zone *zone, struct page *page, int order, int migratetype) { - spin_lock(&zone->lock); + unsigned long flags; + + spin_lock_irqsave(&zone->lock, flags); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); __free_one_page(page, zone, order, migratetype); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; - int i; - int bad = 0; + int i, this_cpu, bad = 0; int wasMlocked = __TestClearPageMlocked(page); kmemcheck_free_shadow(page, order); @@ -598,13 +675,13 @@ static void __free_pages_ok(struct page *page, unsigned int order) arch_free_page(page, order); kernel_map_pages(page, 1 << order, 0); - local_irq_save(flags); + lock_cpu_pcp(&flags, &this_cpu); if (unlikely(wasMlocked)) free_page_mlock(page); - __count_vm_events(PGFREE, 1 << order); + count_vm_events(PGFREE, 1 << order); + unlock_cpu_pcp(flags, this_cpu); free_one_page(page_zone(page), page, order, - get_pageblock_migratetype(page)); - local_irq_restore(flags); + get_pageblock_migratetype(page)); } /* @@ -979,17 +1056,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { + struct per_cpu_pages dst; unsigned long flags; - int to_drain; + int to_drain, this_cpu; - local_irq_save(flags); + lock_cpu_pcp(&flags, &this_cpu); if (pcp->count >= pcp->batch) to_drain = pcp->batch; else to_drain = pcp->count; - free_pcppages_bulk(zone, to_drain, pcp); + isolate_pcp_pages(to_drain, pcp, &dst); pcp->count -= to_drain; - local_irq_restore(flags); + unlock_cpu_pcp(flags, this_cpu); + free_pcppages_bulk(zone, to_drain, &dst); } #endif @@ -1007,15 +1086,23 @@ static void drain_pages(unsigned int cpu) 
for_each_populated_zone(zone) { struct per_cpu_pageset *pset; - struct per_cpu_pages *pcp; + struct per_cpu_pages *pcp, dst; + int count; + __lock_cpu_pcp(&flags, cpu); pset = zone_pcp(zone, cpu); + if (!pset) { + unlock_cpu_pcp(flags, cpu); + WARN_ON(1); + continue; + } pcp = &pset->pcp; - local_irq_save(flags); - free_pcppages_bulk(zone, pcp->count, pcp); + isolate_pcp_pages(pcp->count, pcp, &dst); + count = pcp->count; pcp->count = 0; - local_irq_restore(flags); + unlock_cpu_pcp(flags, cpu); + free_pcppages_bulk(zone, count, &dst); } } @@ -1027,12 +1114,52 @@ void drain_local_pages(void *arg) drain_pages(smp_processor_id()); } +#ifdef CONFIG_PREEMPT_RT +static void drain_local_pages_work(struct work_struct *wrk) +{ + drain_pages(smp_processor_id()); +} +#endif + /* * Spill all the per-cpu pages from all CPUs back into the buddy allocator */ void drain_all_pages(void) { +#ifdef CONFIG_PREEMPT_RT + /* + * HACK!!!!! + * For RT we can't use IPIs to run drain_local_pages, since + * that code will call spin_locks that will now sleep. + * But, schedule_on_each_cpu will call kzalloc, which will + * call page_alloc which was what calls this. + * + * Luckily, there's a condition to get here, and that is if + * the order passed in to alloc_pages is greater than 0 + * (alloced more than a page size). The slabs only allocate + * what is needed, and the allocation made by schedule_on_each_cpu + * does an alloc of "sizeof(void *)*nr_cpu_ids". + * + * So we can safely call schedule_on_each_cpu if that number + * is less than a page. Otherwise don't bother. At least warn of + * this issue. + * + * And yes, this is one big hack. 
Please fix ;-) + */ + if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE) + schedule_on_each_cpu(drain_local_pages_work); + else { + static int once; + if (!once) { + printk(KERN_ERR "Can't drain all CPUS due to possible recursion\n"); + once = 1; + } + drain_local_pages(NULL); + } + +#else on_each_cpu(drain_local_pages, NULL, 1); +#endif } #ifdef CONFIG_HIBERNATION @@ -1077,9 +1204,10 @@ void mark_free_pages(struct zone *zone) static void free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); + struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; unsigned long flags; - int migratetype; + int migratetype, this_cpu, count; int wasMlocked = __TestClearPageMlocked(page); kmemcheck_free_shadow(page, 0); @@ -1096,13 +1224,13 @@ static void free_hot_cold_page(struct page *page, int cold) arch_free_page(page, 0); kernel_map_pages(page, 1, 0); - pcp = &zone_pcp(zone, get_cpu())->pcp; + pset = get_zone_pcp(zone, &flags, &this_cpu); + pcp = &pset->pcp; migratetype = get_pageblock_migratetype(page); set_page_private(page, migratetype); - local_irq_save(flags); if (unlikely(wasMlocked)) free_page_mlock(page); - __count_vm_event(PGFREE); + count_vm_event(PGFREE); /* * We only track unmovable, reclaimable and movable on pcp lists. 
@@ -1125,13 +1253,17 @@ static void free_hot_cold_page(struct page *page, int cold) list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pcppages_bulk(zone, pcp->batch, pcp); + struct per_cpu_pages dst; + + isolate_pcp_pages(pcp->batch, pcp, &dst); pcp->count -= pcp->batch; + count = pcp->batch; + put_zone_pcp(zone, flags, this_cpu); + free_pcppages_bulk(zone, count, &dst); + return; } - out: - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); } void free_hot_page(struct page *page) @@ -1181,17 +1313,17 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); - int cpu; + struct per_cpu_pageset *pset; + int this_cpu; again: - cpu = get_cpu(); + pset = get_zone_pcp(zone, &flags, &this_cpu); + if (likely(order == 0)) { - struct per_cpu_pages *pcp; + struct per_cpu_pages *pcp = &pset->pcp; struct list_head *list; - pcp = &zone_pcp(zone, cpu)->pcp; list = &pcp->lists[migratetype]; - local_irq_save(flags); if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, list, @@ -1221,7 +1353,7 @@ again: */ WARN_ON_ONCE(order > 1); } - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); page = __rmqueue(zone, order, migratetype); spin_unlock(&zone->lock); if (!page) @@ -1231,8 +1363,7 @@ again: __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone); - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); VM_BUG_ON(bad_range(zone, page)); if (prep_new_page(page, order, gfp_flags)) @@ -1240,8 +1371,7 @@ again: return page; failed: - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); return NULL; } @@ -3159,7 +3289,23 @@ static inline void free_zone_pagesets(int cpu) struct zone *zone; for_each_zone(zone) { - struct per_cpu_pageset *pset = zone_pcp(zone, cpu); + unsigned long flags; + struct per_cpu_pageset *pset; 
+ + /* + * On PREEMPT_RT the allocator is preemptible, therefore + * kstopmachine can preempt a process in the middle of an + * allocation, freeing the pset underneath such a process + * isn't a good idea. + * + * Take the per-cpu pcp lock to allow the task to complete + * before we free it. New tasks will be held off by the + * cpu_online() check in get_cpu_var_locked(). + */ + __lock_cpu_pcp(&flags, cpu); + pset = zone_pcp(zone, cpu); + zone_pcp(zone, cpu) = NULL; + unlock_cpu_pcp(flags, cpu); /* Free per_cpu_pageset if it is slab allocated */ if (pset != &boot_pageset[cpu]) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3d535d5..54facf0 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -14,6 +14,7 @@ static void __meminit __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) { pc->flags = 0; + spin_lock_init(&pc->lock); pc->mem_cgroup = NULL; pc->page = pfn_to_page(pfn); INIT_LIST_HEAD(&pc->lru); diff --git a/mm/quicklist.c b/mm/quicklist.c index 6633965..b6fb023 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -19,7 +19,7 @@ #include <linux/module.h> #include <linux/quicklist.h> -DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); +DEFINE_PER_CPU_LOCKED(struct quicklist [CONFIG_NR_QUICK], quicklist); #define FRACTION_OF_NODE_MEM 16 @@ -65,17 +65,14 @@ void quicklist_trim(int nr, void (*dtor)(void *), { long pages_to_free; struct quicklist *q; + int cpu; - q = &get_cpu_var(quicklist)[nr]; + q = &get_cpu_var_locked(quicklist, &cpu)[nr]; if (q->nr_pages > min_pages) { pages_to_free = min_pages_to_free(q, min_pages, max_free); while (pages_to_free > 0) { - /* - * We pass a gfp_t of 0 to quicklist_alloc here - * because we will never call into the page allocator. 
- */ - void *p = quicklist_alloc(nr, 0, NULL); + void *p = __quicklist_alloc(q); if (dtor) dtor(p); @@ -83,7 +80,7 @@ void quicklist_trim(int nr, void (*dtor)(void *), pages_to_free--; } } - put_cpu_var(quicklist); + put_cpu_var_locked(quicklist, cpu); } unsigned long quicklist_total_size(void) @@ -93,7 +90,7 @@ unsigned long quicklist_total_size(void) struct quicklist *ql, *q; for_each_online_cpu(cpu) { - ql = per_cpu(quicklist, cpu); + ql = per_cpu_var_locked(quicklist, cpu); for (q = ql; q < ql + CONFIG_NR_QUICK; q++) count += q->nr_pages; } diff --git a/mm/slab.c b/mm/slab.c index 7451bda..985c67b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -121,6 +121,157 @@ #include <asm/page.h> /* + * On !PREEMPT_RT, raw irq flags are used as a per-CPU locking + * mechanism. + * + * On PREEMPT_RT, we use per-CPU locks for this. That's why the + * calling convention is changed slightly: a new 'cpu' argument + * is passed to 'irq disable/enable' - the PREEMPT_RT code stores + * the CPU number of the lock there. + */ +#ifndef CONFIG_PREEMPT_RT + +# define slab_irq_disable(cpu) \ + do { local_irq_disable(); (cpu) = smp_processor_id(); } while (0) +# define slab_irq_enable(cpu) local_irq_enable() + +static inline void slab_irq_disable_this_rt(int cpu) +{ +} + +static inline void slab_irq_enable_rt(int cpu) +{ +} + +# define slab_irq_save(flags, cpu) \ + do { local_irq_save(flags); (cpu) = smp_processor_id(); } while (0) +# define slab_irq_restore(flags, cpu) local_irq_restore(flags) + +/* + * In the __GFP_WAIT case we enable/disable interrupts on !PREEMPT_RT, + * which has no per-CPU locking effect since we are holding the cache + * lock in that case already.
+ */ +static void slab_irq_enable_GFP_WAIT(gfp_t flags, int *cpu) +{ + if (flags & __GFP_WAIT) + local_irq_enable(); +} + +static void slab_irq_disable_GFP_WAIT(gfp_t flags, int *cpu) +{ + if (flags & __GFP_WAIT) + local_irq_disable(); +} + +#define slab_spin_trylock_irq(lock, cpu) \ + ({ int __l = spin_trylock_irq(lock); if (__l) (cpu) = smp_processor_id(); __l; }) + +# define slab_spin_lock_irq(lock, cpu) \ + do { spin_lock_irq(lock); (cpu) = smp_processor_id(); } while (0) +# define slab_spin_unlock_irq(lock, cpu) spin_unlock_irq(lock) + +# define slab_spin_lock_irqsave(lock, flags, cpu) \ + do { spin_lock_irqsave(lock, flags); (cpu) = smp_processor_id(); } while (0) +# define slab_spin_unlock_irqrestore(lock, flags, cpu) \ + do { spin_unlock_irqrestore(lock, flags); } while (0) + +#else /* CONFIG_PREEMPT_RT */ + +/* + * Instead of serializing the per-cpu state by disabling interrupts we do so + * by a lock. This keeps the code preemptable - albeit at the cost of remote + * memory access when the task does get migrated away. 
+ */ +DEFINE_PER_CPU_LOCKED(struct list_head, slab) = { 0, }; + +static void _slab_irq_disable(int *cpu) +{ + (void)get_cpu_var_locked(slab, cpu); +} + +#define slab_irq_disable(cpu) _slab_irq_disable(&(cpu)) + +static inline void slab_irq_enable(int cpu) +{ + LIST_HEAD(list); + + list_splice_init(&__get_cpu_var_locked(slab, cpu), &list); + put_cpu_var_locked(slab, cpu); + + while (!list_empty(&list)) { + struct page *page = list_first_entry(&list, struct page, lru); + list_del(&page->lru); + __free_pages(page, page->index); + } +} + +static inline void slab_irq_disable_this_rt(int cpu) +{ + spin_lock(&__get_cpu_lock(slab, cpu)); +} + +static inline void slab_irq_enable_rt(int cpu) +{ + LIST_HEAD(list); + + list_splice_init(&__get_cpu_var_locked(slab, cpu), &list); + spin_unlock(&__get_cpu_lock(slab, cpu)); + + while (!list_empty(&list)) { + struct page *page = list_first_entry(&list, struct page, lru); + list_del(&page->lru); + __free_pages(page, page->index); + } +} + +# define slab_irq_save(flags, cpu) \ + do { slab_irq_disable(cpu); (void) (flags); } while (0) +# define slab_irq_restore(flags, cpu) \ + do { slab_irq_enable(cpu); (void) (flags); } while (0) + +/* + * On PREEMPT_RT we have to drop the locks unconditionally to avoid lock + * recursion on the cache_grow()->alloc_slabmgmt() path. 
+ */ +static void slab_irq_enable_GFP_WAIT(gfp_t flags, int *cpu) +{ + slab_irq_enable(*cpu); +} + +static void slab_irq_disable_GFP_WAIT(gfp_t flags, int *cpu) +{ + slab_irq_disable(*cpu); +} + +static inline int _slab_spin_trylock_irq(spinlock_t *lock, int *cpu) +{ + int locked; + + slab_irq_disable(*cpu); + locked = spin_trylock(lock); + if (!locked) + slab_irq_enable(*cpu); + + return locked; +} + +# define slab_spin_trylock_irq(lock, cpu) \ + _slab_spin_trylock_irq((lock), &(cpu)) + +# define slab_spin_lock_irq(lock, cpu) \ + do { slab_irq_disable(cpu); spin_lock(lock); } while (0) +# define slab_spin_unlock_irq(lock, cpu) \ + do { spin_unlock(lock); slab_irq_enable(cpu); } while (0) + +# define slab_spin_lock_irqsave(lock, flags, cpu) \ + do { slab_irq_disable(cpu); spin_lock_irqsave(lock, flags); } while (0) +# define slab_spin_unlock_irqrestore(lock, flags, cpu) \ + do { spin_unlock_irqrestore(lock, flags); slab_irq_enable(cpu); } while (0) + +#endif /* CONFIG_PREEMPT_RT */ + +/* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). 
* @@ -316,7 +467,7 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; static int drain_freelist(struct kmem_cache *cache, struct kmem_list3 *l3, int tofree); static void free_block(struct kmem_cache *cachep, void **objpp, int len, - int node); + int node, int *this_cpu); static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); @@ -699,9 +850,10 @@ static struct list_head cache_chain; static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); -static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) +static inline struct array_cache * +cpu_cache_get(struct kmem_cache *cachep, int this_cpu) { - return cachep->array[smp_processor_id()]; + return cachep->array[this_cpu]; } static inline struct kmem_cache *__find_general_cachep(size_t size, @@ -942,7 +1094,7 @@ static int transfer_objects(struct array_cache *to, #ifndef CONFIG_NUMA #define drain_alien_cache(cachep, alien) do { } while (0) -#define reap_alien(cachep, l3) do { } while (0) +#define reap_alien(cachep, l3) 0 static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { @@ -953,27 +1105,28 @@ static inline void free_alien_cache(struct array_cache **ac_ptr) { } -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +static inline int +cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu) { return 0; } static inline void *alternate_node_alloc(struct kmem_cache *cachep, - gfp_t flags) + gfp_t flags, int *this_cpu) { return NULL; } static inline void *____cache_alloc_node(struct kmem_cache *cachep, - gfp_t flags, int nodeid) + gfp_t flags, int nodeid, int *this_cpu) { return NULL; } #else /* CONFIG_NUMA */ -static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); -static void *alternate_node_alloc(struct kmem_cache *, gfp_t); +static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int, int *); +static void *alternate_node_alloc(struct 
kmem_cache *, gfp_t, int *); static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { @@ -1014,7 +1167,8 @@ static void free_alien_cache(struct array_cache **ac_ptr) } static void __drain_alien_cache(struct kmem_cache *cachep, - struct array_cache *ac, int node) + struct array_cache *ac, int node, + int *this_cpu) { struct kmem_list3 *rl3 = cachep->nodelists[node]; @@ -1028,7 +1182,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, if (rl3->shared) transfer_objects(rl3->shared, ac, ac->limit); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, this_cpu); ac->avail = 0; spin_unlock(&rl3->list_lock); } @@ -1037,38 +1191,43 @@ static void __drain_alien_cache(struct kmem_cache *cachep, /* * Called from cache_reap() to regularly drain alien caches round robin. */ -static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) +static int reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) { int node = __get_cpu_var(slab_reap_node); + int this_cpu; if (l3->alien) { struct array_cache *ac = l3->alien[node]; - if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { - __drain_alien_cache(cachep, ac, node); - spin_unlock_irq(&ac->lock); + if (ac && ac->avail && + slab_spin_trylock_irq(&ac->lock, this_cpu)) { + __drain_alien_cache(cachep, ac, node, &this_cpu); + slab_spin_unlock_irq(&ac->lock, this_cpu); + return 1; } } + return 0; } static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) { - int i = 0; + int this_cpu, i = 0; struct array_cache *ac; unsigned long flags; for_each_online_node(i) { ac = alien[i]; if (ac) { - spin_lock_irqsave(&ac->lock, flags); - __drain_alien_cache(cachep, ac, i); - spin_unlock_irqrestore(&ac->lock, flags); + slab_spin_lock_irqsave(&ac->lock, flags, this_cpu); + __drain_alien_cache(cachep, ac, i, &this_cpu); + slab_spin_unlock_irqrestore(&ac->lock, flags, this_cpu); } } } -static inline int cache_free_alien(struct 
kmem_cache *cachep, void *objp) +static inline int +cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu) { struct slab *slabp = virt_to_slab(objp); int nodeid = slabp->nodeid; @@ -1076,7 +1235,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) struct array_cache *alien = NULL; int node; - node = numa_node_id(); + node = cpu_to_node(*this_cpu); /* * Make sure we are not freeing a object from another node to the array @@ -1092,20 +1251,20 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) spin_lock(&alien->lock); if (unlikely(alien->avail == alien->limit)) { STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, alien, nodeid); + __drain_alien_cache(cachep, alien, nodeid, this_cpu); } alien->entry[alien->avail++] = objp; spin_unlock(&alien->lock); } else { spin_lock(&(cachep->nodelists[nodeid])->list_lock); - free_block(cachep, &objp, 1, nodeid); + free_block(cachep, &objp, 1, nodeid, this_cpu); spin_unlock(&(cachep->nodelists[nodeid])->list_lock); } return 1; } #endif -static void __cpuinit cpuup_canceled(long cpu) +static void __cpuinit cpuup_canceled(int cpu) { struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; @@ -1116,6 +1275,7 @@ static void __cpuinit cpuup_canceled(long cpu) struct array_cache *nc; struct array_cache *shared; struct array_cache **alien; + int orig_cpu = cpu; /* cpu is dead; no one can alloc from it. 
*/ nc = cachep->array[cpu]; @@ -1130,7 +1290,7 @@ static void __cpuinit cpuup_canceled(long cpu) /* Free limit for this kmem_list3 */ l3->free_limit -= cachep->batchcount; if (nc) - free_block(cachep, nc->entry, nc->avail, node); + free_block(cachep, nc->entry, nc->avail, node, &cpu); if (!cpumask_empty(mask)) { spin_unlock_irq(&l3->list_lock); @@ -1140,7 +1300,7 @@ static void __cpuinit cpuup_canceled(long cpu) shared = l3->shared; if (shared) { free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &cpu); l3->shared = NULL; } @@ -1156,6 +1316,7 @@ static void __cpuinit cpuup_canceled(long cpu) } free_array_cache: kfree(nc); + BUG_ON(cpu != orig_cpu); } /* * In the previous loop, all the objects were freed to @@ -1170,7 +1331,7 @@ free_array_cache: } } -static int __cpuinit cpuup_prepare(long cpu) +static int __cpuinit cpuup_prepare(int cpu) { struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; @@ -1280,10 +1441,19 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, long cpu = (long)hcpu; int err = 0; + switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: mutex_lock(&cache_chain_mutex); + /* + * lock/unlock cycle to push any holders away -- no new ones + * can come in due to the cpu still being offline. + * + * XXX -- weird case anyway, can it happen? + */ + slab_irq_disable_this_rt(cpu); + slab_irq_enable_rt(cpu); err = cpuup_prepare(cpu); mutex_unlock(&cache_chain_mutex); break; @@ -1323,10 +1493,14 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: mutex_lock(&cache_chain_mutex); + slab_irq_disable_this_rt(cpu); cpuup_canceled(cpu); + slab_irq_enable_rt(cpu); mutex_unlock(&cache_chain_mutex); break; } + + return err ? 
NOTIFY_BAD : NOTIFY_OK; } @@ -1384,6 +1558,12 @@ void __init kmem_cache_init(void) int order; int node; +#ifdef CONFIG_PREEMPT_RT + for_each_possible_cpu(i) { + INIT_LIST_HEAD(&__get_cpu_var_locked(slab, i)); + } +#endif + if (num_possible_nodes() == 1) use_alien_caches = 0; @@ -1513,32 +1693,34 @@ void __init kmem_cache_init(void) /* 4) Replace the bootstrap head arrays */ { struct array_cache *ptr; + int cpu = smp_processor_id(); ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); - memcpy(ptr, cpu_cache_get(&cache_cache), + BUG_ON(cpu_cache_get(&cache_cache, cpu) != + &initarray_cache.cache); + memcpy(ptr, cpu_cache_get(&cache_cache, cpu), sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - cache_cache.array[smp_processor_id()] = ptr; + cache_cache.array[cpu] = ptr; ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) + BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, cpu) != &initarray_generic.cache); - memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), + memcpy(ptr, + cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, cpu), sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = - ptr; + malloc_sizes[INDEX_AC].cs_cachep->array[cpu] = ptr; } /* 5) Replace the bootstrap kmem_list3's */ { @@ -1656,12 +1838,14 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) /* * Interface to system's page release. 
*/ -static void kmem_freepages(struct kmem_cache *cachep, void *addr) +static void kmem_freepages(struct kmem_cache *cachep, void *addr, int cpu) { unsigned long i = (1 << cachep->gfporder); - struct page *page = virt_to_page(addr); + struct page *page, *basepage = virt_to_page(addr); const unsigned long nr_freed = i; + page = basepage; + kmemcheck_free_shadow(page, cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) @@ -1670,6 +1854,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) else sub_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_freed); + while (i--) { BUG_ON(!PageSlab(page)); __ClearPageSlab(page); @@ -1677,6 +1862,13 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) } if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; + +#ifdef CONFIG_PREEMPT_RT + if (cpu >= 0) { + basepage->index = cachep->gfporder; + list_add(&basepage->lru, &__get_cpu_var_locked(slab, cpu)); + } else +#endif free_pages((unsigned long)addr, cachep->gfporder); } @@ -1685,7 +1877,7 @@ static void kmem_rcu_free(struct rcu_head *head) struct slab_rcu *slab_rcu = (struct slab_rcu *)head; struct kmem_cache *cachep = slab_rcu->cachep; - kmem_freepages(cachep, slab_rcu->addr); + kmem_freepages(cachep, slab_rcu->addr, -1); if (OFF_SLAB(cachep)) kmem_cache_free(cachep->slabp_cache, slab_rcu); } @@ -1705,7 +1897,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, *addr++ = 0x12345678; *addr++ = caller; - *addr++ = smp_processor_id(); + *addr++ = raw_smp_processor_id(); size -= 3 * sizeof(unsigned long); { unsigned long *sptr = &caller; @@ -1895,6 +2087,10 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab } #endif +static void +__cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu); + + /** * slab_destroy - destroy and release all objects in a slab * @cachep: cache pointer being destroyed @@ -1904,7 +2100,8 @@ static void 
slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab * Before calling the slab must have been unlinked from the cache. The * cache-lock is not held/needed. */ -static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) +static void +slab_destroy(struct kmem_cache *cachep, struct slab *slabp, int *this_cpu) { void *addr = slabp->s_mem - slabp->colouroff; @@ -1917,9 +2114,13 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) slab_rcu->addr = addr; call_rcu(&slab_rcu->head, kmem_rcu_free); } else { - kmem_freepages(cachep, addr); - if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->slabp_cache, slabp); + kmem_freepages(cachep, addr, this_cpu ? *this_cpu : -1); + if (OFF_SLAB(cachep)) { + if (this_cpu) + __cache_free(cachep->slabp_cache, slabp, this_cpu); + else + kmem_cache_free(cachep->slabp_cache, slabp); + } } } @@ -2016,6 +2217,8 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) { + int this_cpu; + if (g_cpucache_up == FULL) return enable_cpucache(cachep, gfp); @@ -2059,10 +2262,12 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - cpu_cache_get(cachep)->avail = 0; - cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - cpu_cache_get(cachep)->batchcount = 1; - cpu_cache_get(cachep)->touched = 0; + this_cpu = raw_smp_processor_id(); + + cpu_cache_get(cachep, this_cpu)->avail = 0; + cpu_cache_get(cachep, this_cpu)->limit = BOOT_CPUCACHE_ENTRIES; + cpu_cache_get(cachep, this_cpu)->batchcount = 1; + cpu_cache_get(cachep, this_cpu)->touched = 0; cachep->batchcount = 1; cachep->limit = BOOT_CPUCACHE_ENTRIES; return 0; @@ -2374,19 +2579,19 @@ EXPORT_SYMBOL(kmem_cache_create); #if DEBUG static void check_irq_off(void) { +/* + * On PREEMPT_RT we use locks to protect the per-CPU lists, + * and keep interrupts enabled. 
+ */ +#ifndef CONFIG_PREEMPT_RT BUG_ON(!irqs_disabled()); +#endif } static void check_irq_on(void) { +#ifndef CONFIG_PREEMPT_RT BUG_ON(irqs_disabled()); -} - -static void check_spinlock_acquired(struct kmem_cache *cachep) -{ -#ifdef CONFIG_SMP - check_irq_off(); - assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); #endif } @@ -2401,34 +2606,67 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) #else #define check_irq_off() do { } while(0) #define check_irq_on() do { } while(0) -#define check_spinlock_acquired(x) do { } while(0) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +static int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, struct array_cache *ac, int force, int node); -static void do_drain(void *arg) +static void __do_drain(void *arg, int this_cpu) { struct kmem_cache *cachep = arg; + int node = cpu_to_node(this_cpu); struct array_cache *ac; - int node = numa_node_id(); check_irq_off(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, this_cpu); spin_lock(&cachep->nodelists[node]->list_lock); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, &this_cpu); spin_unlock(&cachep->nodelists[node]->list_lock); ac->avail = 0; } +#ifdef CONFIG_PREEMPT_RT +static void do_drain(void *arg, int this_cpu) +{ + __do_drain(arg, this_cpu); +} +#else +static void do_drain(void *arg) +{ + __do_drain(arg, smp_processor_id()); +} +#endif + +#ifdef CONFIG_PREEMPT_RT +/* + * Execute func() for all CPUs. On PREEMPT_RT we don't actually have + * to run on the remote CPUs - we only have to take their CPU-locks. + * (This is a rare operation, so cacheline bouncing is not an issue.) 
+ */ +static void +slab_on_each_cpu(void (*func)(void *arg, int this_cpu), void *arg) +{ + unsigned int i; + + check_irq_on(); + for_each_online_cpu(i) { + spin_lock(&__get_cpu_lock(slab, i)); + func(arg, i); + spin_unlock(&__get_cpu_lock(slab, i)); + } +} +#else +# define slab_on_each_cpu(func, cachep) on_each_cpu(func, cachep, 1) +#endif + static void drain_cpu_caches(struct kmem_cache *cachep) { struct kmem_list3 *l3; int node; - on_each_cpu(do_drain, cachep, 1); + slab_on_each_cpu(do_drain, cachep); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; @@ -2453,16 +2691,16 @@ static int drain_freelist(struct kmem_cache *cache, struct kmem_list3 *l3, int tofree) { struct list_head *p; - int nr_freed; + int nr_freed, this_cpu; struct slab *slabp; nr_freed = 0; while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); p = l3->slabs_free.prev; if (p == &l3->slabs_free) { - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); goto out; } @@ -2471,13 +2709,9 @@ static int drain_freelist(struct kmem_cache *cache, BUG_ON(slabp->inuse); #endif list_del(&slabp->list); - /* - * Safe to drop the lock. The slab is no longer linked - * to the cache. - */ l3->free_objects -= cache->num; - spin_unlock_irq(&l3->list_lock); - slab_destroy(cache, slabp); + slab_destroy(cache, slabp, &this_cpu); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); nr_freed++; } out: @@ -2741,8 +2975,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. 
*/ -static int cache_grow(struct kmem_cache *cachep, - gfp_t flags, int nodeid, void *objp) +static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid, + void *objp, int *this_cpu) { struct slab *slabp; size_t offset; @@ -2770,8 +3004,7 @@ static int cache_grow(struct kmem_cache *cachep, offset *= cachep->colour_off; - if (local_flags & __GFP_WAIT) - local_irq_enable(); + slab_irq_enable_GFP_WAIT(local_flags, this_cpu); /* * The test for missing atomic flag is performed here, rather than @@ -2800,8 +3033,8 @@ static int cache_grow(struct kmem_cache *cachep, cache_init_objs(cachep, slabp); - if (local_flags & __GFP_WAIT) - local_irq_disable(); + slab_irq_disable_GFP_WAIT(local_flags, this_cpu); + check_irq_off(); spin_lock(&l3->list_lock); @@ -2812,10 +3045,9 @@ static int cache_grow(struct kmem_cache *cachep, spin_unlock(&l3->list_lock); return 1; opps1: - kmem_freepages(cachep, objp); + kmem_freepages(cachep, objp, -1); failed: - if (local_flags & __GFP_WAIT) - local_irq_disable(); + slab_irq_disable_GFP_WAIT(local_flags, this_cpu); return 0; } @@ -2937,7 +3169,8 @@ bad: #define check_slabp(x,y) do { } while(0) #endif -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) +static void * +cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) { int batchcount; struct kmem_list3 *l3; @@ -2947,7 +3180,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) retry: check_irq_off(); node = numa_node_id(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, *this_cpu); batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { /* @@ -2957,7 +3190,7 @@ retry: */ batchcount = BATCHREFILL_LIMIT; } - l3 = cachep->nodelists[node]; + l3 = cachep->nodelists[cpu_to_node(*this_cpu)]; BUG_ON(ac->avail > 0 || !l3); spin_lock(&l3->list_lock); @@ -2980,7 +3213,7 @@ retry: slabp = list_entry(entry, struct slab, list); check_slabp(cachep, slabp); - check_spinlock_acquired(cachep); + 
check_spinlock_acquired_node(cachep, cpu_to_node(*this_cpu)); /* * The slab was either on partial or free list so @@ -2994,8 +3227,9 @@ retry: STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, - node); + ac->entry[ac->avail++] = + slab_get_obj(cachep, slabp, + cpu_to_node(*this_cpu)); } check_slabp(cachep, slabp); @@ -3014,10 +3248,10 @@ alloc_done: if (unlikely(!ac->avail)) { int x; - x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); + x = cache_grow(cachep, flags | GFP_THISNODE, cpu_to_node(*this_cpu), NULL, this_cpu); /* cache_grow can reenable interrupts, then ac could change. */ - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, *this_cpu); if (!x && ac->avail == 0) /* no objects in sight? abort */ return NULL; @@ -3104,26 +3338,27 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) return should_failslab(obj_size(cachep), flags); } -static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) +static inline void * +____cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) { void *objp; struct array_cache *ac; check_irq_off(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, *this_cpu); if (likely(ac->avail)) { STATS_INC_ALLOCHIT(cachep); ac->touched = 1; objp = ac->entry[--ac->avail]; } else { STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags); + objp = cache_alloc_refill(cachep, flags, this_cpu); /* * the 'ac' may be updated by cache_alloc_refill(), * and kmemleak_erase() requires its correct value. */ - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, *this_cpu); } /* * To avoid a false negative, if an object that is in one of the @@ -3142,7 +3377,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) * If we are in_interrupt, then process context, including cpusets and * mempolicy, may not apply and should not be used for allocation policy. 
*/ -static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) +static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags, + int *this_cpu) { int nid_alloc, nid_here; @@ -3154,7 +3390,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); if (nid_alloc != nid_here) - return ____cache_alloc_node(cachep, flags, nid_alloc); + return ____cache_alloc_node(cachep, flags, nid_alloc, this_cpu); return NULL; } @@ -3166,7 +3402,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) * allocator to do its reclaim / fallback magic. We then insert the * slab into the proper nodelist and then allocate from it. */ -static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) +static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu) { struct zonelist *zonelist; gfp_t local_flags; @@ -3194,7 +3430,8 @@ retry: cache->nodelists[nid] && cache->nodelists[nid]->free_objects) { obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + flags | GFP_THISNODE, nid, + this_cpu); if (obj) break; } @@ -3207,20 +3444,21 @@ retry: * We may trigger various forms of reclaim on the allowed * set and go into memory reserves if necessary. 
*/ - if (local_flags & __GFP_WAIT) - local_irq_enable(); + slab_irq_enable_GFP_WAIT(local_flags, this_cpu); + kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, numa_node_id()); - if (local_flags & __GFP_WAIT) - local_irq_disable(); + obj = kmem_getpages(cache, local_flags, cpu_to_node(*this_cpu)); + + slab_irq_disable_GFP_WAIT(local_flags, this_cpu); + if (obj) { /* * Insert into the appropriate per node queues */ nid = page_to_nid(virt_to_page(obj)); - if (cache_grow(cache, flags, nid, obj)) { + if (cache_grow(cache, flags, nid, obj, this_cpu)) { obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + flags | GFP_THISNODE, nid, this_cpu); if (!obj) /* * Another processor may allocate the @@ -3241,7 +3479,7 @@ retry: * A interface to enable slab creation on nodeid */ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, - int nodeid) + int nodeid, int *this_cpu) { struct list_head *entry; struct slab *slabp; @@ -3289,11 +3527,11 @@ retry: must_grow: spin_unlock(&l3->list_lock); - x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); + x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL, this_cpu); if (x) goto retry; - return fallback_alloc(cachep, flags); + return fallback_alloc(cachep, flags, this_cpu); done: return obj; @@ -3316,6 +3554,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, void *caller) { unsigned long save_flags; + int this_cpu, this_node; void *ptr; flags &= gfp_allowed_mask; @@ -3326,32 +3565,33 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, return NULL; cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); + slab_irq_save(save_flags, this_cpu); + this_node = cpu_to_node(this_cpu); if (nodeid == -1) - nodeid = numa_node_id(); + nodeid = this_node; if (unlikely(!cachep->nodelists[nodeid])) { /* Node not bootstrapped yet */ - ptr = fallback_alloc(cachep, flags); + ptr = fallback_alloc(cachep, flags, &this_cpu); 
goto out; } - if (nodeid == numa_node_id()) { + if (nodeid == this_node) { /* * Use the locally cached objects if possible. * However ____cache_alloc does not allow fallback * to other nodes. It may fail while we still have * objects on other nodes available. */ - ptr = ____cache_alloc(cachep, flags); + ptr = ____cache_alloc(cachep, flags, &this_cpu); if (ptr) goto out; } /* ___cache_alloc_node can fall back to other nodes */ - ptr = ____cache_alloc_node(cachep, flags, nodeid); + ptr = ____cache_alloc_node(cachep, flags, nodeid, &this_cpu); out: - local_irq_restore(save_flags); + slab_irq_restore(save_flags, this_cpu); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, flags); @@ -3366,33 +3606,33 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, } static __always_inline void * -__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) +__do_cache_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu) { void *objp; if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { - objp = alternate_node_alloc(cache, flags); + objp = alternate_node_alloc(cache, flags, this_cpu); if (objp) goto out; } - objp = ____cache_alloc(cache, flags); + objp = ____cache_alloc(cache, flags, this_cpu); /* * We may just have run out of memory on the local node. 
* ____cache_alloc_node() knows how to locate memory on other nodes */ - if (!objp) - objp = ____cache_alloc_node(cache, flags, numa_node_id()); - + if (!objp) + objp = ____cache_alloc_node(cache, flags, + cpu_to_node(*this_cpu), this_cpu); out: return objp; } #else static __always_inline void * -__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) { - return ____cache_alloc(cachep, flags); + return ____cache_alloc(cachep, flags, this_cpu); } #endif /* CONFIG_NUMA */ @@ -3401,6 +3641,7 @@ static __always_inline void * __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) { unsigned long save_flags; + int this_cpu; void *objp; flags &= gfp_allowed_mask; @@ -3411,9 +3652,9 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) return NULL; cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); - objp = __do_cache_alloc(cachep, flags); - local_irq_restore(save_flags); + slab_irq_save(save_flags, this_cpu); + objp = __do_cache_alloc(cachep, flags, &this_cpu); + slab_irq_restore(save_flags, this_cpu); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, flags); @@ -3432,7 +3673,7 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) * Caller needs to acquire correct kmem_list's list_lock */ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, - int node) + int node, int *this_cpu) { int i; struct kmem_list3 *l3; @@ -3461,7 +3702,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, * a different cache, refer to comments before * alloc_slabmgmt. 
*/ - slab_destroy(cachep, slabp); + slab_destroy(cachep, slabp, this_cpu); } else { list_add(&slabp->list, &l3->slabs_free); } @@ -3475,11 +3716,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, } } -static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) +static void +cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac, int *this_cpu) { int batchcount; struct kmem_list3 *l3; - int node = numa_node_id(); + int node = cpu_to_node(*this_cpu); batchcount = ac->batchcount; #if DEBUG @@ -3501,7 +3743,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) } } - free_block(cachep, ac->entry, batchcount, node); + free_block(cachep, ac->entry, batchcount, node, this_cpu); free_done: #if STATS { @@ -3530,9 +3772,9 @@ free_done: * Release an obj back to its cache. If the obj has a constructed state, it must * be in this state _before_ it is released. Called with disabled ints. */ -static inline void __cache_free(struct kmem_cache *cachep, void *objp) +static void __cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu) { - struct array_cache *ac = cpu_cache_get(cachep); + struct array_cache *ac = cpu_cache_get(cachep, *this_cpu); check_irq_off(); kmemleak_free_recursive(objp, cachep->flags); @@ -3547,7 +3789,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) * variable to skip the call, which is mostly likely to be present in * the cache. 
*/ - if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) + if (nr_online_nodes > 1 && cache_free_alien(cachep, objp, this_cpu)) return; if (likely(ac->avail < ac->limit)) { @@ -3556,7 +3798,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) return; } else { STATS_INC_FREEMISS(cachep); - cache_flusharray(cachep, ac); + cache_flusharray(cachep, ac, this_cpu); ac->entry[ac->avail++] = objp; } } @@ -3755,13 +3997,14 @@ EXPORT_SYMBOL(__kmalloc); void kmem_cache_free(struct kmem_cache *cachep, void *objp) { unsigned long flags; + int this_cpu; - local_irq_save(flags); + slab_irq_save(flags, this_cpu); debug_check_no_locks_freed(objp, obj_size(cachep)); if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(objp, obj_size(cachep)); - __cache_free(cachep, objp); - local_irq_restore(flags); + __cache_free(cachep, objp, &this_cpu); + slab_irq_restore(flags, this_cpu); trace_kmem_cache_free(_RET_IP_, objp); } @@ -3780,18 +4023,19 @@ void kfree(const void *objp) { struct kmem_cache *c; unsigned long flags; + int this_cpu; trace_kfree(_RET_IP_, objp); if (unlikely(ZERO_OR_NULL_PTR(objp))) return; - local_irq_save(flags); + slab_irq_save(flags, this_cpu); kfree_debugcheck(objp); c = virt_to_cache(objp); debug_check_no_locks_freed(objp, obj_size(c)); debug_check_no_obj_freed(objp, obj_size(c)); - __cache_free(c, (void *)objp); - local_irq_restore(flags); + __cache_free(c, (void *)objp, &this_cpu); + slab_irq_restore(flags, this_cpu); } EXPORT_SYMBOL(kfree); @@ -3812,7 +4056,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name); */ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) { - int node; + int node, this_cpu; struct kmem_list3 *l3; struct array_cache *new_shared; struct array_cache **new_alien = NULL; @@ -3840,11 +4084,11 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) if (l3) { struct array_cache *shared = l3->shared; - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); if (shared) 
free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &this_cpu); l3->shared = new_shared; if (!l3->alien) { @@ -3853,7 +4097,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) } l3->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); kfree(shared); free_alien_cache(new_alien); continue; @@ -3900,24 +4144,36 @@ struct ccupdate_struct { struct array_cache *new[NR_CPUS]; }; -static void do_ccupdate_local(void *info) +static void __do_ccupdate_local(void *info, int this_cpu) { struct ccupdate_struct *new = info; struct array_cache *old; check_irq_off(); - old = cpu_cache_get(new->cachep); + old = cpu_cache_get(new->cachep, this_cpu); + + new->cachep->array[this_cpu] = new->new[this_cpu]; + new->new[this_cpu] = old; +} - new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; - new->new[smp_processor_id()] = old; +#ifdef CONFIG_PREEMPT_RT +static void do_ccupdate_local(void *arg, int this_cpu) +{ + __do_ccupdate_local(arg, this_cpu); +} +#else +static void do_ccupdate_local(void *arg) +{ + __do_ccupdate_local(arg, smp_processor_id()); } +#endif /* Always called with the cache_chain_mutex held */ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { struct ccupdate_struct *new; - int i; + int i, this_cpu; new = kzalloc(sizeof(*new), gfp); if (!new) @@ -3935,7 +4191,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, } new->cachep = cachep; - on_each_cpu(do_ccupdate_local, (void *)new, 1); + slab_on_each_cpu(do_ccupdate_local, (void *)new); check_irq_on(); cachep->batchcount = batchcount; @@ -3946,9 +4202,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, struct array_cache *ccold = new->new[i]; if (!ccold) continue; - spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); - free_block(cachep, ccold->entry, 
ccold->avail, cpu_to_node(i)); - spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); + slab_spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock, + this_cpu); + free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i), + &this_cpu); + slab_spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock, + this_cpu); kfree(ccold); } kfree(new); @@ -4013,29 +4272,31 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) * Drain an array if it contains any elements taking the l3 lock only if * necessary. Note that the l3 listlock also protects the array_cache * if drain_array() is used on the shared array. + * returns non-zero if some work is done */ -void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, - struct array_cache *ac, int force, int node) +int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, + struct array_cache *ac, int force, int node) { - int tofree; + int tofree, this_cpu; if (!ac || !ac->avail) - return; + return 0; if (ac->touched && !force) { ac->touched = 0; } else { - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); if (ac->avail) { tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) tofree = (ac->avail + 1) / 2; - free_block(cachep, ac->entry, tofree, node); + free_block(cachep, ac->entry, tofree, node, &this_cpu); ac->avail -= tofree; memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); } + return 1; } /** @@ -4052,10 +4313,11 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, */ static void cache_reap(struct work_struct *w) { + int this_cpu = smp_processor_id(), node = cpu_to_node(this_cpu); struct kmem_cache *searchp; struct kmem_list3 *l3; - int node = numa_node_id(); struct delayed_work *work = to_delayed_work(w); + int work_done = 0; if (!mutex_trylock(&cache_chain_mutex)) /* Give up. 
Setup the next iteration. */ @@ -4071,9 +4333,12 @@ static void cache_reap(struct work_struct *w) */ l3 = searchp->nodelists[node]; - reap_alien(searchp, l3); + work_done += reap_alien(searchp, l3); + + node = cpu_to_node(this_cpu); - drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); + work_done += drain_array(searchp, l3, + cpu_cache_get(searchp, this_cpu), 0, node); /* * These are racy checks but it does not matter @@ -4084,7 +4349,7 @@ static void cache_reap(struct work_struct *w) l3->next_reap = jiffies + REAPTIMEOUT_LIST3; - drain_array(searchp, l3, l3->shared, 0, node); + work_done += drain_array(searchp, l3, l3->shared, 0, node); if (l3->free_touched) l3->free_touched = 0; @@ -4103,7 +4368,8 @@ next: next_reap_node(); out: /* Set up the next iteration */ - schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); + schedule_delayed_work(work, + round_jiffies_relative((1+!work_done) * REAPTIMEOUT_CPUC)); } #ifdef CONFIG_SLABINFO @@ -4162,7 +4428,7 @@ static int s_show(struct seq_file *m, void *p) unsigned long num_slabs, free_objects = 0, shared_avail = 0; const char *name; char *error = NULL; - int node; + int this_cpu, node; struct kmem_list3 *l3; active_objs = 0; @@ -4173,7 +4439,7 @@ static int s_show(struct seq_file *m, void *p) continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); list_for_each_entry(slabp, &l3->slabs_full, list) { if (slabp->inuse != cachep->num && !error) @@ -4198,7 +4464,7 @@ static int s_show(struct seq_file *m, void *p) if (l3->shared) shared_avail += l3->shared->avail; - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); } num_slabs += active_slabs; num_objs = num_slabs * cachep->num; @@ -4408,7 +4674,7 @@ static int leaks_show(struct seq_file *m, void *p) struct kmem_list3 *l3; const char *name; unsigned long *n = m->private; - int node; + int node, this_cpu; int i; if (!(cachep->flags & SLAB_STORE_USER)) @@ -4426,13 
+4692,13 @@ static int leaks_show(struct seq_file *m, void *p) continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); list_for_each_entry(slabp, &l3->slabs_full, list) handle_slab(n, cachep, slabp); list_for_each_entry(slabp, &l3->slabs_partial, list) handle_slab(n, cachep, slabp); - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); } name = cachep->name; if (n[0] == n[1]) { diff --git a/mm/swap.c b/mm/swap.c index 308e57d..a4de467 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -30,15 +30,93 @@ #include <linux/notifier.h> #include <linux/backing-dev.h> #include <linux/memcontrol.h> +#include <linux/interrupt.h> #include "internal.h" /* How many pages do we try to swap or page in/out together? */ int page_cluster; +#ifdef CONFIG_PREEMPT_RT +/* + * On PREEMPT_RT we don't want to disable preemption for cpu variables. + * We grab a cpu and then use that cpu to lock the variables accordingly. + * + * (On !PREEMPT_RT this turns into normal preempt-off sections, as before.) 
+ */ +static DEFINE_PER_CPU_LOCKED(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); +static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_rotate_pvecs); + +#define swap_get_cpu_var_irq_save(var, flags, cpu) \ + ({ \ + (void)flags; \ + &get_cpu_var_locked(var, &cpu); \ + }) + +#define swap_put_cpu_var_irq_restore(var, flags, cpu) \ + put_cpu_var_locked(var, cpu) + +#define swap_get_cpu_var(var, cpu) \ + &get_cpu_var_locked(var, &cpu) + +#define swap_put_cpu_var(var, cpu) \ + put_cpu_var_locked(var, cpu) + +#define swap_per_cpu_lock(var, cpu) \ + ({ \ + spin_lock(&__get_cpu_lock(var, cpu)); \ + &__get_cpu_var_locked(var, cpu); \ + }) + +#define swap_per_cpu_unlock(var, cpu) \ + spin_unlock(&__get_cpu_lock(var, cpu)) + +#define swap_get_cpu() raw_smp_processor_id() + +#define swap_put_cpu() do { } while (0) + +#define swap_irq_save(flags) do { (void)flags; } while (0) + +#define swap_irq_restore(flags) do { (void)flags; } while (0) + +#else + static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); +#define swap_get_cpu_var_irq_save(var, flags, cpu) \ + ({ \ + (void)cpu; \ + local_irq_save(flags); \ + &__get_cpu_var(var); \ + }) + +#define swap_put_cpu_var_irq_restore(var, flags, cpu) \ + local_irq_restore(flags) + +#define swap_get_cpu_var(var, cpu) \ + &get_cpu_var(var) + +#define swap_put_cpu_var(var, cpu) \ + ({ \ + (void)cpu; \ + put_cpu_var(var); \ + }) + +#define swap_per_cpu_lock(var, cpu) &per_cpu(var, cpu) + +#define swap_per_cpu_unlock(var, cpu) do { } while (0) + +#define swap_get_cpu() get_cpu() + +#define swap_put_cpu() put_cpu() + +#define swap_irq_save(flags) do { (void)flags; } while (0) + +#define swap_irq_restore(flags) do { (void)flags; } while (0) + +#endif + /* * This path almost never happens for VM activity - pages are normally * freed via pagevecs. But it gets used by networking. 
@@ -141,13 +219,13 @@ void rotate_reclaimable_page(struct page *page) !PageUnevictable(page) && PageLRU(page)) { struct pagevec *pvec; unsigned long flags; + int cpu; page_cache_get(page); - local_irq_save(flags); - pvec = &__get_cpu_var(lru_rotate_pvecs); + pvec = swap_get_cpu_var_irq_save(lru_rotate_pvecs, flags, cpu); if (!pagevec_add(pvec, page)) pagevec_move_tail(pvec); - local_irq_restore(flags); + swap_put_cpu_var_irq_restore(lru_rotate_pvecs, flags, cpu); } } @@ -216,12 +294,14 @@ EXPORT_SYMBOL(mark_page_accessed); void __lru_cache_add(struct page *page, enum lru_list lru) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; + struct pagevec *pvec; + int cpu; + pvec = swap_get_cpu_var(lru_add_pvecs, cpu)[lru]; page_cache_get(page); if (!pagevec_add(pvec, page)) ____pagevec_lru_add(pvec, lru); - put_cpu_var(lru_add_pvecs); + swap_put_cpu_var(lru_add_pvecs, cpu); } /** @@ -271,31 +351,33 @@ void add_page_to_unevictable_list(struct page *page) */ static void drain_cpu_pagevecs(int cpu) { - struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); - struct pagevec *pvec; + struct pagevec *pvecs, *pvec; int lru; + pvecs = swap_per_cpu_lock(lru_add_pvecs, cpu)[0]; for_each_lru(lru) { pvec = &pvecs[lru - LRU_BASE]; if (pagevec_count(pvec)) ____pagevec_lru_add(pvec, lru); } + swap_per_cpu_unlock(lru_add_pvecs, cpu); - pvec = &per_cpu(lru_rotate_pvecs, cpu); + pvec = swap_per_cpu_lock(lru_rotate_pvecs, cpu); if (pagevec_count(pvec)) { unsigned long flags; /* No harm done if a racing interrupt already did this */ - local_irq_save(flags); + swap_irq_save(flags); pagevec_move_tail(pvec); - local_irq_restore(flags); + swap_irq_restore(flags); } + swap_per_cpu_unlock(lru_rotate_pvecs, cpu); } void lru_add_drain(void) { - drain_cpu_pagevecs(get_cpu()); - put_cpu(); + drain_cpu_pagevecs(swap_get_cpu()); + swap_put_cpu(); } static void lru_add_drain_per_cpu(struct work_struct *dummy) @@ -369,7 +451,7 @@ void release_pages(struct page **pages, int nr, int cold) } 
__pagevec_free(&pages_to_free); pagevec_reinit(&pages_to_free); - } + } } if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); diff --git a/mm/vmscan.c b/mm/vmscan.c index c26986c..f055abc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1196,7 +1196,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, nr_reclaimed += nr_freed; - local_irq_disable(); + local_irq_disable_nort(); if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_freed); __count_zone_vm_events(PGSTEAL, zone, nr_freed); diff --git a/mm/vmstat.c b/mm/vmstat.c index 6051fba..ea832b5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -149,17 +149,16 @@ static void refresh_zone_stat_thresholds(void) void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = zone_pcp(zone, get_cpu()); s8 *p = pcp->vm_stat_diff + item; - long x; - - x = delta + *p; + long x = delta + *p; if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { zone_page_state_add(x, zone, item); x = 0; } *p = x; + put_cpu(); } EXPORT_SYMBOL(__mod_zone_page_state); @@ -202,7 +201,7 @@ EXPORT_SYMBOL(mod_zone_page_state); */ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = zone_pcp(zone, get_cpu()); s8 *p = pcp->vm_stat_diff + item; (*p)++; @@ -213,17 +212,28 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(*p + overstep, zone, item); *p = -overstep; } + put_cpu(); } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) { +#ifdef CONFIG_PREEMPT_RT + unsigned long flags; + struct zone *zone; + + zone = page_zone(page); + local_irq_save(flags); + __inc_zone_state(zone, item); + local_irq_restore(flags); +#else __inc_zone_state(page_zone(page), item); +#endif } EXPORT_SYMBOL(__inc_zone_page_state); void 
__dec_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = zone_pcp(zone, get_cpu()); s8 *p = pcp->vm_stat_diff + item; (*p)--; @@ -234,6 +244,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(*p - overstep, zone, item); *p = overstep; } + put_cpu(); } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) diff --git a/net/core/dev.c b/net/core/dev.c index be9924f..6a867ce 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2064,11 +2064,12 @@ gso: Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { - int cpu = smp_processor_id(); /* ok because BHs are off */ - - if (txq->xmit_lock_owner != cpu) { + /* + * No need to check for recursion with threaded interrupts: + */ + if (!netif_tx_lock_recursion(txq)) { - HARD_TX_LOCK(dev, txq, cpu); + HARD_TX_LOCK(dev, txq); if (!netif_tx_queue_stopped(txq)) { rc = dev_hard_start_xmit(skb, dev, txq); @@ -2173,8 +2174,8 @@ int netif_rx_ni(struct sk_buff *skb) { int err; - preempt_disable(); err = netif_rx(skb); + preempt_disable(); if (local_softirq_pending()) do_softirq(); preempt_enable(); @@ -2185,7 +2186,8 @@ EXPORT_SYMBOL(netif_rx_ni); static void net_tx_action(struct softirq_action *h) { - struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct softnet_data *sd = &per_cpu(softnet_data, + raw_smp_processor_id()); if (sd->completion_queue) { struct sk_buff *clist; @@ -2201,6 +2203,11 @@ static void net_tx_action(struct softirq_action *h) WARN_ON(atomic_read(&skb->users)); __kfree_skb(skb); + /* + * Safe to reschedule - the list is private + * at this point. 
+ */ + cond_resched_softirq_context(); } } @@ -2219,6 +2226,22 @@ static void net_tx_action(struct softirq_action *h) head = head->next_sched; root_lock = qdisc_lock(q); + /* + * We are executing in softirq context here, and + * if softirqs are preemptible, we must avoid + * infinite reactivation of the softirq by + * either the tx handler, or by netif_schedule(). + * (it would result in an infinitely looping + * softirq context) + * So we take the spinlock unconditionally. + */ +#ifdef CONFIG_PREEMPT_SOFTIRQS + spin_lock(root_lock); + smp_mb__before_clear_bit(); + clear_bit(__QDISC_STATE_SCHED, &q->state); + qdisc_run(q); + spin_unlock(root_lock); +#else if (spin_trylock(root_lock)) { smp_mb__before_clear_bit(); clear_bit(__QDISC_STATE_SCHED, @@ -2235,6 +2258,7 @@ static void net_tx_action(struct softirq_action *h) &q->state); } } +#endif } } } @@ -2447,7 +2471,7 @@ int netif_receive_skb(struct sk_buff *skb) skb->dev = orig_dev->master; } - __get_cpu_var(netdev_rx_stat).total++; + per_cpu(netdev_rx_stat, raw_smp_processor_id()).total++; skb_reset_network_header(skb); skb_reset_transport_header(skb); @@ -2761,7 +2785,7 @@ gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, switch (ret) { case GRO_NORMAL: case GRO_HELD: - skb->protocol = eth_type_trans(skb, napi->dev); + skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_HELD) skb_gro_pull(skb, -ETH_HLEN); @@ -2833,9 +2857,10 @@ EXPORT_SYMBOL(napi_gro_frags); static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; - struct softnet_data *queue = &__get_cpu_var(softnet_data); + struct softnet_data *queue; unsigned long start_time = jiffies; + queue = &per_cpu(softnet_data, raw_smp_processor_id()); napi->weight = weight_p; do { struct sk_buff *skb; @@ -2867,7 +2892,7 @@ void __napi_schedule(struct napi_struct *n) local_irq_save(flags); list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list); - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + 
raise_softirq_irqoff(NET_RX_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(__napi_schedule); @@ -3021,7 +3046,7 @@ out: softnet_break: __get_cpu_var(netdev_rx_stat).time_squeeze++; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + raise_softirq_irqoff(NET_RX_SOFTIRQ); goto out; } @@ -4853,7 +4878,7 @@ static void __netdev_init_queue_locks_one(struct net_device *dev, { spin_lock_init(&dev_queue->_xmit_lock); netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); - dev_queue->xmit_lock_owner = -1; + dev_queue->xmit_lock_owner = (void *)-1; } static void netdev_init_queue_locks(struct net_device *dev) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index d8aee58..236a998 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -927,6 +927,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GPERMADDR: case ETHTOOL_GUFO: case ETHTOOL_GGSO: + case ETHTOOL_GGRO: case ETHTOOL_GFLAGS: case ETHTOOL_GPFLAGS: case ETHTOOL_GRXFH: diff --git a/net/core/flow.c b/net/core/flow.c index 9601587..f032d1c 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -39,9 +39,10 @@ atomic_t flow_cache_genid = ATOMIC_INIT(0); static u32 flow_hash_shift; #define flow_hash_size (1 << flow_hash_shift) -static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; -#define flow_table(cpu) (per_cpu(flow_tables, cpu)) +static DEFINE_PER_CPU_LOCKED(struct flow_cache_entry **, flow_tables); + +#define flow_table(cpu) (per_cpu_var_locked(flow_tables, cpu)) static struct kmem_cache *flow_cachep __read_mostly; @@ -168,24 +169,24 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, flow_resolve_t resolver) { - struct flow_cache_entry *fle, **head; + struct flow_cache_entry **table, *fle, **head; unsigned int hash; int cpu; local_bh_disable(); - cpu = smp_processor_id(); + table = get_cpu_var_locked(flow_tables, &cpu); fle = NULL; /* Packet really early in init? 
Making flow_cache_init a * pre-smp initcall would solve this. --RR */ - if (!flow_table(cpu)) + if (!table) goto nocache; if (flow_hash_rnd_recalc(cpu)) flow_new_hash_rnd(cpu); hash = flow_hash_code(key, cpu); - head = &flow_table(cpu)[hash]; + head = &table[hash]; for (fle = *head; fle; fle = fle->next) { if (fle->family == family && fle->dir == dir && @@ -195,6 +196,7 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, if (ret) atomic_inc(fle->object_ref); + put_cpu_var_locked(flow_tables, cpu); local_bh_enable(); return ret; @@ -220,6 +222,8 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, } nocache: + put_cpu_var_locked(flow_tables, cpu); + { int err; void *obj; @@ -249,14 +253,15 @@ nocache: static void flow_cache_flush_tasklet(unsigned long data) { struct flow_flush_info *info = (void *)data; + struct flow_cache_entry **table; int i; int cpu; - cpu = smp_processor_id(); + table = get_cpu_var_locked(flow_tables, &cpu); for (i = 0; i < flow_hash_size; i++) { struct flow_cache_entry *fle; - fle = flow_table(cpu)[i]; + fle = table[i]; for (; fle; fle = fle->next) { unsigned genid = atomic_read(&flow_cache_genid); @@ -267,6 +272,7 @@ static void flow_cache_flush_tasklet(unsigned long data) atomic_dec(fle->object_ref); } } + put_cpu_var_locked(flow_tables, cpu); if (atomic_dec_and_test(&info->cpuleft)) complete(&info->completion); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index fbc1c74..099c753 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -410,7 +410,8 @@ static ssize_t wireless_show(struct device *d, char *buf, const struct iw_statistics *iw; ssize_t ret = -EINVAL; - rtnl_lock(); + if (!rtnl_trylock()) + return restart_syscall(); if (dev_isalive(dev)) { iw = get_wireless_stats(dev); if (iw) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 0b4d0d3..ffd4ecb 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -73,20 +73,20 @@ static void 
queue_process(struct work_struct *work) txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - local_irq_save(flags); - __netif_tx_lock(txq, smp_processor_id()); + local_irq_save_nort(flags); + __netif_tx_lock(txq); if (netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq) || ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { skb_queue_head(&npinfo->txq, skb); __netif_tx_unlock(txq); - local_irq_restore(flags); + local_irq_restore_nort(flags); schedule_delayed_work(&npinfo->tx_work, HZ/10); return; } __netif_tx_unlock(txq); - local_irq_restore(flags); + local_irq_restore_nort(flags); } } @@ -157,7 +157,7 @@ static void poll_napi(struct net_device *dev) int budget = 16; list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (napi->poll_owner != smp_processor_id() && + if (napi->poll_owner != raw_smp_processor_id() && spin_trylock(&napi->poll_lock)) { budget = poll_one_napi(dev->npinfo, napi, budget); spin_unlock(&napi->poll_lock); @@ -218,30 +218,35 @@ static void refill_skbs(void) static void zap_completion_queue(void) { - unsigned long flags; struct softnet_data *sd = &get_cpu_var(softnet_data); + struct sk_buff *clist = NULL; + unsigned long flags; if (sd->completion_queue) { - struct sk_buff *clist; local_irq_save(flags); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_restore(flags); - - while (clist != NULL) { - struct sk_buff *skb = clist; - clist = clist->next; - if (skb->destructor) { - atomic_inc(&skb->users); - dev_kfree_skb_any(skb); /* put this one back */ - } else { - __kfree_skb(skb); - } - } } + + /* + * Took the list private, can drop our softnet + * reference: + */ put_cpu_var(softnet_data); + + while (clist != NULL) { + struct sk_buff *skb = clist; + clist = clist->next; + if (skb->destructor) { + atomic_inc(&skb->users); + dev_kfree_skb_any(skb); /* put this one back */ + } else { + __kfree_skb(skb); + } + } } static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) @@ -249,13 +254,26 @@ static 
struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) int count = 0; struct sk_buff *skb; +#ifdef CONFIG_PREEMPT_RT + /* + * On -rt skb_pool.lock is schedulable, so if we are + * in an atomic context we just try to dequeue from the + * pool and fail if we cannot get one. + */ + if (in_atomic() || irqs_disabled()) + goto pick_atomic; +#endif zap_completion_queue(); refill_skbs(); repeat: skb = alloc_skb(len, GFP_ATOMIC); - if (!skb) + if (!skb) { +#ifdef CONFIG_PREEMPT_RT +pick_atomic: +#endif skb = skb_dequeue(&skb_pool); + } if (!skb) { if (++count < 10) { @@ -275,7 +293,7 @@ static int netpoll_owner_active(struct net_device *dev) struct napi_struct *napi; list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (napi->poll_owner == smp_processor_id()) + if (napi->poll_owner == raw_smp_processor_id()) return 1; } return 0; @@ -301,7 +319,7 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - local_irq_save(flags); + local_irq_save_nort(flags); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { @@ -328,7 +346,7 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) "netpoll_send_skb(): %s enabled interrupts in poll (%pF)\n", dev->name, ops->ndo_start_xmit); - local_irq_restore(flags); + local_irq_restore_nort(flags); } if (status != NETDEV_TX_OK) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 93c4e06..68c401c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -395,7 +395,7 @@ static void skb_release_head_state(struct sk_buff *skb) secpath_put(skb->sp); #endif if (skb->destructor) { - WARN_ON(in_irq()); +// WARN_ON(in_irq()); skb->destructor(skb); } #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) diff --git a/net/core/sock.c b/net/core/sock.c index e1f6f22..f852c18 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2123,8 +2123,9 @@ static 
DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); #ifdef CONFIG_NET_NS void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) { - int cpu = smp_processor_id(); + int cpu = get_cpu(); per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val; + put_cpu(); } EXPORT_SYMBOL_GPL(sock_prot_inuse_add); @@ -2170,7 +2171,9 @@ static DEFINE_PER_CPU(struct prot_inuse, prot_inuse); void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) { - __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val; + int cpu = get_cpu(); + per_cpu(prot_inuse, cpu).val[prot->inuse_idx] += val; + put_cpu(); } EXPORT_SYMBOL_GPL(sock_prot_inuse_add); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 040c4f0..26dec2b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1317,14 +1317,19 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, { int *valp = ctl->data; int val = *valp; + loff_t pos = *ppos; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write && *valp != val) { struct net *net = ctl->extra2; if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { - if (!rtnl_trylock()) + if (!rtnl_trylock()) { + /* Restore the original values before restarting */ + *valp = val; + *ppos = pos; return restart_syscall(); + } if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { inet_forward_change(net); } else if (*valp) { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index fe11f60..52463e4 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -201,7 +201,10 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; */ static struct sock *icmp_sk(struct net *net) { - return net->ipv4.icmp_sk[smp_processor_id()]; + /* + * Should be safe on PREEMPT_SOFTIRQS/HARDIRQS to use raw-smp-processor-id: + */ + return net->ipv4.icmp_sk[raw_smp_processor_id()]; } static inline struct sock *icmp_xmit_lock(struct net *net) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 76c0840..a42f658 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ 
-946,7 +946,6 @@ int igmp_rcv(struct sk_buff *skb) break; case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: - case IGMPV3_HOST_MEMBERSHIP_REPORT: /* Is it our report looped back? */ if (skb_rtable(skb)->fl.iif == 0) break; @@ -960,6 +959,7 @@ int igmp_rcv(struct sk_buff *skb) in_dev_put(in_dev); return pim_rcv_v1(skb); #endif + case IGMPV3_HOST_MEMBERSHIP_REPORT: case IGMP_DVMRP: case IGMP_TRACE: case IGMP_HOST_LEAVE_MESSAGE: diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 38fbf04..544ce08 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -124,16 +124,12 @@ static int ipcomp4_init_state(struct xfrm_state *x) if (x->props.mode == XFRM_MODE_TUNNEL) { err = ipcomp_tunnel_attach(x); if (err) - goto error_tunnel; + goto out; } err = 0; out: return err; - -error_tunnel: - ipcomp_destroy(x); - goto out; } static const struct xfrm_type ipcomp_type = { diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 90203e1..aeaf948 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -252,6 +252,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, void *table_base; const struct xt_table_info *private; struct xt_target_param tgpar; + int cpu; if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) return NF_DROP; @@ -259,9 +260,9 @@ unsigned int arpt_do_table(struct sk_buff *skb, indev = in ? in->name : nulldevname; outdev = out ? 
out->name : nulldevname; - xt_info_rdlock_bh(); + cpu = xt_info_rdlock_bh(); private = table->private; - table_base = private->entries[smp_processor_id()]; + table_base = private->entries[cpu]; e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -332,7 +333,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!hotdrop); - xt_info_rdunlock_bh(); + xt_info_rdunlock_bh(cpu); if (hotdrop) return NF_DROP; @@ -727,7 +728,7 @@ static void get_counters(const struct xt_table_info *t, { unsigned int cpu; unsigned int i; - unsigned int curcpu; + unsigned int curcpu = NR_CPUS; /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters @@ -737,6 +738,7 @@ static void get_counters(const struct xt_table_info *t, * if new softirq were to run and call ipt_do_table */ local_bh_disable(); +#ifndef CONFIG_PREEMPT_RT curcpu = smp_processor_id(); i = 0; @@ -745,7 +747,7 @@ static void get_counters(const struct xt_table_info *t, set_entry_to_counter, counters, &i); - +#endif for_each_possible_cpu(cpu) { if (cpu == curcpu) continue; @@ -1201,7 +1203,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, i = 0; /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); + curcpu = raw_smp_processor_id(); loc_cpu_entry = private->entries[curcpu]; xt_info_wrlock(curcpu); ARPT_ENTRY_ITERATE(loc_cpu_entry, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 3ce53cf..afeadce 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -320,6 +320,7 @@ ipt_do_table(struct sk_buff *skb, struct xt_table_info *private; struct xt_match_param mtpar; struct xt_target_param tgpar; + int cpu; /* Initialization */ ip = ip_hdr(skb); @@ -340,9 +341,9 @@ ipt_do_table(struct sk_buff *skb, mtpar.hooknum = tgpar.hooknum = hook; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - 
xt_info_rdlock_bh(); + cpu = xt_info_rdlock_bh(); private = table->private; - table_base = private->entries[smp_processor_id()]; + table_base = private->entries[cpu]; e = get_entry(table_base, private->hook_entry[hook]); @@ -427,7 +428,7 @@ ipt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!hotdrop); - xt_info_rdunlock_bh(); + xt_info_rdunlock_bh(cpu); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -906,7 +907,7 @@ get_counters(const struct xt_table_info *t, { unsigned int cpu; unsigned int i; - unsigned int curcpu; + unsigned int curcpu = NR_CPUS; /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters @@ -916,6 +917,7 @@ get_counters(const struct xt_table_info *t, * if new softirq were to run and call ipt_do_table */ local_bh_disable(); +#ifndef CONFIG_PREEMPT_RT curcpu = smp_processor_id(); i = 0; @@ -924,7 +926,7 @@ get_counters(const struct xt_table_info *t, set_entry_to_counter, counters, &i); - +#endif for_each_possible_cpu(cpu) { if (cpu == curcpu) continue; @@ -1405,7 +1407,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat i = 0; /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); + curcpu = raw_smp_processor_id(); loc_cpu_entry = private->entries[curcpu]; xt_info_wrlock(curcpu); IPT_ENTRY_ITERATE(loc_cpu_entry, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d62b05d..1d2a42b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -204,13 +204,13 @@ struct rt_hash_bucket { }; #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ - defined(CONFIG_PROVE_LOCKING) + defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_PREEMPT_RT) /* * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks * The size of this table is a power of two and depends on the number of CPUS. 
* (on lockdep we have a quite big spinlock_t, so keep the size down there) */ -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT) # define RT_HASH_LOCK_SZ 256 #else # if NR_CPUS >= 32 @@ -242,7 +242,7 @@ static __init void rt_hash_lock_init(void) spin_lock_init(&rt_hash_locks[i]); } #else -# define rt_hash_lock_addr(slot) NULL +# define rt_hash_lock_addr(slot) ((spinlock_t *)NULL) static inline void rt_hash_lock_init(void) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b0a26bb..02bebfe 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1405,11 +1405,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && dma_find_channel(DMA_MEMCPY)) { - preempt_enable_no_resched(); + preempt_enable(); tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); } else { - preempt_enable_no_resched(); + preempt_enable(); } } #endif diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 28e0296..3fddc69 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5783,11 +5783,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* tcp_ack considers this ACK as duplicate * and does not calculate rtt. - * Fix it at least with timestamps. + * Force it here. 
*/ - if (tp->rx_opt.saw_tstamp && - tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(sk, 0); + tcp_ack_update_rtt(sk, 0, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index de7a194..143791d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -502,8 +502,11 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old) if (p == &net->ipv6.devconf_dflt->forwarding) return 0; - if (!rtnl_trylock()) + if (!rtnl_trylock()) { + /* Restore the original values before restarting */ + *p = old; return restart_syscall(); + } if (p == &net->ipv6.devconf_all->forwarding) { __s32 newf = net->ipv6.devconf_all->forwarding; @@ -4028,12 +4031,15 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, { int *valp = ctl->data; int val = *valp; + loff_t pos = *ppos; int ret; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write) ret = addrconf_fixup_forwarding(ctl, valp, val); + if (ret) + *ppos = pos; return ret; } @@ -4075,8 +4081,11 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int old) if (p == &net->ipv6.devconf_dflt->disable_ipv6) return 0; - if (!rtnl_trylock()) + if (!rtnl_trylock()) { + /* Restore the original values before restarting */ + *p = old; return restart_syscall(); + } if (p == &net->ipv6.devconf_all->disable_ipv6) { __s32 newf = net->ipv6.devconf_all->disable_ipv6; @@ -4095,12 +4104,15 @@ int addrconf_sysctl_disable(ctl_table *ctl, int write, { int *valp = ctl->data; int val = *valp; + loff_t pos = *ppos; int ret; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write) ret = addrconf_disable_ipv6(ctl, valp, val); + if (ret) + *ppos = pos; return ret; } diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 2f2a5ca..002e6ee 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -154,16 +154,12 @@ static int ipcomp6_init_state(struct xfrm_state *x) if (x->props.mode == XFRM_MODE_TUNNEL) { err = 
ipcomp6_tunnel_attach(x); if (err) - goto error_tunnel; + goto out; } err = 0; out: return err; -error_tunnel: - ipcomp_destroy(x); - - goto out; } static const struct xfrm_type ipcomp6_type = diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 8a7e0f5..9238a5a 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -350,6 +350,7 @@ ip6t_do_table(struct sk_buff *skb, struct xt_table_info *private; struct xt_match_param mtpar; struct xt_target_param tgpar; + int cpu; /* Initialization */ indev = in ? in->name : nulldevname; @@ -368,9 +369,9 @@ ip6t_do_table(struct sk_buff *skb, IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - xt_info_rdlock_bh(); + cpu = xt_info_rdlock_bh(); private = table->private; - table_base = private->entries[smp_processor_id()]; + table_base = private->entries[cpu]; e = get_entry(table_base, private->hook_entry[hook]); @@ -459,7 +460,7 @@ ip6t_do_table(struct sk_buff *skb, #ifdef CONFIG_NETFILTER_DEBUG tb_comefrom = NETFILTER_LINK_POISON; #endif - xt_info_rdunlock_bh(); + xt_info_rdunlock_bh(cpu); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -938,7 +939,7 @@ get_counters(const struct xt_table_info *t, { unsigned int cpu; unsigned int i; - unsigned int curcpu; + unsigned int curcpu = NR_CPUS; /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters @@ -948,6 +949,8 @@ get_counters(const struct xt_table_info *t, * if new softirq were to run and call ipt_do_table */ local_bh_disable(); + +#ifndef CONFIG_PREEMPT_RT curcpu = smp_processor_id(); i = 0; @@ -956,7 +959,7 @@ get_counters(const struct xt_table_info *t, set_entry_to_counter, counters, &i); - +#endif for_each_possible_cpu(cpu) { if (cpu == curcpu) continue; @@ -977,12 +980,13 @@ static struct xt_counters *alloc_counters(struct xt_table *table) unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; + int node = 
cpu_to_node(raw_smp_processor_id()); /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; - counters = vmalloc_node(countersize, numa_node_id()); + counters = vmalloc_node(countersize, node); if (counters == NULL) return ERR_PTR(-ENOMEM); @@ -1440,7 +1444,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, i = 0; /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); + curcpu = raw_smp_processor_id(); xt_info_wrlock(curcpu); loc_cpu_entry = private->entries[curcpu]; IP6T_ENTRY_ITERATE(loc_cpu_entry, diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 1f2db64..22f0c2a 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -647,7 +647,7 @@ static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata, } if (pos[1] != 0 && (pos[1] != ifibss->ssid_len || - !memcmp(pos + 2, ifibss->ssid, ifibss->ssid_len))) { + memcmp(pos + 2, ifibss->ssid, ifibss->ssid_len))) { /* Ignore ProbeReq for foreign SSID */ return; } diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index b9007f8..12a2bff 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -245,6 +245,9 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, info->control.rates[i].count = 1; } + if (sdata->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) + return; + if (sta && sdata->force_unicast_rateidx > -1) { info->control.rates[0].idx = sdata->force_unicast_rateidx; } else { diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 82a30c1..e735c17 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2464,7 +2464,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) struct ieee80211_supported_band *sband; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - WARN_ON_ONCE(softirq_count() == 0); + WARN_ON_ONCE_NONRT(softirq_count() == 0); if (WARN_ON(status->band < 0 || 
status->band >= IEEE80211_NUM_BANDS)) diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index f934c96..bc17cf7 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -439,6 +439,16 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, if (local->scan_req) return -EBUSY; + if (req != local->int_scan_req && + sdata->vif.type == NL80211_IFTYPE_STATION && + !list_empty(&ifmgd->work_list)) { + /* actually wait for the work it's doing to finish/time out */ + set_bit(IEEE80211_STA_REQ_SCAN, &ifmgd->request); + local->scan_req = req; + local->scan_sdata = sdata; + return 0; + } + if (local->ops->hw_scan) { u8 *ies; @@ -463,14 +473,6 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, local->scan_req = req; local->scan_sdata = sdata; - if (req != local->int_scan_req && - sdata->vif.type == NL80211_IFTYPE_STATION && - !list_empty(&ifmgd->work_list)) { - /* actually wait for the work it's doing to finish/time out */ - set_bit(IEEE80211_STA_REQ_SCAN, &ifmgd->request); - return 0; - } - if (local->ops->hw_scan) __set_bit(SCAN_HW_SCANNING, &local->scanning); else diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 60ec4e4..978dd44 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -233,7 +233,7 @@ EXPORT_SYMBOL(nf_ct_attach); void (*nf_ct_destroy)(struct nf_conntrack *); EXPORT_SYMBOL(nf_ct_destroy); -void nf_conntrack_destroy(struct nf_conntrack *nfct) +static void __nf_conntrack_destroy(struct nf_conntrack *nfct) { void (*destroy)(struct nf_conntrack *); @@ -243,6 +243,28 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) destroy(nfct); rcu_read_unlock(); } + +#ifdef CONFIG_PREEMPT_RT +/* + * nf_conntrack_destroy is called with preemption disabled + * and will call functions that might schedule in PREEMPT_RT. + * For PREEMPT_RT we use an RCU callback instead to handle + * the destruction.
+ */ +static void nf_conntrack_destroy_rcu(struct rcu_head *rhp) +{ + __nf_conntrack_destroy(container_of(rhp, struct nf_conntrack, rcu)); +} +void nf_conntrack_destroy(struct nf_conntrack *nfct) +{ + call_rcu(&nfct->rcu, nf_conntrack_destroy_rcu); +} +#else /* !PREEMPT_RT */ +void nf_conntrack_destroy(struct nf_conntrack *nfct) +{ + __nf_conntrack_destroy(nfct); +} +#endif /* PREEMPT_RT */ EXPORT_SYMBOL(nf_conntrack_destroy); #endif /* CONFIG_NF_CONNTRACK */ diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4c5972b..95826e0 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1074,7 +1074,7 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, return -ENOBUFS; if (info.delivered) { - if (info.congested && (allocation & __GFP_WAIT)) + if (info.congested && (allocation & __GFP_WAIT) && !rt_task(current)) yield(); return 0; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 5173c1e..bf7f50e 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -12,6 +12,7 @@ */ #include <linux/bitops.h> +#include <linux/kallsyms.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> @@ -24,6 +25,7 @@ #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/list.h> +#include <linux/delay.h> #include <net/pkt_sched.h> /* Main transmission queue. */ @@ -76,7 +78,7 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, { int ret; - if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) { + if (unlikely(netif_tx_lock_recursion(dev_queue))) { /* * Same CPU holding the lock. It may be a transient * configuration error, when hard_start_xmit() recurses. We @@ -93,7 +95,9 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, * Another cpu is holding lock, requeue & delay xmits for * some time. 
*/ + preempt_disable(); /* FIXME: we need an _rt version of this */ __get_cpu_var(netdev_rx_stat).cpu_collision++; + preempt_enable(); ret = dev_requeue_skb(skb, q); } @@ -118,7 +122,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, /* And release qdisc */ spin_unlock(root_lock); - HARD_TX_LOCK(dev, txq, smp_processor_id()); + HARD_TX_LOCK(dev, txq); if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq)) ret = dev_hard_start_xmit(skb, dev, txq); @@ -786,9 +790,12 @@ void dev_deactivate(struct net_device *dev) /* Wait for outstanding qdisc-less dev_queue_xmit calls. */ synchronize_rcu(); - /* Wait for outstanding qdisc_run calls. */ + /* + * Wait for outstanding qdisc_run calls. + * TODO: shouldn't this be wakeup-based, instead of polling it? + */ while (some_qdisc_is_busy(dev)) - yield(); + msleep(1); } static void dev_init_scheduler_queue(struct net_device *dev, diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index b36cc34..f445ea1 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1102,7 +1102,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, int *errp) int err = -ENOMEM; struct xfrm_state *x = xfrm_state_alloc(net); if (!x) - goto error; + goto out; memcpy(&x->id, &orig->id, sizeof(x->id)); memcpy(&x->sel, &orig->sel, sizeof(x->sel)); @@ -1160,16 +1160,10 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, int *errp) return x; error: + xfrm_state_put(x); +out: if (errp) *errp = err; - if (x) { - kfree(x->aalg); - kfree(x->ealg); - kfree(x->calg); - kfree(x->encap); - kfree(x->coaddr); - } - kfree(x); return NULL; } diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3257d3d..46f47ac 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2618,14 +2618,11 @@ sub process { WARN("__func__ should be used instead of gcc specific __FUNCTION__\n" . 
$herecurr); } -# check for semaphores used as mutexes - if ($line =~ /^.\s*(DECLARE_MUTEX|init_MUTEX)\s*\(/) { - WARN("mutexes are preferred for single holder semaphores\n" . $herecurr); - } -# check for semaphores used as mutexes - if ($line =~ /^.\s*init_MUTEX_LOCKED\s*\(/) { +# check for semaphores initialized locked + if ($line =~ /sema_init\(.*,\s*0\)/) { WARN("consider using a completion\n" . $herecurr); } + # recommend strict_strto* over simple_strto* if ($line =~ /\bsimple_(strto.*?)\s*\(/) { WARN("consider using strict_$1 in preference to simple_$1\n" . $herecurr); diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h index 23dbad8..20fc5f6 100755 --- a/scripts/mkcompile_h +++ b/scripts/mkcompile_h @@ -4,7 +4,8 @@ TARGET=$1 ARCH=$2 SMP=$3 PREEMPT=$4 -CC=$5 +PREEMPT_RT=$5 +CC=$6 vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; } @@ -47,6 +48,7 @@ UTS_VERSION="#$VERSION" CONFIG_FLAGS="" if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi +if [ -n "$PREEMPT_RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP" # Truncate to maximum length diff --git a/security/keys/permission.c b/security/keys/permission.c index 0ed802c..5d16a18 100644 --- a/security/keys/permission.c +++ b/security/keys/permission.c @@ -23,8 +23,7 @@ * Check to see whether permission is granted to use a key in the desired way, * but permit the security modules to override. * - * The caller must hold either a ref on cred or must hold the RCU readlock or a - * spinlock. + * The caller must hold either a ref on cred or must hold the RCU readlock. 
*/ int key_task_permission(const key_ref_t key_ref, const struct cred *cred, key_perm_t perm) diff --git a/security/keys/proc.c b/security/keys/proc.c index 9d01021..55eeba4 100644 --- a/security/keys/proc.c +++ b/security/keys/proc.c @@ -194,8 +194,6 @@ static int proc_keys_show(struct seq_file *m, void *v) /* check whether the current task is allowed to view the key (assuming * non-possession) - * - the caller holds a spinlock, and thus the RCU read lock, making our - * access to __current_cred() safe */ rc = key_task_permission(make_key_ref(key, 0), current_cred(), KEY_VIEW); diff --git a/sound/drivers/pcsp/pcsp.h b/sound/drivers/pcsp/pcsp.h index 1e12307..4ff6c8c 100644 --- a/sound/drivers/pcsp/pcsp.h +++ b/sound/drivers/pcsp/pcsp.h @@ -16,7 +16,7 @@ #include <asm/i8253.h> #else #include <asm/8253pit.h> -static DEFINE_SPINLOCK(i8253_lock); +static DEFINE_RAW_SPINLOCK(i8253_lock); #endif #define PCSP_SOUND_VERSION 0x400 /* read 4.00 */ diff --git a/sound/drivers/pcsp/pcsp_input.c b/sound/drivers/pcsp/pcsp_input.c index 0444cde..b5e2b54 100644 --- a/sound/drivers/pcsp/pcsp_input.c +++ b/sound/drivers/pcsp/pcsp_input.c @@ -21,7 +21,7 @@ static void pcspkr_do_sound(unsigned int count) { unsigned long flags; - spin_lock_irqsave(&i8253_lock, flags); + raw_spin_lock_irqsave(&i8253_lock, flags); if (count) { /* set command for counter 2, 2 byte write */ @@ -36,7 +36,7 @@ static void pcspkr_do_sound(unsigned int count) outb(inb_p(0x61) & 0xFC, 0x61); } - spin_unlock_irqrestore(&i8253_lock, flags); + raw_spin_unlock_irqrestore(&i8253_lock, flags); } void pcspkr_stop_sound(void) diff --git a/sound/drivers/pcsp/pcsp_lib.c b/sound/drivers/pcsp/pcsp_lib.c index e1145ac..f6a2e72 100644 --- a/sound/drivers/pcsp/pcsp_lib.c +++ b/sound/drivers/pcsp/pcsp_lib.c @@ -65,7 +65,7 @@ static u64 pcsp_timer_update(struct snd_pcsp *chip) timer_cnt = val * CUR_DIV() / 256; if (timer_cnt && chip->enable) { - spin_lock_irqsave(&i8253_lock, flags); + raw_spin_lock_irqsave(&i8253_lock, flags); 
if (!nforce_wa) { outb_p(chip->val61, 0x61); outb_p(timer_cnt, 0x42); @@ -74,7 +74,7 @@ static u64 pcsp_timer_update(struct snd_pcsp *chip) outb(chip->val61 ^ 2, 0x61); chip->thalf = 1; } - spin_unlock_irqrestore(&i8253_lock, flags); + raw_spin_unlock_irqrestore(&i8253_lock, flags); } chip->ns_rem = PCSP_PERIOD_NS(); @@ -158,10 +158,10 @@ static int pcsp_start_playing(struct snd_pcsp *chip) return -EIO; } - spin_lock(&i8253_lock); + raw_spin_lock(&i8253_lock); chip->val61 = inb(0x61) | 0x03; outb_p(0x92, 0x43); /* binary, mode 1, LSB only, ch 2 */ - spin_unlock(&i8253_lock); + raw_spin_unlock(&i8253_lock); atomic_set(&chip->timer_active, 1); chip->thalf = 0; @@ -178,11 +178,11 @@ static void pcsp_stop_playing(struct snd_pcsp *chip) return; atomic_set(&chip->timer_active, 0); - spin_lock(&i8253_lock); + raw_spin_lock(&i8253_lock); /* restore the timer */ outb_p(0xb6, 0x43); /* binary, mode 3, LSB/MSB, ch 2 */ outb(chip->val61 & 0xFC, 0x61); - spin_unlock(&i8253_lock); + raw_spin_unlock(&i8253_lock); } /* diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 3600e9c..ff6da6f 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2350,7 +2350,7 @@ static void __devinit check_probe_mask(struct azx *chip, int dev) */ static struct snd_pci_quirk msi_black_list[] __devinitdata = { SND_PCI_QUIRK(0x1043, 0x81f2, "ASUS", 0), /* Athlon64 X2 + nvidia */ - SND_PCI_QUIRK(0x1043, 0x829c, "ASUS", 0), /* nvidia */ + SND_PCI_QUIRK(0x1043, 0x81f6, "ASUS", 0), /* nvidia */ {} }; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index ddc584b..4b91d8c 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -705,7 +705,7 @@ static void print_mapped_keys(void) fprintf(stdout, "\t[w] toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0); fprintf(stdout, - "\t[K] hide kernel_symbols symbols. \t(%s)\n", + "\t[K] hide kernel_symbols symbols. \t(%s)\n", hide_kernel_symbols ? 
"yes" : "no"); fprintf(stdout, "\t[U] hide user symbols. \t(%s)\n", diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index bb0fd6d..8a9e6ba 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -295,10 +295,10 @@ void thread__find_addr_location(struct thread *self, al->thread = self; al->addr = addr; - if (cpumode & PERF_RECORD_MISC_KERNEL) { + if (cpumode == PERF_RECORD_MISC_KERNEL) { al->level = 'k'; mg = &session->kmaps; - } else if (cpumode & PERF_RECORD_MISC_USER) + } else if (cpumode == PERF_RECORD_MISC_USER) al->level = '.'; else { al->level = 'H'; diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 29465d4..fde17b0 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -272,6 +272,7 @@ int synthesize_perf_probe_point(struct probe_point *pp) int ret; pp->probes[0] = buf = zalloc(MAX_CMDLEN); + pp->found = 1; if (!buf) die("Failed to allocate memory by zalloc."); if (pp->offset) { @@ -294,6 +295,7 @@ int synthesize_perf_probe_point(struct probe_point *pp) error: free(pp->probes[0]); pp->probes[0] = NULL; + pp->found = 0; } return ret; } @@ -455,6 +457,7 @@ void show_perf_probe_events(void) struct strlist *rawlist; struct str_node *ent; + memset(&pp, 0, sizeof(pp)); fd = open_kprobe_events(O_RDONLY, 0); rawlist = get_trace_kprobe_event_rawlist(fd); close(fd); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a944be3..6d0e484 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -137,7 +137,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) zalloc_cpumask_var(&cpus, GFP_ATOMIC); spin_lock(&kvm->requests_lock); - me = smp_processor_id(); + me = raw_smp_processor_id(); kvm_for_each_vcpu(i, vcpu, kvm) { if (test_and_set_bit(req, &vcpu->requests)) continue; @@ -145,12 +145,14 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) if (cpus != NULL && cpu != -1 && cpu != me) cpumask_set_cpu(cpu, cpus); } + 
preempt_disable_rt(); if (unlikely(cpus == NULL)) smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); else if (!cpumask_empty(cpus)) smp_call_function_many(cpus, ack_flush, NULL, 1); else called = false; + preempt_enable_rt(); spin_unlock(&kvm->requests_lock); free_cpumask_var(cpus); return called;