Scheduler updates:

 - migrate_disable/enable() support which originates from the RT tree and
   is now a prerequisite for the new preemptible kmap_local() API which aims
   to replace kmap_atomic().

 - A fair amount of topology and NUMA related improvements

 - Improvements for the frequency invariant calculations

 - Enhanced robustness for the global CPU priority tracking and decision
   making

 - The usual small fixes and enhancements all over the place

 -----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAl/XwK4THHRnbHhAbGlu
 dXRyb25peC5kZQAKCRCmGPVMDXSYoX28D/9cVrvziSQGfBfuQWnUiw8iOIq1QBa2
 Me+Tvenhfrlt7xU6rbP9ciFu7eTN+fS06m5uQPGI+t22WuJmHzbmw1bJVXfkvYfI
 /QoU+Hg7DkDAn1p7ZKXh0dRkV0nI9ixxSHl0E+Zf1ATBxCUMV2SO85flg6z/4qJq
 3VWUye0dmR7/bhtkIjv5rwce9v2JB2g1AbgYXYTW9lHVoUdGoMSdiZAF4tGyHLnx
 sJ6DMqQ+k+dmPyYO0z5MTzjW/fXit4n9w2e3z9TvRH/uBu58WSW1RBmQYX6aHBAg
 dhT9F4lvTs6lJY23x5RSFWDOv6xAvKF5a0xfb8UZcyH5EoLYrPRvm42a0BbjdeRa
 u0z7LbwIlKA+RFdZzFZWz8UvvO0ljyMjmiuqZnZ5dY9Cd80LSBuxrWeQYG0qg6lR
 Y2povhhCepEG+q8AXIe2YjHKWKKC1s/l/VY3CNnCzcd21JPQjQ4Z5eWGmHif5IED
 CntaeFFhZadR3w02tkX35zFmY3w4soKKrbI4EKWrQwd+cIEQlOSY7dEPI/b5BbYj
 MWAb3P4EG9N77AWTNmbhK4nN0brEYb+rBbCA+5dtNBVhHTxAC7OTWElJOC2O66FI
 e06dREjvwYtOkRUkUguWwErbIai2gJ2MH0VILV3hHoh64oRk7jjM8PZYnjQkdptQ
 Gsq0rJW5iiu/OQ==
 =Oz1V
 -----END PGP SIGNATURE-----
Merge tag 'sched-core-2020-12-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Thomas Gleixner:
 - migrate_disable/enable() support which originates from the RT tree
   and is now a prerequisite for the new preemptible kmap_local() API
   which aims to replace kmap_atomic().
 - A fair amount of topology and NUMA related improvements
 - Improvements for the frequency invariant calculations
 - Enhanced robustness for the global CPU priority tracking and decision
   making
 - The usual small fixes and enhancements all over the place
* tag 'sched-core-2020-12-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (61 commits)
  sched/fair: Trivial correction of the newidle_balance() comment
  sched/fair: Clear SMT siblings after determining the core is not idle
  sched: Fix kernel-doc markup
  x86: Print ratio freq_max/freq_base used in frequency invariance calculations
  x86, sched: Use midpoint of max_boost and max_P for frequency invariance on AMD EPYC
  x86, sched: Calculate frequency invariance for AMD systems
  irq_work: Optimize irq_work_single()
  smp: Cleanup smp_call_function*()
  irq_work: Cleanup
  sched: Limit the amount of NUMA imbalance that can exist at fork time
  sched/numa: Allow a floating imbalance between NUMA nodes
  sched: Avoid unnecessary calculation of load imbalance at clone time
  sched/numa: Rename nr_running and break out the magic number
  sched: Make migrate_disable/enable() independent of RT
  sched/topology: Condition EAS enablement on FIE support
  arm64: Rebuild sched domains on invariance status changes
  sched/topology,schedutil: Wrap sched domains rebuild
  sched/uclamp: Allow to reset a task uclamp constraint value
  sched/core: Fix typos in comments
  Documentation: scheduler: fix information on arch SD flags, sched_domain and sched_debug
  ...
			
			
commit adb35e8dc9

 59 changed files with 1929 additions and 642 deletions

@@ -65,21 +65,17 @@ of the SMP domain will span the entire machine, with each group having the
 cpumask of a node. Or, you could do multi-level NUMA or Opteron, for example,
 might have just one domain covering its one NUMA level.

-The implementor should read comments in include/linux/sched.h:
-struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of
-the specifics and what to tune.
+The implementor should read comments in include/linux/sched/sd_flags.h:
+SD_* to get an idea of the specifics and what to tune for the SD flags
+of a sched_domain.

-Architectures may retain the regular override the default SD_*_INIT flags
-while using the generic domain builder in kernel/sched/core.c if they wish to
-retain the traditional SMT->SMP->NUMA topology (or some subset of that). This
-can be done by #define'ing ARCH_HASH_SCHED_TUNE.
-
-Alternatively, the architecture may completely override the generic domain
-builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your
-arch_init_sched_domains function. This function will attach domains to all
-CPUs using cpu_attach_domain.
+Architectures may override the generic domain builder and the default SD flags
+for a given topology level by creating a sched_domain_topology_level array and
+calling set_sched_topology() with this array as the parameter.

 The sched-domains debugging infrastructure can be enabled by enabling
-CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains
-which should catch most possible errors (described above). It also prints out
-the domain structure in a visual format.
+CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to
+tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug
+knob. This enables an error checking parse of the sched domains which should
+catch most possible errors (described above). It also prints out the domain
+structure in a visual format.

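As a rough illustration of the documentation change above, here is a hedged sketch of how an architecture might describe its topology levels and hand them to set_sched_topology(). The mask and flag helpers (cpu_smt_mask(), cpu_smt_flags(), cpu_coregroup_mask(), cpu_core_flags(), cpu_cpu_mask(), SD_INIT_NAME()) are the existing generic ones; the array and init function names are made up for the example:

static struct sched_domain_topology_level example_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

static void __init example_arch_topology_init(void)
{
	/* Replace the default SMT->MC->DIE hierarchy for this architecture. */
	set_sched_topology(example_topology);
}
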
@@ -223,6 +223,7 @@ static DEFINE_STATIC_KEY_FALSE(amu_fie_key);

 static int __init init_amu_fie(void)
 {
+	bool invariance_status = topology_scale_freq_invariant();
 	cpumask_var_t valid_cpus;
 	bool have_policy = false;
 	int ret = 0;
@@ -269,6 +270,15 @@ static int __init init_amu_fie(void)
 	if (!topology_scale_freq_invariant())
 		static_branch_disable(&amu_fie_key);

+	/*
+	 * Task scheduler behavior depends on frequency invariance support,
+	 * either cpufreq or counter driven. If the support status changes as
+	 * a result of counter initialisation and use, retrigger the build of
+	 * scheduling domains to ensure the information is propagated properly.
+	 */
+	if (invariance_status != topology_scale_freq_invariant())
+		rebuild_sched_domains_energy();
+
 free_valid_mask:
 	free_cpumask_var(valid_cpus);

@@ -702,7 +702,6 @@ unsigned long arch_align_stack(unsigned long sp)
 	return sp & ALMASK;
 }

-static DEFINE_PER_CPU(call_single_data_t, backtrace_csd);
 static struct cpumask backtrace_csd_busy;

 static void handle_backtrace(void *info)
@@ -711,6 +710,9 @@ static void handle_backtrace(void *info)
 	cpumask_clear_cpu(smp_processor_id(), &backtrace_csd_busy);
 }

+static DEFINE_PER_CPU(call_single_data_t, backtrace_csd) =
+	CSD_INIT(handle_backtrace, NULL);
+
 static void raise_backtrace(cpumask_t *mask)
 {
 	call_single_data_t *csd;
@@ -730,7 +732,6 @@ static void raise_backtrace(cpumask_t *mask)
 		}

 		csd = &per_cpu(backtrace_csd, cpu);
-		csd->func = handle_backtrace;
 		smp_call_function_single_async(cpu, csd);
 	}
 }

@@ -687,7 +687,13 @@ EXPORT_SYMBOL(flush_tlb_one);

 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST

-static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd);
+static void tick_broadcast_callee(void *info)
+{
+	tick_receive_broadcast();
+}
+
+static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd) =
+	CSD_INIT(tick_broadcast_callee, NULL);

 void tick_broadcast(const struct cpumask *mask)
 {
@@ -700,23 +706,4 @@ void tick_broadcast(const struct cpumask *mask)
 	}
 }

-static void tick_broadcast_callee(void *info)
-{
-	tick_receive_broadcast();
-}
-
-static int __init tick_broadcast_init(void)
-{
-	call_single_data_t *csd;
-	int cpu;
-
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-		csd = &per_cpu(tick_broadcast_csd, cpu);
-		csd->func = tick_broadcast_callee;
-	}
-
-	return 0;
-}
-early_initcall(tick_broadcast_init);
-
 #endif /* CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */

@@ -179,9 +179,7 @@ static void zpci_handle_fallback_irq(void)
 		if (atomic_inc_return(&cpu_data->scheduled) > 1)
 			continue;

-		cpu_data->csd.func = zpci_handle_remote_irq;
-		cpu_data->csd.info = &cpu_data->scheduled;
-		cpu_data->csd.flags = 0;
+		INIT_CSD(&cpu_data->csd, zpci_handle_remote_irq, &cpu_data->scheduled);
 		smp_call_function_single_async(cpu, &cpu_data->csd);
 	}
 }

@@ -218,4 +218,9 @@ static inline void arch_set_max_freq_ratio(bool turbo_disabled)
 }
 #endif

+#ifdef CONFIG_ACPI_CPPC_LIB
+void init_freq_invariance_cppc(void);
+#define init_freq_invariance_cppc init_freq_invariance_cppc
+#endif
+
 #endif /* _ASM_X86_TOPOLOGY_H */

@@ -74,10 +74,9 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,

 	init_completion(&cmd.done);
 	for (; count; count -= 16) {
-		call_single_data_t csd = {
-			.func = cpuid_smp_cpuid,
-			.info = &cmd,
-		};
+		call_single_data_t csd;
+
+		INIT_CSD(&csd, cpuid_smp_cpuid, &cmd);

 		cmd.regs.eax = pos;
 		cmd.regs.ecx = pos >> 32;

@@ -82,6 +82,10 @@
 #include <asm/hw_irq.h>
 #include <asm/stackprotector.h>

+#ifdef CONFIG_ACPI_CPPC_LIB
+#include <acpi/cppc_acpi.h>
+#endif
+
 /* representing HT siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
@@ -148,7 +152,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
 	*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
 }

-static void init_freq_invariance(bool secondary);
+static void init_freq_invariance(bool secondary, bool cppc_ready);

 /*
  * Report back to the Boot Processor during boot time or to the caller processor
@@ -186,7 +190,7 @@ static void smp_callin(void)
 	 */
 	set_cpu_sibling_map(raw_smp_processor_id());

-	init_freq_invariance(true);
+	init_freq_invariance(true, false);

 	/*
 	 * Get our bogomips.
@@ -1341,7 +1345,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	set_sched_topology(x86_topology);

 	set_cpu_sibling_map(0);
-	init_freq_invariance(false);
+	init_freq_invariance(false, false);
 	smp_sanity_check();

 	switch (apic_intr_mode) {
@@ -2028,6 +2032,48 @@ static bool intel_set_max_freq_ratio(void)
 	return true;
 }

+#ifdef CONFIG_ACPI_CPPC_LIB
+static bool amd_set_max_freq_ratio(void)
+{
+	struct cppc_perf_caps perf_caps;
+	u64 highest_perf, nominal_perf;
+	u64 perf_ratio;
+	int rc;
+
+	rc = cppc_get_perf_caps(0, &perf_caps);
+	if (rc) {
+		pr_debug("Could not retrieve perf counters (%d)\n", rc);
+		return false;
+	}
+
+	highest_perf = perf_caps.highest_perf;
+	nominal_perf = perf_caps.nominal_perf;
+
+	if (!highest_perf || !nominal_perf) {
+		pr_debug("Could not retrieve highest or nominal performance\n");
+		return false;
+	}
+
+	perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
+	/* midpoint between max_boost and max_P */
+	perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
+	if (!perf_ratio) {
+		pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
+		return false;
+	}
+
+	arch_turbo_freq_ratio = perf_ratio;
+	arch_set_max_freq_ratio(false);
+
+	return true;
+}
+#else
+static bool amd_set_max_freq_ratio(void)
+{
+	return false;
+}
+#endif
+
 static void init_counter_refs(void)
 {
 	u64 aperf, mperf;
@@ -2039,7 +2085,7 @@ static void init_counter_refs(void)
 	this_cpu_write(arch_prev_mperf, mperf);
 }

-static void init_freq_invariance(bool secondary)
+static void init_freq_invariance(bool secondary, bool cppc_ready)
 {
 	bool ret = false;

@@ -2055,15 +2101,38 @@ static void init_freq_invariance(bool secondary)

 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
 		ret = intel_set_max_freq_ratio();
+	else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+		if (!cppc_ready) {
+			return;
+		}
+		ret = amd_set_max_freq_ratio();
+	}

 	if (ret) {
 		init_counter_refs();
 		static_branch_enable(&arch_scale_freq_key);
+		pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
 	} else {
 		pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
 	}
 }

+#ifdef CONFIG_ACPI_CPPC_LIB
+static DEFINE_MUTEX(freq_invariance_lock);
+
+void init_freq_invariance_cppc(void)
+{
+	static bool secondary;
+
+	mutex_lock(&freq_invariance_lock);
+
+	init_freq_invariance(secondary, true);
+	secondary = true;
+
+	mutex_unlock(&freq_invariance_lock);
+}
+#endif
+
 static void disable_freq_invariance_workfn(struct work_struct *work)
 {
 	static_branch_disable(&arch_scale_freq_key);
@@ -2113,7 +2182,7 @@ void arch_scale_freq_tick(void)
 	schedule_work(&disable_freq_invariance_work);
 }
 #else
-static inline void init_freq_invariance(bool secondary)
+static inline void init_freq_invariance(bool secondary, bool cppc_ready)
 {
 }
 #endif /* CONFIG_X86_64 */

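To make the "midpoint of max_boost and max_P" computation in amd_set_max_freq_ratio() concrete with made-up numbers: if CPPC reports highest_perf = 300 and nominal_perf = 200, then perf_ratio = 300 * 1024 / 200 = 1536, and the midpoint with SCHED_CAPACITY_SCALE (1024) is (1536 + 1024) / 2 = 1280. In other words the frequency-invariance code then assumes freq_max is roughly 1.25 times freq_base rather than the full 1.5 boost ratio, which is exactly the ratio the new pr_info() line prints (times 1024).
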
@@ -169,12 +169,11 @@ static void __wrmsr_safe_on_cpu(void *info)
 int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
 {
 	struct msr_info_completion rv;
-	call_single_data_t csd = {
-		.func	= __rdmsr_safe_on_cpu,
-		.info	= &rv,
-	};
+	call_single_data_t csd;
 	int err;

+	INIT_CSD(&csd, __rdmsr_safe_on_cpu, &rv);
+
 	memset(&rv, 0, sizeof(rv));
 	init_completion(&rv.done);
 	rv.msr.msr_no = msr_no;

@@ -671,9 +671,7 @@ bool blk_mq_complete_request_remote(struct request *rq)
 		return false;

 	if (blk_mq_complete_need_ipi(rq)) {
-		rq->csd.func = __blk_mq_complete_request_remote;
-		rq->csd.info = rq;
-		rq->csd.flags = 0;
+		INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
 		smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
 	} else {
 		if (rq->q->nr_hw_queues > 1)

@@ -39,6 +39,7 @@
 #include <linux/ktime.h>
 #include <linux/rwsem.h>
 #include <linux/wait.h>
+#include <linux/topology.h>

 #include <acpi/cppc_acpi.h>

@@ -688,6 +689,10 @@ static bool is_cppc_supported(int revision, int num_ent)
  *	}
  */

+#ifndef init_freq_invariance_cppc
+static inline void init_freq_invariance_cppc(void) { }
+#endif
+
 /**
  * acpi_cppc_processor_probe - Search for per CPU _CPC objects.
  * @pr: Ptr to acpi_processor containing this CPU's logical ID.
@@ -850,6 +855,8 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr)
 		goto out_free;
 	}

+	init_freq_invariance_cppc();
+
 	kfree(output.pointer);
 	return 0;

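The x86 and ACPI hunks above cooperate through a common "define the symbol to itself" idiom: the architecture header declares init_freq_invariance_cppc() and #defines its own name, so that generic code can probe for the override with #ifndef and otherwise fall back to an empty stub. A minimal sketch of the pattern, with hypothetical names:

/* arch-specific header (hypothetical) */
void arch_do_hook(void);
#define arch_do_hook arch_do_hook	/* advertise that this arch implements it */

/* generic code (hypothetical) */
#ifndef arch_do_hook
static inline void arch_do_hook(void) { }	/* default: no-op */
#endif

void generic_init(void)
{
	arch_do_hook();		/* arch hook where present, no-op elsewhere */
}
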
@@ -674,8 +674,7 @@ int cpuidle_coupled_register_device(struct cpuidle_device *dev)
 	coupled->refcnt++;

 	csd = &per_cpu(cpuidle_coupled_poke_cb, dev->cpu);
-	csd->func = cpuidle_coupled_handle_poke;
-	csd->info = (void *)(unsigned long)dev->cpu;
+	INIT_CSD(csd, cpuidle_coupled_handle_poke, (void *)(unsigned long)dev->cpu);

 	return 0;
 }

@@ -197,7 +197,7 @@ __notify_execute_cb(struct i915_request *rq, bool (*fn)(struct irq_work *wrk))

 	llist_for_each_entry_safe(cb, cn,
 				  llist_del_all(&rq->execute_cb),
-				  work.llnode)
+				  work.node.llist)
 		fn(&cb->work);
 }

@@ -460,7 +460,7 @@ __await_execution(struct i915_request *rq,
 	 * callback first, then checking the ACTIVE bit, we serialise with
 	 * the completed/retired request.
 	 */
-	if (llist_add(&cb->work.llnode, &signal->execute_cb)) {
+	if (llist_add(&cb->work.node.llist, &signal->execute_cb)) {
 		if (i915_request_is_active(signal) ||
 		    __request_in_flight(signal))
 			__notify_execute_cb_imm(signal);

@@ -729,13 +729,8 @@ static void liquidio_napi_drv_callback(void *arg)
 	    droq->cpu_id == this_cpu) {
 		napi_schedule_irqoff(&droq->napi);
 	} else {
-		call_single_data_t *csd = &droq->csd;
-
-		csd->func = napi_schedule_wrapper;
-		csd->info = &droq->napi;
-		csd->flags = 0;
-
-		smp_call_function_single_async(droq->cpu_id, csd);
+		INIT_CSD(&droq->csd, napi_schedule_wrapper, &droq->napi);
+		smp_call_function_single_async(droq->cpu_id, &droq->csd);
 	}
 }

@@ -383,9 +383,7 @@ static inline void task_context_switch_counts(struct seq_file *m,
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 {
 	seq_printf(m, "Cpus_allowed:\t%*pb\n",
-		   cpumask_pr_args(task->cpus_ptr));
+		   cpumask_pr_args(&task->cpus_mask));
 	seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
-		   cpumask_pr_args(task->cpus_ptr));
+		   cpumask_pr_args(&task->cpus_mask));
 }

 static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)

@@ -152,6 +152,7 @@ enum cpuhp_state {
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
 	CPUHP_AP_ONLINE_IDLE,
+	CPUHP_AP_SCHED_WAIT_EMPTY,
 	CPUHP_AP_SMPBOOT_THREADS,
 	CPUHP_AP_X86_VDSO_VMA_ONLINE,
 	CPUHP_AP_IRQ_AFFINITY_ONLINE,

@@ -199,6 +199,11 @@ static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
 	return cpumask_next_and(-1, src1p, src2p);
 }

+static inline int cpumask_any_distribute(const struct cpumask *srcp)
+{
+	return cpumask_first(srcp);
+}
+
 #define for_each_cpu(cpu, mask)			\
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
 #define for_each_cpu_not(cpu, mask)		\
@@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
 unsigned int cpumask_local_spread(unsigned int i, int node);
 int cpumask_any_and_distribute(const struct cpumask *src1p,
 			       const struct cpumask *src2p);
+int cpumask_any_distribute(const struct cpumask *srcp);

 /**
  * for_each_cpu - iterate over every cpu in a mask

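A small hypothetical helper showing how the new cpumask_any_distribute() is meant to be used; on UP builds the inline stub above simply returns the first set bit, while the SMP implementation spreads successive picks across the mask:

/* Pick a destination CPU from @allowed, spreading choices over the mask
 * instead of always returning the lowest-numbered CPU. Hypothetical helper. */
static int pick_dest_cpu(const struct cpumask *allowed)
{
	int cpu = cpumask_any_distribute(allowed);

	return cpu < nr_cpu_ids ? cpu : -1;	/* nr_cpu_ids means "none" */
}
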
@@ -14,28 +14,37 @@
  */

 struct irq_work {
-	union {
-		struct __call_single_node node;
-		struct {
-			struct llist_node llnode;
-			atomic_t flags;
-		};
-	};
+	struct __call_single_node node;
 	void (*func)(struct irq_work *);
 };

+#define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){	\
+	.node = { .u_flags = (_flags), },			\
+	.func = (_func),					\
+}
+
+#define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0)
+#define IRQ_WORK_INIT_LAZY(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_LAZY)
+#define IRQ_WORK_INIT_HARD(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_HARD_IRQ)
+
+#define DEFINE_IRQ_WORK(name, _f)				\
+	struct irq_work name = IRQ_WORK_INIT(_f)
+
 static inline
 void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
 {
-	atomic_set(&work->flags, 0);
-	work->func = func;
+	*work = IRQ_WORK_INIT(func);
 }

-#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = {	\
-		.flags = ATOMIC_INIT(0),			\
-		.func  = (_f)					\
+static inline bool irq_work_is_pending(struct irq_work *work)
+{
+	return atomic_read(&work->node.a_flags) & IRQ_WORK_PENDING;
 }

+static inline bool irq_work_is_busy(struct irq_work *work)
+{
+	return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY;
+}

 bool irq_work_queue(struct irq_work *work);
 bool irq_work_queue_on(struct irq_work *work, int cpu);

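A hedged usage sketch of the new irq_work initializers and state helpers; the callback and trigger function are hypothetical, only IRQ_WORK_INIT_HARD(), irq_work_is_busy() and irq_work_queue() come from the header above:

static void example_irq_work_fn(struct irq_work *work)
{
	/* Runs from irq_work context; IRQ_WORK_HARD_IRQ keeps it in hard irq. */
	pr_info("irq_work fired\n");
}

/* Statically initialized, no init_irq_work() call needed at runtime. */
static struct irq_work example_work = IRQ_WORK_INIT_HARD(example_irq_work_fn);

static void example_trigger(void)
{
	if (!irq_work_is_busy(&example_work))
		irq_work_queue(&example_work);
}
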
@@ -107,14 +107,14 @@ do {						\
 		  current->irq_config = 0;			\
 	  } while (0)

-# define lockdep_irq_work_enter(__work)					\
+# define lockdep_irq_work_enter(_flags)					\
 	  do {								\
-		  if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\
+		  if (!((_flags) & IRQ_WORK_HARD_IRQ))			\
 			current->irq_config = 1;			\
 	  } while (0)
-# define lockdep_irq_work_exit(__work)					\
+# define lockdep_irq_work_exit(_flags)					\
 	  do {								\
-		  if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\
+		  if (!((_flags) & IRQ_WORK_HARD_IRQ))			\
 			current->irq_config = 0;			\
 	  } while (0)

@@ -204,6 +204,7 @@ extern int _cond_resched(void);
 extern void ___might_sleep(const char *file, int line, int preempt_offset);
 extern void __might_sleep(const char *file, int line, int preempt_offset);
 extern void __cant_sleep(const char *file, int line, int preempt_offset);
+extern void __cant_migrate(const char *file, int line);

 /**
  * might_sleep - annotation for functions that can sleep
@@ -227,6 +228,18 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
 # define cant_sleep() \
 	do { __cant_sleep(__FILE__, __LINE__, 0); } while (0)
 # define sched_annotate_sleep()	(current->task_state_change = 0)
+
+/**
+ * cant_migrate - annotation for functions that cannot migrate
+ *
+ * Will print a stack trace if executed in code which is migratable
+ */
+# define cant_migrate()							\
+	do {								\
+		if (IS_ENABLED(CONFIG_SMP))				\
+			__cant_migrate(__FILE__, __LINE__);		\
+	} while (0)
+
 /**
  * non_block_start - annotate the start of section where sleeping is prohibited
  *
@@ -251,6 +264,7 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
 				   int preempt_offset) { }
 # define might_sleep() do { might_resched(); } while (0)
 # define cant_sleep() do { } while (0)
+# define cant_migrate()		do { } while (0)
 # define sched_annotate_sleep() do { } while (0)
 # define non_block_start() do { } while (0)
 # define non_block_end() do { } while (0)
@@ -258,13 +272,6 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);

 #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)

-#ifndef CONFIG_PREEMPT_RT
-# define cant_migrate()		cant_sleep()
-#else
-  /* Placeholder for now */
-# define cant_migrate()		do { } while (0)
-#endif
-
 /**
  * abs - return absolute value of an argument
  * @x: the value.  If it is unsigned type, it is converted to signed type first.

@@ -322,34 +322,71 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,

 #endif

-/**
- * migrate_disable - Prevent migration of the current task
- *
- * Maps to preempt_disable() which also disables preemption. Use
- * migrate_disable() to annotate that the intent is to prevent migration,
- * but not necessarily preemption.
- *
- * Can be invoked nested like preempt_disable() and needs the corresponding
- * number of migrate_enable() invocations.
- */
-static __always_inline void migrate_disable(void)
-{
-	preempt_disable();
-}
+#ifdef CONFIG_SMP

-/**
- * migrate_enable - Allow migration of the current task
+/*
+ * Migrate-Disable and why it is undesired.
 *
- * Counterpart to migrate_disable().
+ * When a preempted task becomes elegible to run under the ideal model (IOW it
+ * becomes one of the M highest priority tasks), it might still have to wait
+ * for the preemptee's migrate_disable() section to complete. Thereby suffering
+ * a reduction in bandwidth in the exact duration of the migrate_disable()
+ * section.
 *
- * As migrate_disable() can be invoked nested, only the outermost invocation
- * reenables migration.
+ * Per this argument, the change from preempt_disable() to migrate_disable()
+ * gets us:
+ *
+ * - a higher priority tasks gains reduced wake-up latency; with preempt_disable()
+ *   it would have had to wait for the lower priority task.
+ *
+ * - a lower priority tasks; which under preempt_disable() could've instantly
+ *   migrated away when another CPU becomes available, is now constrained
+ *   by the ability to push the higher priority task away, which might itself be
+ *   in a migrate_disable() section, reducing it's available bandwidth.
+ *
+ * IOW it trades latency / moves the interference term, but it stays in the
+ * system, and as long as it remains unbounded, the system is not fully
+ * deterministic.
+ *
+ *
+ * The reason we have it anyway.
+ *
+ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
+ * number of primitives into becoming preemptible, they would also allow
+ * migration. This turns out to break a bunch of per-cpu usage. To this end,
+ * all these primitives employ migirate_disable() to restore this implicit
+ * assumption.
+ *
+ * This is a 'temporary' work-around at best. The correct solution is getting
+ * rid of the above assumptions and reworking the code to employ explicit
+ * per-cpu locking or short preempt-disable regions.
+ *
+ * The end goal must be to get rid of migrate_disable(), alternatively we need
+ * a schedulability theory that does not depend on abritrary migration.
+ *
+ *
+ * Notes on the implementation.
+ *
+ * The implementation is particularly tricky since existing code patterns
+ * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
+ * This means that it cannot use cpus_read_lock() to serialize against hotplug,
+ * nor can it easily migrate itself into a pending affinity mask change on
+ * migrate_enable().
+ *
+ *
+ * Note: even non-work-conserving schedulers like semi-partitioned depends on
+ *       migration, so migrate_disable() is not only a problem for
+ *       work-conserving schedulers.
 *
- * Currently mapped to preempt_enable().
 */
-static __always_inline void migrate_enable(void)
-{
-	preempt_enable();
-}
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+
+#else
+
+static inline void migrate_disable(void) { }
+static inline void migrate_enable(void) { }
+
+#endif /* CONFIG_SMP */

 #endif /* __LINUX_PREEMPT_H */

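For illustration (not taken from this merge), a hedged sketch of the per-CPU pattern the comment above describes: migrate_disable() keeps this_cpu_ptr() stable without disabling preemption, the data itself stays protected by its own lock, and cant_migrate() from the kernel.h hunk documents the requirement:

struct example_pcpu {
	spinlock_t lock;
	u64 count;
};

static DEFINE_PER_CPU(struct example_pcpu, example_pcpu);	/* hypothetical */

static void example_bump(void)
{
	struct example_pcpu *p;

	migrate_disable();			/* pin to this CPU, stay preemptible */
	cant_migrate();				/* debug assertion */
	p = this_cpu_ptr(&example_pcpu);
	spin_lock(&p->lock);			/* data protected by its own lock */
	p->count++;
	spin_unlock(&p->lock);
	migrate_enable();
}
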
@@ -723,6 +723,11 @@ struct task_struct {
 	int				nr_cpus_allowed;
 	const cpumask_t			*cpus_ptr;
 	cpumask_t			cpus_mask;
+	void				*migration_pending;
+#ifdef CONFIG_SMP
+	unsigned short			migration_disabled;
+#endif
+	unsigned short			migration_flags;

 #ifdef CONFIG_PREEMPT_RCU
 	int				rcu_read_lock_nesting;

@@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu);
 extern int sched_cpu_deactivate(unsigned int cpu);

 #ifdef CONFIG_HOTPLUG_CPU
+extern int sched_cpu_wait_empty(unsigned int cpu);
 extern int sched_cpu_dying(unsigned int cpu);
 #else
+# define sched_cpu_wait_empty	NULL
 # define sched_cpu_dying	NULL
 #endif

@@ -347,6 +347,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)

 extern void membarrier_exec_mmap(struct mm_struct *mm);

+extern void membarrier_update_current_mm(struct mm_struct *next_mm);
+
 #else
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
 static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
@@ -361,6 +363,9 @@ static inline void membarrier_exec_mmap(struct mm_struct *mm)
 static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
 {
 }
+static inline void membarrier_update_current_mm(struct mm_struct *next_mm)
+{
+}
 #endif

 #endif /* _LINUX_SCHED_MM_H */

@@ -225,6 +225,14 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)

 #endif	/* !CONFIG_SMP */

+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+extern void rebuild_sched_domains_energy(void);
+#else
+static inline void rebuild_sched_domains_energy(void)
+{
+}
+#endif
+
 #ifndef arch_scale_cpu_capacity
 /**
  * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU.

@@ -21,24 +21,23 @@ typedef bool (*smp_cond_func_t)(int cpu, void *info);
  * structure shares (partial) layout with struct irq_work
  */
 struct __call_single_data {
-	union {
-		struct __call_single_node node;
-		struct {
-			struct llist_node llist;
-			unsigned int flags;
-#ifdef CONFIG_64BIT
-			u16 src, dst;
-#endif
-		};
-	};
+	struct __call_single_node node;
 	smp_call_func_t func;
 	void *info;
 };

+#define CSD_INIT(_func, _info) \
+	(struct __call_single_data){ .func = (_func), .info = (_info), }
+
 /* Use __aligned() to avoid to use 2 cache lines for 1 csd */
 typedef struct __call_single_data call_single_data_t
 	__aligned(sizeof(struct __call_single_data));

+#define INIT_CSD(_csd, _func, _info)		\
+do {						\
+	*(_csd) = CSD_INIT((_func), (_info));	\
+} while (0)
+
 /*
  * Enqueue a llist_node on the call_single_queue; be very careful, read
  * flush_smp_call_function_queue() in detail.

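A hedged sketch of the two initialization styles the conversions in this series move to; the callback and helper are hypothetical, while CSD_INIT()/INIT_CSD() and smp_call_function_single_async() are the interfaces defined above:

static void example_csd_func(void *info)
{
	/* Runs on the target CPU from the call-function IPI. */
}

/* Static per-CPU initialization, as in the kgdb and MIPS conversions. */
static DEFINE_PER_CPU(call_single_data_t, example_csd) =
	CSD_INIT(example_csd_func, NULL);

/* Runtime (re)initialization, as in the cpuid/MSR/blk-mq conversions. */
static int example_kick_cpu(int cpu, void *info)
{
	call_single_data_t *csd = per_cpu_ptr(&example_csd, cpu);

	INIT_CSD(csd, example_csd_func, info);
	return smp_call_function_single_async(cpu, csd);
}
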
@@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg);
 struct cpu_stop_work {
 	struct list_head	list;		/* cpu_stopper->works */
 	cpu_stop_fn_t		fn;
+	unsigned long		caller;
 	void			*arg;
 	struct cpu_stop_done	*done;
 };
@@ -36,6 +37,8 @@ void stop_machine_park(int cpu);
 void stop_machine_unpark(int cpu);
 void stop_machine_yield(const struct cpumask *cpumask);

+extern void print_stop_info(const char *log_lvl, struct task_struct *task);
+
 #else	/* CONFIG_SMP */

 #include <linux/workqueue.h>
@@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu,
 	return false;
 }

+static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { }
+
 #endif	/* CONFIG_SMP */

 /*

@@ -96,6 +96,8 @@ struct sched_param {
  * on a CPU with a capacity big enough to fit the specified value.
  * A task with a max utilization value smaller than 1024 is more likely
  * scheduled on a CPU with no more capacity than the specified value.
+ *
+ * A task utilization boundary can be reset by setting the attribute to -1.
  */
 struct sched_attr {
 	__u32 size;

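A hedged user-space sketch of the reset described by the new comment; the sched_attr fields and SCHED_FLAG_* names are the existing uclamp ABI, while the wrapper itself and the choice of flags are illustrative only:

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/sched.h>		/* SCHED_FLAG_* */
#include <linux/sched/types.h>		/* struct sched_attr */

/* Reset the minimum utilization clamp of @pid back to the system default. */
static int reset_uclamp_min(pid_t pid)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_flags = SCHED_FLAG_KEEP_ALL | SCHED_FLAG_UTIL_CLAMP_MIN;
	attr.sched_util_min = (uint32_t)-1;	/* -1 == reset, per the comment above */

	return syscall(SYS_sched_setattr, pid, &attr, 0);
}
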
@@ -298,7 +298,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 	if (irqs_disabled()) {
 		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
 			work = this_cpu_ptr(&up_read_work);
-			if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) {
+			if (irq_work_is_busy(&work->irq_work)) {
 				/* cannot queue more up_read, fallback */
 				irq_work_busy = true;
 			}

@@ -983,25 +983,48 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
  */
 static void rebuild_sched_domains_locked(void)
 {
+	struct cgroup_subsys_state *pos_css;
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
+	struct cpuset *cs;
 	int ndoms;

 	lockdep_assert_cpus_held();
 	percpu_rwsem_assert_held(&cpuset_rwsem);

 	/*
-	 * We have raced with CPU hotplug. Don't do anything to avoid
+	 * If we have raced with CPU hotplug, return early to avoid
 	 * passing doms with offlined cpu to partition_sched_domains().
-	 * Anyways, hotplug work item will rebuild sched domains.
+	 * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
+	 *
+	 * With no CPUs in any subpartitions, top_cpuset's effective CPUs
+	 * should be the same as the active CPUs, so checking only top_cpuset
+	 * is enough to detect racing CPU offlines.
 	 */
 	if (!top_cpuset.nr_subparts_cpus &&
 	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
 		return;

-	if (top_cpuset.nr_subparts_cpus &&
-	   !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
-		return;
+	/*
+	 * With subpartition CPUs, however, the effective CPUs of a partition
+	 * root should be only a subset of the active CPUs.  Since a CPU in any
+	 * partition root could be offlined, all must be checked.
+	 */
+	if (top_cpuset.nr_subparts_cpus) {
+		rcu_read_lock();
+		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+			if (!is_partition_root(cs)) {
+				pos_css = css_rightmost_descendant(pos_css);
+				continue;
+			}
+			if (!cpumask_subset(cs->effective_cpus,
+					    cpu_active_mask)) {
+				rcu_read_unlock();
+				return;
+			}
+		}
+		rcu_read_unlock();
+	}

 	/* Generate domain masks and attrs */
 	ndoms = generate_sched_domains(&doms, &attr);

@@ -1606,7 +1606,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 		.name			= "ap:online",
 	},
 	/*
-	 * Handled on controll processor until the plugged processor manages
+	 * Handled on control processor until the plugged processor manages
 	 * this itself.
 	 */
 	[CPUHP_TEARDOWN_CPU] = {
@@ -1615,6 +1615,13 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 		.teardown.single	= takedown_cpu,
 		.cant_stop		= true,
 	},
+
+	[CPUHP_AP_SCHED_WAIT_EMPTY] = {
+		.name			= "sched:waitempty",
+		.startup.single		= NULL,
+		.teardown.single	= sched_cpu_wait_empty,
+	},
+
 	/* Handle smpboot threads park/unpark */
 	[CPUHP_AP_SMPBOOT_THREADS] = {
 		.name			= "smpboot/threads:online",

@@ -225,8 +225,6 @@ NOKPROBE_SYMBOL(kgdb_skipexception);
  * Default (weak) implementation for kgdb_roundup_cpus
  */

-static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd);
-
 void __weak kgdb_call_nmi_hook(void *ignored)
 {
 	/*
@@ -241,6 +239,9 @@ void __weak kgdb_call_nmi_hook(void *ignored)
 }
 NOKPROBE_SYMBOL(kgdb_call_nmi_hook);

+static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) =
+	CSD_INIT(kgdb_call_nmi_hook, NULL);
+
 void __weak kgdb_roundup_cpus(void)
 {
 	call_single_data_t *csd;
@@ -267,7 +268,6 @@ void __weak kgdb_roundup_cpus(void)
 			continue;
 		kgdb_info[cpu].rounding_up = true;

-		csd->func = kgdb_call_nmi_hook;
 		ret = smp_call_function_single_async(cpu, csd);
 		if (ret)
 			kgdb_info[cpu].rounding_up = false;

@@ -478,10 +478,24 @@ static void exit_mm(void)
 	BUG_ON(mm != current->active_mm);
 	/* more a memory barrier than a real lock */
 	task_lock(current);
+	/*
+	 * When a thread stops operating on an address space, the loop
+	 * in membarrier_private_expedited() may not observe that
+	 * tsk->mm, and the loop in membarrier_global_expedited() may
+	 * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
+	 * rq->membarrier_state, so those would not issue an IPI.
+	 * Membarrier requires a memory barrier after accessing
+	 * user-space memory, before clearing tsk->mm or the
+	 * rq->membarrier_state.
+	 */
+	smp_mb__after_spinlock();
+	local_irq_disable();
 	current->mm = NULL;
-	mmap_read_unlock(mm);
+	membarrier_update_current_mm(NULL);
 	enter_lazy_tlb(mm, current);
+	local_irq_enable();
 	task_unlock(current);
+	mmap_read_unlock(mm);
 	mm_update_next_owner(mm);
 	mmput(mm);
 	if (test_thread_flag(TIF_MEMDIE))

|  | @ -31,10 +31,10 @@ static bool irq_work_claim(struct irq_work *work) | ||||||
| { | { | ||||||
| 	int oflags; | 	int oflags; | ||||||
| 
 | 
 | ||||||
| 	oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags); | 	oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags); | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * If the work is already pending, no need to raise the IPI. | 	 * If the work is already pending, no need to raise the IPI. | ||||||
| 	 * The pairing atomic_fetch_andnot() in irq_work_run() makes sure | 	 * The pairing smp_mb() in irq_work_single() makes sure | ||||||
| 	 * everything we did before is visible. | 	 * everything we did before is visible. | ||||||
| 	 */ | 	 */ | ||||||
| 	if (oflags & IRQ_WORK_PENDING) | 	if (oflags & IRQ_WORK_PENDING) | ||||||
|  | @ -53,12 +53,12 @@ void __weak arch_irq_work_raise(void) | ||||||
| static void __irq_work_queue_local(struct irq_work *work) | static void __irq_work_queue_local(struct irq_work *work) | ||||||
| { | { | ||||||
| 	/* If the work is "lazy", handle it from next tick if any */ | 	/* If the work is "lazy", handle it from next tick if any */ | ||||||
| 	if (atomic_read(&work->flags) & IRQ_WORK_LAZY) { | 	if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) { | ||||||
| 		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && | 		if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) && | ||||||
| 		    tick_nohz_tick_stopped()) | 		    tick_nohz_tick_stopped()) | ||||||
| 			arch_irq_work_raise(); | 			arch_irq_work_raise(); | ||||||
| 	} else { | 	} else { | ||||||
| 		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) | 		if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list))) | ||||||
| 			arch_irq_work_raise(); | 			arch_irq_work_raise(); | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
|  | @ -102,7 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) | ||||||
| 	if (cpu != smp_processor_id()) { | 	if (cpu != smp_processor_id()) { | ||||||
| 		/* Arch remote IPI send/receive backend aren't NMI safe */ | 		/* Arch remote IPI send/receive backend aren't NMI safe */ | ||||||
| 		WARN_ON_ONCE(in_nmi()); | 		WARN_ON_ONCE(in_nmi()); | ||||||
| 		__smp_call_single_queue(cpu, &work->llnode); | 		__smp_call_single_queue(cpu, &work->node.llist); | ||||||
| 	} else { | 	} else { | ||||||
| 		__irq_work_queue_local(work); | 		__irq_work_queue_local(work); | ||||||
| 	} | 	} | ||||||
|  | @ -136,23 +136,28 @@ void irq_work_single(void *arg) | ||||||
| 	int flags; | 	int flags; | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Clear the PENDING bit, after this point the @work | 	 * Clear the PENDING bit, after this point the @work can be re-used. | ||||||
| 	 * can be re-used. | 	 * The PENDING bit acts as a lock, and we own it, so we can clear it | ||||||
| 	 * Make it immediately visible so that other CPUs trying | 	 * without atomic ops. | ||||||
| 	 * to claim that work don't rely on us to handle their data |  | ||||||
| 	 * while we are in the middle of the func. |  | ||||||
| 	 */ |  | ||||||
| 	flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); |  | ||||||
| 
 |  | ||||||
| 	lockdep_irq_work_enter(work); |  | ||||||
| 	work->func(work); |  | ||||||
| 	lockdep_irq_work_exit(work); |  | ||||||
| 	/*
 |  | ||||||
| 	 * Clear the BUSY bit and return to the free state if |  | ||||||
| 	 * no-one else claimed it meanwhile. |  | ||||||
| 	 */ | 	 */ | ||||||
|  | 	flags = atomic_read(&work->node.a_flags); | ||||||
| 	flags &= ~IRQ_WORK_PENDING; | 	flags &= ~IRQ_WORK_PENDING; | ||||||
| 	(void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); | 	atomic_set(&work->node.a_flags, flags); | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * See irq_work_claim(). | ||||||
|  | 	 */ | ||||||
|  | 	smp_mb(); | ||||||
|  | 
 | ||||||
|  | 	lockdep_irq_work_enter(flags); | ||||||
|  | 	work->func(work); | ||||||
|  | 	lockdep_irq_work_exit(flags); | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * Clear the BUSY bit, if set, and return to the free state if no-one | ||||||
|  | 	 * else claimed it meanwhile. | ||||||
|  | 	 */ | ||||||
|  | 	(void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static void irq_work_run_list(struct llist_head *list) | static void irq_work_run_list(struct llist_head *list) | ||||||
|  | @ -166,7 +171,7 @@ static void irq_work_run_list(struct llist_head *list) | ||||||
| 		return; | 		return; | ||||||
| 
 | 
 | ||||||
| 	llnode = llist_del_all(list); | 	llnode = llist_del_all(list); | ||||||
| 	llist_for_each_entry_safe(work, tmp, llnode, llnode) | 	llist_for_each_entry_safe(work, tmp, llnode, node.llist) | ||||||
| 		irq_work_single(work); | 		irq_work_single(work); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -198,7 +203,7 @@ void irq_work_sync(struct irq_work *work) | ||||||
| { | { | ||||||
| 	lockdep_assert_irqs_enabled(); | 	lockdep_assert_irqs_enabled(); | ||||||
| 
 | 
 | ||||||
| 	while (atomic_read(&work->flags) & IRQ_WORK_BUSY) | 	while (irq_work_is_busy(work)) | ||||||
| 		cpu_relax(); | 		cpu_relax(); | ||||||
| } | } | ||||||
| EXPORT_SYMBOL_GPL(irq_work_sync); | EXPORT_SYMBOL_GPL(irq_work_sync); | ||||||
|  |  | ||||||
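The rewritten irq_work_single() above leans on the PENDING bit acting as a lock owned by the callback: it can be dropped with a plain read/modify/write, and the following smp_mb() pairs with the atomic_fetch_or() in irq_work_claim(). The sketch below is a user-space model of that claim/run protocol using C11 atomics; the names (WORK_PENDING, claim(), run_single()) are invented for illustration and are not the kernel API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define WORK_PENDING (1u << 0)
#define WORK_BUSY    (1u << 1)

struct work {
	atomic_uint flags;
	void (*func)(struct work *);
};

/* Returns true if we won the right to queue the work (cf. irq_work_claim()). */
static bool claim(struct work *w)
{
	unsigned int oflags;

	/* Full barrier: everything written before the claim is visible
	 * to the CPU that later runs the work. */
	oflags = atomic_fetch_or(&w->flags, WORK_PENDING | WORK_BUSY);
	return !(oflags & WORK_PENDING);
}

/* Runs one work item (cf. irq_work_single()). */
static void run_single(struct work *w)
{
	unsigned int flags;

	/* We own PENDING, so a plain load + store is enough to drop it. */
	flags = atomic_load_explicit(&w->flags, memory_order_relaxed);
	flags &= ~WORK_PENDING;
	atomic_store_explicit(&w->flags, flags, memory_order_relaxed);

	/* Pairs with the full barrier in claim(): the work can now be
	 * re-claimed, and the claimer's writes are visible to func(). */
	atomic_thread_fence(memory_order_seq_cst);

	w->func(w);

	/* Drop BUSY only if nobody re-claimed the work meanwhile. */
	atomic_compare_exchange_strong(&w->flags, &flags, flags & ~WORK_BUSY);
}

static void hello(struct work *w) { (void)w; puts("work ran"); }

int main(void)
{
	struct work w = { .func = hello };

	if (claim(&w))
		run_single(&w);
	return 0;
}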
|  | @ -1249,6 +1249,7 @@ void kthread_use_mm(struct mm_struct *mm) | ||||||
| 		tsk->active_mm = mm; | 		tsk->active_mm = mm; | ||||||
| 	} | 	} | ||||||
| 	tsk->mm = mm; | 	tsk->mm = mm; | ||||||
|  | 	membarrier_update_current_mm(mm); | ||||||
| 	switch_mm_irqs_off(active_mm, mm, tsk); | 	switch_mm_irqs_off(active_mm, mm, tsk); | ||||||
| 	local_irq_enable(); | 	local_irq_enable(); | ||||||
| 	task_unlock(tsk); | 	task_unlock(tsk); | ||||||
|  | @ -1256,8 +1257,19 @@ void kthread_use_mm(struct mm_struct *mm) | ||||||
| 	finish_arch_post_lock_switch(); | 	finish_arch_post_lock_switch(); | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * When a kthread starts operating on an address space, the loop | ||||||
|  | 	 * in membarrier_{private,global}_expedited() may not observe | ||||||
|  | 	 * the newly set tsk->mm, and thus not issue an IPI. Membarrier requires | ||||||
|  | 	 * a memory barrier after storing to tsk->mm, before accessing | ||||||
|  | 	 * user-space memory. A full memory barrier for membarrier | ||||||
|  | 	 * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by | ||||||
|  | 	 * mmdrop(), or explicitly with smp_mb(). | ||||||
|  | 	 */ | ||||||
| 	if (active_mm != mm) | 	if (active_mm != mm) | ||||||
| 		mmdrop(active_mm); | 		mmdrop(active_mm); | ||||||
|  | 	else | ||||||
|  | 		smp_mb(); | ||||||
| 
 | 
 | ||||||
| 	to_kthread(tsk)->oldfs = force_uaccess_begin(); | 	to_kthread(tsk)->oldfs = force_uaccess_begin(); | ||||||
| } | } | ||||||
|  | @ -1277,9 +1289,18 @@ void kthread_unuse_mm(struct mm_struct *mm) | ||||||
| 	force_uaccess_end(to_kthread(tsk)->oldfs); | 	force_uaccess_end(to_kthread(tsk)->oldfs); | ||||||
| 
 | 
 | ||||||
| 	task_lock(tsk); | 	task_lock(tsk); | ||||||
|  | 	/*
 | ||||||
|  | 	 * When a kthread stops operating on an address space, the loop | ||||||
|  | 	 * in membarrier_{private,global}_expedited() may not observe | ||||||
|  | 	 * tsk->mm any longer, and thus not issue an IPI. Membarrier requires | ||||||
|  | 	 * a memory barrier after accessing user-space memory, before | ||||||
|  | 	 * clearing tsk->mm. | ||||||
|  | 	 */ | ||||||
|  | 	smp_mb__after_spinlock(); | ||||||
| 	sync_mm_rss(mm); | 	sync_mm_rss(mm); | ||||||
| 	local_irq_disable(); | 	local_irq_disable(); | ||||||
| 	tsk->mm = NULL; | 	tsk->mm = NULL; | ||||||
|  | 	membarrier_update_current_mm(NULL); | ||||||
| 	/* active_mm is still 'mm' */ | 	/* active_mm is still 'mm' */ | ||||||
| 	enter_lazy_tlb(mm, tsk); | 	enter_lazy_tlb(mm, tsk); | ||||||
| 	local_irq_enable(); | 	local_irq_enable(); | ||||||
|  |  | ||||||
|  | @ -3025,10 +3025,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) | ||||||
| 		wake_up_interruptible(&log_wait); | 		wake_up_interruptible(&log_wait); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { | static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = | ||||||
| 	.func = wake_up_klogd_work_func, | 	IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); | ||||||
| 	.flags = ATOMIC_INIT(IRQ_WORK_LAZY), |  | ||||||
| }; |  | ||||||
| 
 | 
 | ||||||
| void wake_up_klogd(void) | void wake_up_klogd(void) | ||||||
| { | { | ||||||
|  |  | ||||||
|  | @ -1322,8 +1322,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | ||||||
| 		if (IS_ENABLED(CONFIG_IRQ_WORK) && | 		if (IS_ENABLED(CONFIG_IRQ_WORK) && | ||||||
| 		    !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && | 		    !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && | ||||||
| 		    (rnp->ffmask & rdp->grpmask)) { | 		    (rnp->ffmask & rdp->grpmask)) { | ||||||
| 			init_irq_work(&rdp->rcu_iw, rcu_iw_handler); |  | ||||||
| 			atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ); |  | ||||||
| 			rdp->rcu_iw_pending = true; | 			rdp->rcu_iw_pending = true; | ||||||
| 			rdp->rcu_iw_gp_seq = rnp->gp_seq; | 			rdp->rcu_iw_gp_seq = rnp->gp_seq; | ||||||
| 			irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); | 			irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); | ||||||
|  | @ -4023,6 +4021,7 @@ int rcutree_prepare_cpu(unsigned int cpu) | ||||||
| 	rdp->cpu_no_qs.b.norm = true; | 	rdp->cpu_no_qs.b.norm = true; | ||||||
| 	rdp->core_needs_qs = false; | 	rdp->core_needs_qs = false; | ||||||
| 	rdp->rcu_iw_pending = false; | 	rdp->rcu_iw_pending = false; | ||||||
|  | 	rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler); | ||||||
| 	rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; | 	rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; | ||||||
| 	trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); | 	trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); | ||||||
| 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||||||
|  |  | ||||||
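Both the printk and RCU hunks above replace open-coded irq_work initialization with the static initializers introduced earlier in this series. Below is a minimal sketch of how a hypothetical module would use them; the demo_* names are invented, while IRQ_WORK_INIT_LAZY()/IRQ_WORK_INIT_HARD() are the macros added by the irq_work cleanup.

#include <linux/irq_work.h>
#include <linux/module.h>
#include <linux/printk.h>

static void demo_wakeup(struct irq_work *work)
{
	pr_info("irq_work ran\n");
}

/* Lazy: normally handled from the next tick; raises an IPI only if the
 * tick is stopped. */
static struct irq_work demo_lazy = IRQ_WORK_INIT_LAZY(demo_wakeup);

/* Hard: must run from hard interrupt context, even on PREEMPT_RT. */
static struct irq_work demo_hard = IRQ_WORK_INIT_HARD(demo_wakeup);

static int __init demo_init(void)
{
	irq_work_queue(&demo_lazy);
	irq_work_queue(&demo_hard);
	return 0;
}

static void __exit demo_exit(void)
{
	irq_work_sync(&demo_lazy);
	irq_work_sync(&demo_hard);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");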
kernel/sched/core.c (1140 changes): file diff suppressed because it is too large.
							|  | @ -120,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | ||||||
| 	const struct sched_dl_entity *dl_se = &p->dl; | 	const struct sched_dl_entity *dl_se = &p->dl; | ||||||
| 
 | 
 | ||||||
| 	if (later_mask && | 	if (later_mask && | ||||||
| 	    cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { | 	    cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) { | ||||||
| 		unsigned long cap, max_cap = 0; | 		unsigned long cap, max_cap = 0; | ||||||
| 		int cpu, max_cpu = -1; | 		int cpu, max_cpu = -1; | ||||||
| 
 | 
 | ||||||
|  | @ -151,7 +151,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | ||||||
| 
 | 
 | ||||||
| 		WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); | 		WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); | ||||||
| 
 | 
 | ||||||
| 		if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && | 		if (cpumask_test_cpu(best_cpu, &p->cpus_mask) && | ||||||
| 		    dl_time_before(dl_se->deadline, cp->elements[0].dl)) { | 		    dl_time_before(dl_se->deadline, cp->elements[0].dl)) { | ||||||
| 			if (later_mask) | 			if (later_mask) | ||||||
| 				cpumask_set_cpu(best_cpu, later_mask); | 				cpumask_set_cpu(best_cpu, later_mask); | ||||||
|  |  | ||||||
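The switch from p->cpus_ptr to p->cpus_mask here (and in the deadline.c/rt.c hunks further down) is fallout from making migrate_disable() generally available: a migrate-disabled task temporarily has cpus_ptr pointing at a single-CPU mask, so balancing decisions must look at the stable cpus_mask instead. As a rough sketch of the API itself (a trivial demo module, not code from this series):

#include <linux/module.h>
#include <linux/preempt.h>
#include <linux/printk.h>
#include <linux/smp.h>

static int __init demo_init(void)
{
	int cpu;

	migrate_disable();
	cpu = smp_processor_id();
	/*
	 * Preemption stays enabled inside this section: other tasks may run
	 * on this CPU, but the current task cannot migrate away, so 'cpu'
	 * stays accurate until migrate_enable().
	 */
	pr_info("pinned to CPU%d\n", cpu);
	migrate_enable();

	return 0;
}

static void __exit demo_exit(void) { }

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");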
|  | @ -899,16 +899,9 @@ struct cpufreq_governor *cpufreq_default_governor(void) | ||||||
| cpufreq_governor_init(schedutil_gov); | cpufreq_governor_init(schedutil_gov); | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_ENERGY_MODEL | #ifdef CONFIG_ENERGY_MODEL | ||||||
| extern bool sched_energy_update; |  | ||||||
| extern struct mutex sched_energy_mutex; |  | ||||||
| 
 |  | ||||||
| static void rebuild_sd_workfn(struct work_struct *work) | static void rebuild_sd_workfn(struct work_struct *work) | ||||||
| { | { | ||||||
| 	mutex_lock(&sched_energy_mutex); | 	rebuild_sched_domains_energy(); | ||||||
| 	sched_energy_update = true; |  | ||||||
| 	rebuild_sched_domains(); |  | ||||||
| 	sched_energy_update = false; |  | ||||||
| 	mutex_unlock(&sched_energy_mutex); |  | ||||||
| } | } | ||||||
| static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); | static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -11,7 +11,7 @@ | ||||||
|  *  This code tracks the priority of each CPU so that global migration |  *  This code tracks the priority of each CPU so that global migration | ||||||
|  *  decisions are easy to calculate.  Each CPU can be in a state as follows: |  *  decisions are easy to calculate.  Each CPU can be in a state as follows: | ||||||
|  * |  * | ||||||
|  *                 (INVALID), IDLE, NORMAL, RT1, ... RT99 |  *                 (INVALID), NORMAL, RT1, ... RT99, HIGHER | ||||||
|  * |  * | ||||||
|  *  going from the lowest priority to the highest.  CPUs in the INVALID state |  *  going from the lowest priority to the highest.  CPUs in the INVALID state | ||||||
|  *  are not eligible for routing.  The system maintains this state with |  *  are not eligible for routing.  The system maintains this state with | ||||||
|  | @ -19,24 +19,48 @@ | ||||||
|  *  in that class).  Therefore a typical application without affinity |  *  in that class).  Therefore a typical application without affinity | ||||||
|  *  restrictions can find a suitable CPU with O(1) complexity (e.g. two bit |  *  restrictions can find a suitable CPU with O(1) complexity (e.g. two bit | ||||||
|  *  searches).  For tasks with affinity restrictions, the algorithm has a |  *  searches).  For tasks with affinity restrictions, the algorithm has a | ||||||
|  *  worst case complexity of O(min(102, nr_domcpus)), though the scenario that |  *  worst case complexity of O(min(101, nr_domcpus)), though the scenario that | ||||||
|  *  yields the worst case search is fairly contrived. |  *  yields the worst case search is fairly contrived. | ||||||
|  */ |  */ | ||||||
| #include "sched.h" | #include "sched.h" | ||||||
| 
 | 
 | ||||||
| /* Convert between a 140 based task->prio, and our 102 based cpupri */ | /*
 | ||||||
|  |  * p->rt_priority   p->prio   newpri   cpupri | ||||||
|  |  * | ||||||
|  |  *				  -1       -1 (CPUPRI_INVALID) | ||||||
|  |  * | ||||||
|  |  *				  99        0 (CPUPRI_NORMAL) | ||||||
|  |  * | ||||||
|  |  *		1        98       98        1 | ||||||
|  |  *	      ... | ||||||
|  |  *	       49        50       50       49 | ||||||
|  |  *	       50        49       49       50 | ||||||
|  |  *	      ... | ||||||
|  |  *	       99         0        0       99 | ||||||
|  |  * | ||||||
|  |  *				 100	  100 (CPUPRI_HIGHER) | ||||||
|  |  */ | ||||||
| static int convert_prio(int prio) | static int convert_prio(int prio) | ||||||
| { | { | ||||||
| 	int cpupri; | 	int cpupri; | ||||||
| 
 | 
 | ||||||
| 	if (prio == CPUPRI_INVALID) | 	switch (prio) { | ||||||
| 		cpupri = CPUPRI_INVALID; | 	case CPUPRI_INVALID: | ||||||
| 	else if (prio == MAX_PRIO) | 		cpupri = CPUPRI_INVALID;	/* -1 */ | ||||||
| 		cpupri = CPUPRI_IDLE; | 		break; | ||||||
| 	else if (prio >= MAX_RT_PRIO) | 
 | ||||||
| 		cpupri = CPUPRI_NORMAL; | 	case 0 ... 98: | ||||||
| 	else | 		cpupri = MAX_RT_PRIO-1 - prio;	/* 1 ... 99 */ | ||||||
| 		cpupri = MAX_RT_PRIO - prio + 1; | 		break; | ||||||
|  | 
 | ||||||
|  | 	case MAX_RT_PRIO-1: | ||||||
|  | 		cpupri = CPUPRI_NORMAL;		/*  0 */ | ||||||
|  | 		break; | ||||||
|  | 
 | ||||||
|  | 	case MAX_RT_PRIO: | ||||||
|  | 		cpupri = CPUPRI_HIGHER;		/* 100 */ | ||||||
|  | 		break; | ||||||
|  | 	} | ||||||
| 
 | 
 | ||||||
| 	return cpupri; | 	return cpupri; | ||||||
| } | } | ||||||
|  | @ -73,11 +97,11 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, | ||||||
| 	if (skip) | 	if (skip) | ||||||
| 		return 0; | 		return 0; | ||||||
| 
 | 
 | ||||||
| 	if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) | 	if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids) | ||||||
| 		return 0; | 		return 0; | ||||||
| 
 | 
 | ||||||
| 	if (lowest_mask) { | 	if (lowest_mask) { | ||||||
| 		cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); | 		cpumask_and(lowest_mask, &p->cpus_mask, vec->mask); | ||||||
| 
 | 
 | ||||||
| 		/*
 | 		/*
 | ||||||
| 		 * We have to ensure that we have at least one bit | 		 * We have to ensure that we have at least one bit | ||||||
|  | @ -177,7 +201,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, | ||||||
|  * cpupri_set - update the CPU priority setting |  * cpupri_set - update the CPU priority setting | ||||||
|  * @cp: The cpupri context |  * @cp: The cpupri context | ||||||
|  * @cpu: The target CPU |  * @cpu: The target CPU | ||||||
|  * @newpri: The priority (INVALID-RT99) to assign to this CPU |  * @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU | ||||||
|  * |  * | ||||||
|  * Note: Assumes cpu_rq(cpu)->lock is locked |  * Note: Assumes cpu_rq(cpu)->lock is locked | ||||||
|  * |  * | ||||||
|  |  | ||||||
|  | @ -1,11 +1,11 @@ | ||||||
| /* SPDX-License-Identifier: GPL-2.0 */ | /* SPDX-License-Identifier: GPL-2.0 */ | ||||||
| 
 | 
 | ||||||
| #define CPUPRI_NR_PRIORITIES	(MAX_RT_PRIO + 2) | #define CPUPRI_NR_PRIORITIES	(MAX_RT_PRIO+1) | ||||||
| 
 | 
 | ||||||
| #define CPUPRI_INVALID		-1 | #define CPUPRI_INVALID		-1 | ||||||
| #define CPUPRI_IDLE		 0 | #define CPUPRI_NORMAL		 0 | ||||||
| #define CPUPRI_NORMAL		 1 | /* values 1-99 are for RT1-RT99 priorities */ | ||||||
| /* values 2-101 are RT priorities 0-99 */ | #define CPUPRI_HIGHER		100 | ||||||
| 
 | 
 | ||||||
| struct cpupri_vec { | struct cpupri_vec { | ||||||
| 	atomic_t		count; | 	atomic_t		count; | ||||||
|  |  | ||||||
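With CPUPRI_IDLE gone and CPUPRI_HIGHER added for deadline tasks, the prio-to-cpupri mapping now follows the table in the cpupri.c comment above. The stand-alone program below mirrors that mapping for illustration; the constants are copied from the patch, but this is a sketch, not the kernel's cpupri code (it uses the GCC case-range extension, as the kernel does).

#include <stdio.h>

#define MAX_RT_PRIO	100

#define CPUPRI_INVALID	-1
#define CPUPRI_NORMAL	0
/* values 1-99 are for RT1-RT99 priorities */
#define CPUPRI_HIGHER	100

static int convert_prio(int prio)
{
	switch (prio) {
	case CPUPRI_INVALID:			/* not eligible for routing */
		return CPUPRI_INVALID;		/* -1 */
	case 0 ... 98:				/* RT99 ... RT1 */
		return MAX_RT_PRIO - 1 - prio;	/* 99 ... 1 */
	case MAX_RT_PRIO - 1:			/* prio 99: non-RT tasks */
		return CPUPRI_NORMAL;		/*  0 */
	case MAX_RT_PRIO:			/* deadline tasks */
		return CPUPRI_HIGHER;		/* 100 */
	}
	return CPUPRI_INVALID;
}

int main(void)
{
	int prio;

	for (prio = -1; prio <= MAX_RT_PRIO; prio++)
		printf("newpri %4d -> cpupri %4d\n", prio, convert_prio(prio));
	return 0;
}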
|  | @ -119,6 +119,17 @@ static inline unsigned long dl_bw_capacity(int i) | ||||||
| 		return __dl_bw_capacity(i); | 		return __dl_bw_capacity(i); | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
|  | 
 | ||||||
|  | static inline bool dl_bw_visited(int cpu, u64 gen) | ||||||
|  | { | ||||||
|  | 	struct root_domain *rd = cpu_rq(cpu)->rd; | ||||||
|  | 
 | ||||||
|  | 	if (rd->visit_gen == gen) | ||||||
|  | 		return true; | ||||||
|  | 
 | ||||||
|  | 	rd->visit_gen = gen; | ||||||
|  | 	return false; | ||||||
|  | } | ||||||
| #else | #else | ||||||
| static inline struct dl_bw *dl_bw_of(int i) | static inline struct dl_bw *dl_bw_of(int i) | ||||||
| { | { | ||||||
|  | @ -134,6 +145,11 @@ static inline unsigned long dl_bw_capacity(int i) | ||||||
| { | { | ||||||
| 	return SCHED_CAPACITY_SCALE; | 	return SCHED_CAPACITY_SCALE; | ||||||
| } | } | ||||||
|  | 
 | ||||||
|  | static inline bool dl_bw_visited(int cpu, u64 gen) | ||||||
|  | { | ||||||
|  | 	return false; | ||||||
|  | } | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| static inline | static inline | ||||||
|  | @ -565,7 +581,7 @@ static int push_dl_task(struct rq *rq); | ||||||
| 
 | 
 | ||||||
| static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) | static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) | ||||||
| { | { | ||||||
| 	return dl_task(prev); | 	return rq->online && dl_task(prev); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static DEFINE_PER_CPU(struct callback_head, dl_push_head); | static DEFINE_PER_CPU(struct callback_head, dl_push_head); | ||||||
|  | @ -1397,6 +1413,8 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | ||||||
| 
 | 
 | ||||||
| 	if (dl_rq->earliest_dl.curr == 0 || | 	if (dl_rq->earliest_dl.curr == 0 || | ||||||
| 	    dl_time_before(deadline, dl_rq->earliest_dl.curr)) { | 	    dl_time_before(deadline, dl_rq->earliest_dl.curr)) { | ||||||
|  | 		if (dl_rq->earliest_dl.curr == 0) | ||||||
|  | 			cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER); | ||||||
| 		dl_rq->earliest_dl.curr = deadline; | 		dl_rq->earliest_dl.curr = deadline; | ||||||
| 		cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); | 		cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); | ||||||
| 	} | 	} | ||||||
|  | @ -1414,6 +1432,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | ||||||
| 		dl_rq->earliest_dl.curr = 0; | 		dl_rq->earliest_dl.curr = 0; | ||||||
| 		dl_rq->earliest_dl.next = 0; | 		dl_rq->earliest_dl.next = 0; | ||||||
| 		cpudl_clear(&rq->rd->cpudl, rq->cpu); | 		cpudl_clear(&rq->rd->cpudl, rq->cpu); | ||||||
|  | 		cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); | ||||||
| 	} else { | 	} else { | ||||||
| 		struct rb_node *leftmost = dl_rq->root.rb_leftmost; | 		struct rb_node *leftmost = dl_rq->root.rb_leftmost; | ||||||
| 		struct sched_dl_entity *entry; | 		struct sched_dl_entity *entry; | ||||||
|  | @ -1670,13 +1689,13 @@ static void yield_task_dl(struct rq *rq) | ||||||
| static int find_later_rq(struct task_struct *task); | static int find_later_rq(struct task_struct *task); | ||||||
| 
 | 
 | ||||||
| static int | static int | ||||||
| select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) | select_task_rq_dl(struct task_struct *p, int cpu, int flags) | ||||||
| { | { | ||||||
| 	struct task_struct *curr; | 	struct task_struct *curr; | ||||||
| 	bool select_rq; | 	bool select_rq; | ||||||
| 	struct rq *rq; | 	struct rq *rq; | ||||||
| 
 | 
 | ||||||
| 	if (sd_flag != SD_BALANCE_WAKE) | 	if (!(flags & WF_TTWU)) | ||||||
| 		goto out; | 		goto out; | ||||||
| 
 | 
 | ||||||
| 	rq = cpu_rq(cpu); | 	rq = cpu_rq(cpu); | ||||||
|  | @ -1918,7 +1937,7 @@ static void task_fork_dl(struct task_struct *p) | ||||||
| static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) | ||||||
| { | { | ||||||
| 	if (!task_running(rq, p) && | 	if (!task_running(rq, p) && | ||||||
| 	    cpumask_test_cpu(cpu, p->cpus_ptr)) | 	    cpumask_test_cpu(cpu, &p->cpus_mask)) | ||||||
| 		return 1; | 		return 1; | ||||||
| 	return 0; | 	return 0; | ||||||
| } | } | ||||||
|  | @ -2008,8 +2027,8 @@ static int find_later_rq(struct task_struct *task) | ||||||
| 				return this_cpu; | 				return this_cpu; | ||||||
| 			} | 			} | ||||||
| 
 | 
 | ||||||
| 			best_cpu = cpumask_first_and(later_mask, | 			best_cpu = cpumask_any_and_distribute(later_mask, | ||||||
| 							sched_domain_span(sd)); | 							      sched_domain_span(sd)); | ||||||
| 			/*
 | 			/*
 | ||||||
| 			 * Last chance: if a CPU being in both later_mask | 			 * Last chance: if a CPU being in both later_mask | ||||||
| 			 * and current sd span is valid, that becomes our | 			 * and current sd span is valid, that becomes our | ||||||
|  | @ -2031,7 +2050,7 @@ static int find_later_rq(struct task_struct *task) | ||||||
| 	if (this_cpu != -1) | 	if (this_cpu != -1) | ||||||
| 		return this_cpu; | 		return this_cpu; | ||||||
| 
 | 
 | ||||||
| 	cpu = cpumask_any(later_mask); | 	cpu = cpumask_any_distribute(later_mask); | ||||||
| 	if (cpu < nr_cpu_ids) | 	if (cpu < nr_cpu_ids) | ||||||
| 		return cpu; | 		return cpu; | ||||||
| 
 | 
 | ||||||
|  | @ -2068,7 +2087,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) | ||||||
| 		/* Retry if something changed. */ | 		/* Retry if something changed. */ | ||||||
| 		if (double_lock_balance(rq, later_rq)) { | 		if (double_lock_balance(rq, later_rq)) { | ||||||
| 			if (unlikely(task_rq(task) != rq || | 			if (unlikely(task_rq(task) != rq || | ||||||
| 				     !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || | 				     !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || | ||||||
| 				     task_running(rq, task) || | 				     task_running(rq, task) || | ||||||
| 				     !dl_task(task) || | 				     !dl_task(task) || | ||||||
| 				     !task_on_rq_queued(task))) { | 				     !task_on_rq_queued(task))) { | ||||||
|  | @ -2135,6 +2154,9 @@ static int push_dl_task(struct rq *rq) | ||||||
| 		return 0; | 		return 0; | ||||||
| 
 | 
 | ||||||
| retry: | retry: | ||||||
|  | 	if (is_migration_disabled(next_task)) | ||||||
|  | 		return 0; | ||||||
|  | 
 | ||||||
| 	if (WARN_ON(next_task == rq->curr)) | 	if (WARN_ON(next_task == rq->curr)) | ||||||
| 		return 0; | 		return 0; | ||||||
| 
 | 
 | ||||||
|  | @ -2212,7 +2234,7 @@ static void push_dl_tasks(struct rq *rq) | ||||||
| static void pull_dl_task(struct rq *this_rq) | static void pull_dl_task(struct rq *this_rq) | ||||||
| { | { | ||||||
| 	int this_cpu = this_rq->cpu, cpu; | 	int this_cpu = this_rq->cpu, cpu; | ||||||
| 	struct task_struct *p; | 	struct task_struct *p, *push_task; | ||||||
| 	bool resched = false; | 	bool resched = false; | ||||||
| 	struct rq *src_rq; | 	struct rq *src_rq; | ||||||
| 	u64 dmin = LONG_MAX; | 	u64 dmin = LONG_MAX; | ||||||
|  | @ -2242,6 +2264,7 @@ static void pull_dl_task(struct rq *this_rq) | ||||||
| 			continue; | 			continue; | ||||||
| 
 | 
 | ||||||
| 		/* Might drop this_rq->lock */ | 		/* Might drop this_rq->lock */ | ||||||
|  | 		push_task = NULL; | ||||||
| 		double_lock_balance(this_rq, src_rq); | 		double_lock_balance(this_rq, src_rq); | ||||||
| 
 | 
 | ||||||
| 		/*
 | 		/*
 | ||||||
|  | @ -2273,17 +2296,27 @@ static void pull_dl_task(struct rq *this_rq) | ||||||
| 					   src_rq->curr->dl.deadline)) | 					   src_rq->curr->dl.deadline)) | ||||||
| 				goto skip; | 				goto skip; | ||||||
| 
 | 
 | ||||||
| 			resched = true; | 			if (is_migration_disabled(p)) { | ||||||
| 
 | 				push_task = get_push_task(src_rq); | ||||||
| 			deactivate_task(src_rq, p, 0); | 			} else { | ||||||
| 			set_task_cpu(p, this_cpu); | 				deactivate_task(src_rq, p, 0); | ||||||
| 			activate_task(this_rq, p, 0); | 				set_task_cpu(p, this_cpu); | ||||||
| 			dmin = p->dl.deadline; | 				activate_task(this_rq, p, 0); | ||||||
|  | 				dmin = p->dl.deadline; | ||||||
|  | 				resched = true; | ||||||
|  | 			} | ||||||
| 
 | 
 | ||||||
| 			/* Is there any other task even earlier? */ | 			/* Is there any other task even earlier? */ | ||||||
| 		} | 		} | ||||||
| skip: | skip: | ||||||
| 		double_unlock_balance(this_rq, src_rq); | 		double_unlock_balance(this_rq, src_rq); | ||||||
|  | 
 | ||||||
|  | 		if (push_task) { | ||||||
|  | 			raw_spin_unlock(&this_rq->lock); | ||||||
|  | 			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, | ||||||
|  | 					    push_task, &src_rq->push_work); | ||||||
|  | 			raw_spin_lock(&this_rq->lock); | ||||||
|  | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	if (resched) | 	if (resched) | ||||||
|  | @ -2307,7 +2340,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static void set_cpus_allowed_dl(struct task_struct *p, | static void set_cpus_allowed_dl(struct task_struct *p, | ||||||
| 				const struct cpumask *new_mask) | 				const struct cpumask *new_mask, | ||||||
|  | 				u32 flags) | ||||||
| { | { | ||||||
| 	struct root_domain *src_rd; | 	struct root_domain *src_rd; | ||||||
| 	struct rq *rq; | 	struct rq *rq; | ||||||
|  | @ -2336,7 +2370,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, | ||||||
| 		raw_spin_unlock(&src_dl_b->lock); | 		raw_spin_unlock(&src_dl_b->lock); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	set_cpus_allowed_common(p, new_mask); | 	set_cpus_allowed_common(p, new_mask, flags); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /* Assumes rq->lock is held */ | /* Assumes rq->lock is held */ | ||||||
|  | @ -2509,8 +2543,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| const struct sched_class dl_sched_class | DEFINE_SCHED_CLASS(dl) = { | ||||||
| 	__section("__dl_sched_class") = { | 
 | ||||||
| 	.enqueue_task		= enqueue_task_dl, | 	.enqueue_task		= enqueue_task_dl, | ||||||
| 	.dequeue_task		= dequeue_task_dl, | 	.dequeue_task		= dequeue_task_dl, | ||||||
| 	.yield_task		= yield_task_dl, | 	.yield_task		= yield_task_dl, | ||||||
|  | @ -2529,6 +2563,7 @@ const struct sched_class dl_sched_class | ||||||
| 	.rq_online              = rq_online_dl, | 	.rq_online              = rq_online_dl, | ||||||
| 	.rq_offline             = rq_offline_dl, | 	.rq_offline             = rq_offline_dl, | ||||||
| 	.task_woken		= task_woken_dl, | 	.task_woken		= task_woken_dl, | ||||||
|  | 	.find_lock_rq		= find_lock_later_rq, | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| 	.task_tick		= task_tick_dl, | 	.task_tick		= task_tick_dl, | ||||||
|  | @ -2541,33 +2576,39 @@ const struct sched_class dl_sched_class | ||||||
| 	.update_curr		= update_curr_dl, | 	.update_curr		= update_curr_dl, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | /* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ | ||||||
|  | static u64 dl_generation; | ||||||
|  | 
 | ||||||
| int sched_dl_global_validate(void) | int sched_dl_global_validate(void) | ||||||
| { | { | ||||||
| 	u64 runtime = global_rt_runtime(); | 	u64 runtime = global_rt_runtime(); | ||||||
| 	u64 period = global_rt_period(); | 	u64 period = global_rt_period(); | ||||||
| 	u64 new_bw = to_ratio(period, runtime); | 	u64 new_bw = to_ratio(period, runtime); | ||||||
|  | 	u64 gen = ++dl_generation; | ||||||
| 	struct dl_bw *dl_b; | 	struct dl_bw *dl_b; | ||||||
| 	int cpu, ret = 0; | 	int cpu, cpus, ret = 0; | ||||||
| 	unsigned long flags; | 	unsigned long flags; | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Here we want to check the bandwidth not being set to some | 	 * Here we want to check the bandwidth not being set to some | ||||||
| 	 * value smaller than the currently allocated bandwidth in | 	 * value smaller than the currently allocated bandwidth in | ||||||
| 	 * any of the root_domains. | 	 * any of the root_domains. | ||||||
| 	 * |  | ||||||
| 	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than |  | ||||||
| 	 * cycling on root_domains... Discussion on different/better |  | ||||||
| 	 * solutions is welcome! |  | ||||||
| 	 */ | 	 */ | ||||||
| 	for_each_possible_cpu(cpu) { | 	for_each_possible_cpu(cpu) { | ||||||
| 		rcu_read_lock_sched(); | 		rcu_read_lock_sched(); | ||||||
|  | 
 | ||||||
|  | 		if (dl_bw_visited(cpu, gen)) | ||||||
|  | 			goto next; | ||||||
|  | 
 | ||||||
| 		dl_b = dl_bw_of(cpu); | 		dl_b = dl_bw_of(cpu); | ||||||
|  | 		cpus = dl_bw_cpus(cpu); | ||||||
| 
 | 
 | ||||||
| 		raw_spin_lock_irqsave(&dl_b->lock, flags); | 		raw_spin_lock_irqsave(&dl_b->lock, flags); | ||||||
| 		if (new_bw < dl_b->total_bw) | 		if (new_bw * cpus < dl_b->total_bw) | ||||||
| 			ret = -EBUSY; | 			ret = -EBUSY; | ||||||
| 		raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 		raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||||||
| 
 | 
 | ||||||
|  | next: | ||||||
| 		rcu_read_unlock_sched(); | 		rcu_read_unlock_sched(); | ||||||
| 
 | 
 | ||||||
| 		if (ret) | 		if (ret) | ||||||
|  | @ -2593,6 +2634,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) | ||||||
| void sched_dl_do_global(void) | void sched_dl_do_global(void) | ||||||
| { | { | ||||||
| 	u64 new_bw = -1; | 	u64 new_bw = -1; | ||||||
|  | 	u64 gen = ++dl_generation; | ||||||
| 	struct dl_bw *dl_b; | 	struct dl_bw *dl_b; | ||||||
| 	int cpu; | 	int cpu; | ||||||
| 	unsigned long flags; | 	unsigned long flags; | ||||||
|  | @ -2603,11 +2645,14 @@ void sched_dl_do_global(void) | ||||||
| 	if (global_rt_runtime() != RUNTIME_INF) | 	if (global_rt_runtime() != RUNTIME_INF) | ||||||
| 		new_bw = to_ratio(global_rt_period(), global_rt_runtime()); | 		new_bw = to_ratio(global_rt_period(), global_rt_runtime()); | ||||||
| 
 | 
 | ||||||
| 	/*
 |  | ||||||
| 	 * FIXME: As above... |  | ||||||
| 	 */ |  | ||||||
| 	for_each_possible_cpu(cpu) { | 	for_each_possible_cpu(cpu) { | ||||||
| 		rcu_read_lock_sched(); | 		rcu_read_lock_sched(); | ||||||
|  | 
 | ||||||
|  | 		if (dl_bw_visited(cpu, gen)) { | ||||||
|  | 			rcu_read_unlock_sched(); | ||||||
|  | 			continue; | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
| 		dl_b = dl_bw_of(cpu); | 		dl_b = dl_bw_of(cpu); | ||||||
| 
 | 
 | ||||||
| 		raw_spin_lock_irqsave(&dl_b->lock, flags); | 		raw_spin_lock_irqsave(&dl_b->lock, flags); | ||||||
|  |  | ||||||
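The dl_bw_visited() helper added above replaces the old "cycle over every CPU" FIXME: a generation counter, bumped once per sysctl update, lets the loop skip CPUs whose root domain has already been checked. Below is a minimal user-space model of that idiom; the topology and names are made up for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct root_domain {
	uint64_t visit_gen;
	const char *name;
};

static struct root_domain rd_a = { .name = "rd_a" };
static struct root_domain rd_b = { .name = "rd_b" };

/* CPUs 0-3 share rd_a, CPUs 4-7 share rd_b. */
static struct root_domain *cpu_rd[8] = {
	&rd_a, &rd_a, &rd_a, &rd_a, &rd_b, &rd_b, &rd_b, &rd_b,
};

static uint64_t dl_generation;

static bool dl_bw_visited(int cpu, uint64_t gen)
{
	struct root_domain *rd = cpu_rd[cpu];

	if (rd->visit_gen == gen)
		return true;

	rd->visit_gen = gen;
	return false;
}

int main(void)
{
	uint64_t gen = ++dl_generation;
	int cpu;

	for (cpu = 0; cpu < 8; cpu++) {
		if (dl_bw_visited(cpu, gen))
			continue;
		/* The expensive per-root-domain work runs only twice. */
		printf("checking bandwidth of %s (via CPU %d)\n",
		       cpu_rd[cpu]->name, cpu);
	}
	return 0;
}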
|  | @ -906,6 +906,15 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||||||
| 	if (!schedstat_enabled()) | 	if (!schedstat_enabled()) | ||||||
| 		return; | 		return; | ||||||
| 
 | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * When sched_schedstat changes from 0 to 1, some sched entities may | ||||||
|  | 	 * already be on the runqueue with se->statistics.wait_start == 0, | ||||||
|  | 	 * which would make the computed delta wrong. Avoid that scenario. | ||||||
|  | 	 */ | ||||||
|  | 	if (unlikely(!schedstat_val(se->statistics.wait_start))) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
| 	delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); | 	delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); | ||||||
| 
 | 
 | ||||||
| 	if (entity_is_task(se)) { | 	if (entity_is_task(se)) { | ||||||
|  | @ -1550,7 +1559,8 @@ struct task_numa_env { | ||||||
| static unsigned long cpu_load(struct rq *rq); | static unsigned long cpu_load(struct rq *rq); | ||||||
| static unsigned long cpu_runnable(struct rq *rq); | static unsigned long cpu_runnable(struct rq *rq); | ||||||
| static unsigned long cpu_util(int cpu); | static unsigned long cpu_util(int cpu); | ||||||
| static inline long adjust_numa_imbalance(int imbalance, int nr_running); | static inline long adjust_numa_imbalance(int imbalance, | ||||||
|  | 					int dst_running, int dst_weight); | ||||||
| 
 | 
 | ||||||
| static inline enum | static inline enum | ||||||
| numa_type numa_classify(unsigned int imbalance_pct, | numa_type numa_classify(unsigned int imbalance_pct, | ||||||
|  | @ -1930,7 +1940,8 @@ static void task_numa_find_cpu(struct task_numa_env *env, | ||||||
| 		src_running = env->src_stats.nr_running - 1; | 		src_running = env->src_stats.nr_running - 1; | ||||||
| 		dst_running = env->dst_stats.nr_running + 1; | 		dst_running = env->dst_stats.nr_running + 1; | ||||||
| 		imbalance = max(0, dst_running - src_running); | 		imbalance = max(0, dst_running - src_running); | ||||||
| 		imbalance = adjust_numa_imbalance(imbalance, dst_running); | 		imbalance = adjust_numa_imbalance(imbalance, dst_running, | ||||||
|  | 							env->dst_stats.weight); | ||||||
| 
 | 
 | ||||||
| 		/* Use idle CPU if there is no imbalance */ | 		/* Use idle CPU if there is no imbalance */ | ||||||
| 		if (!imbalance) { | 		if (!imbalance) { | ||||||
|  | @ -4779,25 +4790,37 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) | ||||||
| 		struct cfs_rq *qcfs_rq = cfs_rq_of(se); | 		struct cfs_rq *qcfs_rq = cfs_rq_of(se); | ||||||
| 		/* throttled entity or throttle-on-deactivate */ | 		/* throttled entity or throttle-on-deactivate */ | ||||||
| 		if (!se->on_rq) | 		if (!se->on_rq) | ||||||
| 			break; | 			goto done; | ||||||
| 
 | 
 | ||||||
| 		if (dequeue) { | 		dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); | ||||||
| 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); |  | ||||||
| 		} else { |  | ||||||
| 			update_load_avg(qcfs_rq, se, 0); |  | ||||||
| 			se_update_runnable(se); |  | ||||||
| 		} |  | ||||||
| 
 | 
 | ||||||
| 		qcfs_rq->h_nr_running -= task_delta; | 		qcfs_rq->h_nr_running -= task_delta; | ||||||
| 		qcfs_rq->idle_h_nr_running -= idle_task_delta; | 		qcfs_rq->idle_h_nr_running -= idle_task_delta; | ||||||
| 
 | 
 | ||||||
| 		if (qcfs_rq->load.weight) | 		if (qcfs_rq->load.weight) { | ||||||
| 			dequeue = 0; | 			/* Avoid re-evaluating load for this entity: */ | ||||||
|  | 			se = parent_entity(se); | ||||||
|  | 			break; | ||||||
|  | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	if (!se) | 	for_each_sched_entity(se) { | ||||||
| 		sub_nr_running(rq, task_delta); | 		struct cfs_rq *qcfs_rq = cfs_rq_of(se); | ||||||
|  | 		/* throttled entity or throttle-on-deactivate */ | ||||||
|  | 		if (!se->on_rq) | ||||||
|  | 			goto done; | ||||||
| 
 | 
 | ||||||
|  | 		update_load_avg(qcfs_rq, se, 0); | ||||||
|  | 		se_update_runnable(se); | ||||||
|  | 
 | ||||||
|  | 		qcfs_rq->h_nr_running -= task_delta; | ||||||
|  | 		qcfs_rq->idle_h_nr_running -= idle_task_delta; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	/* At this point se is NULL and we are at root level */ | ||||||
|  | 	sub_nr_running(rq, task_delta); | ||||||
|  | 
 | ||||||
|  | done: | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Note: distribution will already see us throttled via the | 	 * Note: distribution will already see us throttled via the | ||||||
| 	 * throttled-list.  rq->lock protects completion. | 	 * throttled-list.  rq->lock protects completion. | ||||||
|  | @ -5105,9 +5128,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | ||||||
| 		return; | 		return; | ||||||
| 
 | 
 | ||||||
| 	distribute_cfs_runtime(cfs_b); | 	distribute_cfs_runtime(cfs_b); | ||||||
| 
 |  | ||||||
| 	raw_spin_lock_irqsave(&cfs_b->lock, flags); |  | ||||||
| 	raw_spin_unlock_irqrestore(&cfs_b->lock, flags); |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  | @ -5805,6 +5825,9 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) | ||||||
| 	if (sync && cpu_rq(this_cpu)->nr_running == 1) | 	if (sync && cpu_rq(this_cpu)->nr_running == 1) | ||||||
| 		return this_cpu; | 		return this_cpu; | ||||||
| 
 | 
 | ||||||
|  | 	if (available_idle_cpu(prev_cpu)) | ||||||
|  | 		return prev_cpu; | ||||||
|  | 
 | ||||||
| 	return nr_cpumask_bits; | 	return nr_cpumask_bits; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -6063,10 +6086,11 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int | ||||||
| 				break; | 				break; | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
| 		cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); |  | ||||||
| 
 | 
 | ||||||
| 		if (idle) | 		if (idle) | ||||||
| 			return core; | 			return core; | ||||||
|  | 
 | ||||||
|  | 		cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
|  | @ -6307,7 +6331,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /**
 | /**
 | ||||||
|  * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks |  * cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks. | ||||||
|  * @cpu: the CPU to get the utilization of |  * @cpu: the CPU to get the utilization of | ||||||
|  * |  * | ||||||
|  * The unit of the return value must be the one of capacity so we can compare |  * The unit of the return value must be the one of capacity so we can compare | ||||||
|  | @ -6683,7 +6707,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * select_task_rq_fair: Select target runqueue for the waking task in domains |  * select_task_rq_fair: Select target runqueue for the waking task in domains | ||||||
|  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, |  * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, | ||||||
|  * SD_BALANCE_FORK, or SD_BALANCE_EXEC. |  * SD_BALANCE_FORK, or SD_BALANCE_EXEC. | ||||||
|  * |  * | ||||||
|  * Balances load by selecting the idlest CPU in the idlest group, or under |  * Balances load by selecting the idlest CPU in the idlest group, or under | ||||||
|  | @ -6694,15 +6718,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) | ||||||
|  * preempt must be disabled. |  * preempt must be disabled. | ||||||
|  */ |  */ | ||||||
| static int | static int | ||||||
| select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) | select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) | ||||||
| { | { | ||||||
|  | 	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); | ||||||
| 	struct sched_domain *tmp, *sd = NULL; | 	struct sched_domain *tmp, *sd = NULL; | ||||||
| 	int cpu = smp_processor_id(); | 	int cpu = smp_processor_id(); | ||||||
| 	int new_cpu = prev_cpu; | 	int new_cpu = prev_cpu; | ||||||
| 	int want_affine = 0; | 	int want_affine = 0; | ||||||
| 	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); | 	/* SD_flags and WF_flags share the first nibble */ | ||||||
|  | 	int sd_flag = wake_flags & 0xF; | ||||||
| 
 | 
 | ||||||
| 	if (sd_flag & SD_BALANCE_WAKE) { | 	if (wake_flags & WF_TTWU) { | ||||||
| 		record_wakee(p); | 		record_wakee(p); | ||||||
| 
 | 
 | ||||||
| 		if (sched_energy_enabled()) { | 		if (sched_energy_enabled()) { | ||||||
|  | @ -6739,9 +6765,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | ||||||
| 	if (unlikely(sd)) { | 	if (unlikely(sd)) { | ||||||
| 		/* Slow path */ | 		/* Slow path */ | ||||||
| 		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); | 		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); | ||||||
| 	} else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ | 	} else if (wake_flags & WF_TTWU) { /* XXX always ? */ | ||||||
| 		/* Fast path */ | 		/* Fast path */ | ||||||
| 
 |  | ||||||
| 		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); | 		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); | ||||||
| 
 | 
 | ||||||
| 		if (want_affine) | 		if (want_affine) | ||||||
|  | @ -8757,6 +8782,16 @@ static bool update_pick_idlest(struct sched_group *idlest, | ||||||
| 	return true; | 	return true; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Allow a NUMA imbalance if the number of busy CPUs is less than 25% | ||||||
|  |  * of the domain. This is an approximation, as the number of running | ||||||
|  |  * tasks may not be related to the number of busy CPUs due to | ||||||
|  |  * sched_setaffinity. | ||||||
|  |  */ | ||||||
|  | static inline bool allow_numa_imbalance(int dst_running, int dst_weight) | ||||||
|  | { | ||||||
|  | 	return (dst_running < (dst_weight >> 2)); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| /*
 | /*
 | ||||||
|  * find_idlest_group() finds and returns the least busy CPU group within the |  * find_idlest_group() finds and returns the least busy CPU group within the | ||||||
|  * domain. |  * domain. | ||||||
|  | @ -8775,9 +8810,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | ||||||
| 			.group_type = group_overloaded, | 			.group_type = group_overloaded, | ||||||
| 	}; | 	}; | ||||||
| 
 | 
 | ||||||
| 	imbalance = scale_load_down(NICE_0_LOAD) * |  | ||||||
| 				(sd->imbalance_pct-100) / 100; |  | ||||||
| 
 |  | ||||||
| 	do { | 	do { | ||||||
| 		int local_group; | 		int local_group; | ||||||
| 
 | 
 | ||||||
|  | @ -8831,6 +8863,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | ||||||
| 	switch (local_sgs.group_type) { | 	switch (local_sgs.group_type) { | ||||||
| 	case group_overloaded: | 	case group_overloaded: | ||||||
| 	case group_fully_busy: | 	case group_fully_busy: | ||||||
|  | 
 | ||||||
|  | 		/* Calculate allowed imbalance based on load */ | ||||||
|  | 		imbalance = scale_load_down(NICE_0_LOAD) * | ||||||
|  | 				(sd->imbalance_pct-100) / 100; | ||||||
|  | 
 | ||||||
| 		/*
 | 		/*
 | ||||||
| 		 * When comparing groups across NUMA domains, it's possible for | 		 * When comparing groups across NUMA domains, it's possible for | ||||||
| 		 * the local domain to be very lightly loaded relative to the | 		 * the local domain to be very lightly loaded relative to the | ||||||
|  | @ -8887,7 +8924,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | ||||||
| 			 * a real need of migration, periodic load balance will | 			 * a real need of migration, periodic load balance will | ||||||
| 			 * take care of it. | 			 * take care of it. | ||||||
| 			 */ | 			 */ | ||||||
| 			if (local_sgs.idle_cpus) | 			if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight)) | ||||||
| 				return NULL; | 				return NULL; | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
|  | @ -8989,16 +9026,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline long adjust_numa_imbalance(int imbalance, int nr_running) | #define NUMA_IMBALANCE_MIN 2 | ||||||
|  | 
 | ||||||
|  | static inline long adjust_numa_imbalance(int imbalance, | ||||||
|  | 				int dst_running, int dst_weight) | ||||||
| { | { | ||||||
| 	unsigned int imbalance_min; | 	if (!allow_numa_imbalance(dst_running, dst_weight)) | ||||||
|  | 		return imbalance; | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Allow a small imbalance based on a simple pair of communicating | 	 * Allow a small imbalance based on a simple pair of communicating | ||||||
| 	 * tasks that remain local when the source domain is almost idle. | 	 * tasks that remain local when the destination is lightly loaded. | ||||||
| 	 */ | 	 */ | ||||||
| 	imbalance_min = 2; | 	if (imbalance <= NUMA_IMBALANCE_MIN) | ||||||
| 	if (nr_running <= imbalance_min) |  | ||||||
| 		return 0; | 		return 0; | ||||||
| 
 | 
 | ||||||
| 	return imbalance; | 	return imbalance; | ||||||
|  | @ -9101,9 +9141,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
| 		/* Consider allowing a small imbalance between NUMA groups */ | 		/* Consider allowing a small imbalance between NUMA groups */ | ||||||
| 		if (env->sd->flags & SD_NUMA) | 		if (env->sd->flags & SD_NUMA) { | ||||||
| 			env->imbalance = adjust_numa_imbalance(env->imbalance, | 			env->imbalance = adjust_numa_imbalance(env->imbalance, | ||||||
| 						busiest->sum_nr_running); | 				busiest->sum_nr_running, busiest->group_weight); | ||||||
|  | 		} | ||||||
| 
 | 
 | ||||||
| 		return; | 		return; | ||||||
| 	} | 	} | ||||||
|  | @ -10068,6 +10109,10 @@ static inline int find_new_ilb(void) | ||||||
| 
 | 
 | ||||||
| 	for_each_cpu_and(ilb, nohz.idle_cpus_mask, | 	for_each_cpu_and(ilb, nohz.idle_cpus_mask, | ||||||
| 			      housekeeping_cpumask(HK_FLAG_MISC)) { | 			      housekeeping_cpumask(HK_FLAG_MISC)) { | ||||||
|  | 
 | ||||||
|  | 		if (ilb == smp_processor_id()) | ||||||
|  | 			continue; | ||||||
|  | 
 | ||||||
| 		if (idle_cpu(ilb)) | 		if (idle_cpu(ilb)) | ||||||
| 			return ilb; | 			return ilb; | ||||||
| 	} | 	} | ||||||
|  | @ -10505,7 +10550,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } | ||||||
| #endif /* CONFIG_NO_HZ_COMMON */ | #endif /* CONFIG_NO_HZ_COMMON */ | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * idle_balance is called by schedule() if this_cpu is about to become |  * newidle_balance is called by schedule() if this_cpu is about to become | ||||||
|  * idle. Attempts to pull tasks from other CPUs. |  * idle. Attempts to pull tasks from other CPUs. | ||||||
|  * |  * | ||||||
|  * Returns: |  * Returns: | ||||||
|  | @ -11179,8 +11224,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task | ||||||
| /*
 | /*
 | ||||||
|  * All the scheduling class methods: |  * All the scheduling class methods: | ||||||
|  */ |  */ | ||||||
| const struct sched_class fair_sched_class | DEFINE_SCHED_CLASS(fair) = { | ||||||
| 	__section("__fair_sched_class") = { | 
 | ||||||
| 	.enqueue_task		= enqueue_task_fair, | 	.enqueue_task		= enqueue_task_fair, | ||||||
| 	.dequeue_task		= dequeue_task_fair, | 	.dequeue_task		= dequeue_task_fair, | ||||||
| 	.yield_task		= yield_task_fair, | 	.yield_task		= yield_task_fair, | ||||||
|  |  | ||||||
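The reworked NUMA imbalance policy above now keys off the destination: an imbalance is tolerated while the destination runs fewer tasks than a quarter of its CPUs, and imbalances of at most NUMA_IMBALANCE_MIN tasks are then ignored outright. A worked example with made-up dst_running/dst_weight values (the thresholds are the ones from the patch):

#include <stdio.h>

#define NUMA_IMBALANCE_MIN 2

static int allow_numa_imbalance(int dst_running, int dst_weight)
{
	/* Tolerate an imbalance while busy tasks < 25% of the node's CPUs. */
	return dst_running < (dst_weight >> 2);
}

static long adjust_numa_imbalance(long imbalance, int dst_running, int dst_weight)
{
	if (!allow_numa_imbalance(dst_running, dst_weight))
		return imbalance;

	/* A pair of communicating tasks may stay local while the
	 * destination is lightly loaded. */
	if (imbalance <= NUMA_IMBALANCE_MIN)
		return 0;

	return imbalance;
}

int main(void)
{
	/* 64-CPU node: imbalances of up to 2 tasks are ignored while
	 * fewer than 16 tasks run on the destination. */
	printf("%ld\n", adjust_numa_imbalance(2, 10, 64));	/* 0 */
	printf("%ld\n", adjust_numa_imbalance(3, 10, 64));	/* 3 */
	printf("%ld\n", adjust_numa_imbalance(2, 20, 64));	/* 2 */
	return 0;
}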
|  | @ -364,6 +364,7 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns) | ||||||
| 	WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); | 	WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); | ||||||
| 	WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); | 	WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); | ||||||
| 	WARN_ON_ONCE(!duration_ns); | 	WARN_ON_ONCE(!duration_ns); | ||||||
|  | 	WARN_ON_ONCE(current->mm); | ||||||
| 
 | 
 | ||||||
| 	rcu_sleep_check(); | 	rcu_sleep_check(); | ||||||
| 	preempt_disable(); | 	preempt_disable(); | ||||||
|  | @ -401,7 +402,7 @@ void cpu_startup_entry(enum cpuhp_state state) | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_SMP | #ifdef CONFIG_SMP | ||||||
| static int | static int | ||||||
| select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) | select_task_rq_idle(struct task_struct *p, int cpu, int flags) | ||||||
| { | { | ||||||
| 	return task_cpu(p); /* IDLE tasks as never migrated */ | 	return task_cpu(p); /* IDLE tasks as never migrated */ | ||||||
| } | } | ||||||
|  | @ -483,8 +484,8 @@ static void update_curr_idle(struct rq *rq) | ||||||
| /*
 | /*
 | ||||||
|  * Simple, special scheduling class for the per-CPU idle tasks: |  * Simple, special scheduling class for the per-CPU idle tasks: | ||||||
|  */ |  */ | ||||||
| const struct sched_class idle_sched_class | DEFINE_SCHED_CLASS(idle) = { | ||||||
| 	__section("__idle_sched_class") = { | 
 | ||||||
| 	/* no enqueue/yield_task for idle tasks */ | 	/* no enqueue/yield_task for idle tasks */ | ||||||
| 
 | 
 | ||||||
| 	/* dequeue is not valid, we print a debug message there: */ | 	/* dequeue is not valid, we print a debug message there: */ | ||||||
|  |  | ||||||
|  | @ -6,6 +6,134 @@ | ||||||
|  */ |  */ | ||||||
| #include "sched.h" | #include "sched.h" | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * For documentation purposes, here are some membarrier ordering | ||||||
|  |  * scenarios to keep in mind: | ||||||
|  |  * | ||||||
|  |  * A) Userspace thread execution after IPI vs membarrier's memory | ||||||
|  |  *    barrier before sending the IPI | ||||||
|  |  * | ||||||
|  |  * Userspace variables: | ||||||
|  |  * | ||||||
|  |  * int x = 0, y = 0; | ||||||
|  |  * | ||||||
|  |  * The memory barrier at the start of membarrier() on CPU0 is necessary in | ||||||
|  |  * order to enforce the guarantee that any writes occurring on CPU0 before | ||||||
|  |  * the membarrier() is executed will be visible to any code executing on | ||||||
|  |  * CPU1 after the IPI-induced memory barrier: | ||||||
|  |  * | ||||||
|  |  *         CPU0                              CPU1 | ||||||
|  |  * | ||||||
|  |  *         x = 1 | ||||||
|  |  *         membarrier(): | ||||||
|  |  *           a: smp_mb() | ||||||
|  |  *           b: send IPI                       IPI-induced mb | ||||||
|  |  *           c: smp_mb() | ||||||
|  |  *         r2 = y | ||||||
|  |  *                                           y = 1 | ||||||
|  |  *                                           barrier() | ||||||
|  |  *                                           r1 = x | ||||||
|  |  * | ||||||
|  |  *                     BUG_ON(r1 == 0 && r2 == 0) | ||||||
|  |  * | ||||||
|  |  * The write to y and load from x by CPU1 are unordered by the hardware, | ||||||
|  |  * so it's possible to have "r1 = x" reordered before "y = 1" at any | ||||||
|  |  * point after (b).  If the memory barrier at (a) is omitted, then "x = 1" | ||||||
|  |  * can be reordered after (a) (although not after (c)), so we get r1 == 0 | ||||||
|  |  * and r2 == 0.  This violates the guarantee that membarrier() is | ||||||
|  |  * supposed to provide. | ||||||
|  |  * | ||||||
|  |  * The timing of the memory barrier at (a) has to ensure that it executes | ||||||
|  |  * before the IPI-induced memory barrier on CPU1. | ||||||
|  |  * | ||||||
|  |  * B) Userspace thread execution before IPI vs membarrier's memory | ||||||
|  |  *    barrier after completing the IPI | ||||||
|  |  * | ||||||
|  |  * Userspace variables: | ||||||
|  |  * | ||||||
|  |  * int x = 0, y = 0; | ||||||
|  |  * | ||||||
|  |  * The memory barrier at the end of membarrier() on CPU0 is necessary in | ||||||
|  |  * order to enforce the guarantee that any writes occurring on CPU1 before | ||||||
|  |  * the membarrier() is executed will be visible to any code executing on | ||||||
|  |  * CPU0 after the membarrier(): | ||||||
|  |  * | ||||||
|  |  *         CPU0                              CPU1 | ||||||
|  |  * | ||||||
|  |  *                                           x = 1 | ||||||
|  |  *                                           barrier() | ||||||
|  |  *                                           y = 1 | ||||||
|  |  *         r2 = y | ||||||
|  |  *         membarrier(): | ||||||
|  |  *           a: smp_mb() | ||||||
|  |  *           b: send IPI                       IPI-induced mb | ||||||
|  |  *           c: smp_mb() | ||||||
|  |  *         r1 = x | ||||||
|  |  *         BUG_ON(r1 == 0 && r2 == 1) | ||||||
|  |  * | ||||||
|  |  * The writes to x and y are unordered by the hardware, so it's possible to | ||||||
|  |  * have "r2 = 1" even though the write to x doesn't execute until (b).  If | ||||||
|  |  * the memory barrier at (c) is omitted then "r1 = x" can be reordered | ||||||
|  |  * before (b) (although not before (a)), so we get "r1 = 0".  This violates | ||||||
|  |  * the guarantee that membarrier() is supposed to provide. | ||||||
|  |  * | ||||||
|  |  * The timing of the memory barrier at (c) has to ensure that it executes | ||||||
|  |  * after the IPI-induced memory barrier on CPU1. | ||||||
|  |  * | ||||||
|  |  * C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier | ||||||
|  |  * | ||||||
|  |  *           CPU0                            CPU1 | ||||||
|  |  * | ||||||
|  |  *           membarrier(): | ||||||
|  |  *           a: smp_mb() | ||||||
|  |  *                                           d: switch to kthread (includes mb) | ||||||
|  |  *           b: read rq->curr->mm == NULL | ||||||
|  |  *                                           e: switch to user (includes mb) | ||||||
|  |  *           c: smp_mb() | ||||||
|  |  * | ||||||
|  |  * Using the scenario from (A), we can show that (a) needs to be paired | ||||||
|  |  * with (e). Using the scenario from (B), we can show that (c) needs to | ||||||
|  |  * be paired with (d). | ||||||
|  |  * | ||||||
|  |  * D) exit_mm vs membarrier | ||||||
|  |  * | ||||||
|  |  * Two thread groups are created, A and B.  Thread group B is created by | ||||||
|  |  * issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD. | ||||||
|  |  * Let's assume we have a single thread within each thread group (Thread A | ||||||
|  |  * and Thread B).  Thread A runs on CPU0, Thread B runs on CPU1. | ||||||
|  |  * | ||||||
|  |  *           CPU0                            CPU1 | ||||||
|  |  * | ||||||
|  |  *           membarrier(): | ||||||
|  |  *             a: smp_mb() | ||||||
|  |  *                                           exit_mm(): | ||||||
|  |  *                                             d: smp_mb() | ||||||
|  |  *                                             e: current->mm = NULL | ||||||
|  |  *             b: read rq->curr->mm == NULL | ||||||
|  |  *             c: smp_mb() | ||||||
|  |  * | ||||||
|  |  * Using scenario (B), we can show that (c) needs to be paired with (d). | ||||||
|  |  * | ||||||
|  |  * E) kthread_{use,unuse}_mm vs membarrier | ||||||
|  |  * | ||||||
|  |  *           CPU0                            CPU1 | ||||||
|  |  * | ||||||
|  |  *           membarrier(): | ||||||
|  |  *           a: smp_mb() | ||||||
|  |  *                                           kthread_unuse_mm() | ||||||
|  |  *                                             d: smp_mb() | ||||||
|  |  *                                             e: current->mm = NULL | ||||||
|  |  *           b: read rq->curr->mm == NULL | ||||||
|  |  *                                           kthread_use_mm() | ||||||
|  |  *                                             f: current->mm = mm | ||||||
|  |  *                                             g: smp_mb() | ||||||
|  |  *           c: smp_mb() | ||||||
|  |  * | ||||||
|  |  * Using the scenario from (A), we can show that (a) needs to be paired | ||||||
|  |  * with (g). Using the scenario from (B), we can show that (c) needs to | ||||||
|  |  * be paired with (d). | ||||||
|  |  */ | ||||||
|  | 
 | ||||||
| /*
 | /*
 | ||||||
|  * Bitmask made from a "or" of all commands within enum membarrier_cmd, |  * Bitmask made from a "or" of all commands within enum membarrier_cmd, | ||||||
|  * except MEMBARRIER_CMD_QUERY. |  * except MEMBARRIER_CMD_QUERY. | ||||||
|  | @ -101,6 +229,18 @@ void membarrier_exec_mmap(struct mm_struct *mm) | ||||||
| 	this_cpu_write(runqueues.membarrier_state, 0); | 	this_cpu_write(runqueues.membarrier_state, 0); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | void membarrier_update_current_mm(struct mm_struct *next_mm) | ||||||
|  | { | ||||||
|  | 	struct rq *rq = this_rq(); | ||||||
|  | 	int membarrier_state = 0; | ||||||
|  | 
 | ||||||
|  | 	if (next_mm) | ||||||
|  | 		membarrier_state = atomic_read(&next_mm->membarrier_state); | ||||||
|  | 	if (READ_ONCE(rq->membarrier_state) == membarrier_state) | ||||||
|  | 		return; | ||||||
|  | 	WRITE_ONCE(rq->membarrier_state, membarrier_state); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static int membarrier_global_expedited(void) | static int membarrier_global_expedited(void) | ||||||
| { | { | ||||||
| 	int cpu; | 	int cpu; | ||||||
|  | @ -139,12 +279,11 @@ static int membarrier_global_expedited(void) | ||||||
| 			continue; | 			continue; | ||||||
| 
 | 
 | ||||||
| 		/*
 | 		/*
 | ||||||
| 		 * Skip the CPU if it runs a kernel thread. The scheduler | 		 * Skip the CPU if it runs a kernel thread which is not using | ||||||
| 		 * leaves the prior task mm in place as an optimization when | 		 * a task mm. | ||||||
| 		 * scheduling a kthread. |  | ||||||
| 		 */ | 		 */ | ||||||
| 		p = rcu_dereference(cpu_rq(cpu)->curr); | 		p = rcu_dereference(cpu_rq(cpu)->curr); | ||||||
| 		if (p->flags & PF_KTHREAD) | 		if (!p->mm) | ||||||
| 			continue; | 			continue; | ||||||
| 
 | 
 | ||||||
| 		__cpumask_set_cpu(cpu, tmpmask); | 		__cpumask_set_cpu(cpu, tmpmask); | ||||||
|  |  | ||||||
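The ordering scenarios documented at the top of membarrier.c describe what the membarrier(2) guarantees forbid. The user-space sketch below sets up scenario (A) with MEMBARRIER_CMD_PRIVATE_EXPEDITED; it only illustrates which outcome (r1 == 0 && r2 == 0) the guarantee rules out, the race window is tiny, and error handling is minimal. Build with -pthread.

#include <linux/membarrier.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static volatile int x, y;

static int sys_membarrier(int cmd, unsigned int flags)
{
	return syscall(__NR_membarrier, cmd, flags, 0);
}

static void *reader(void *arg)
{
	long r1;

	y = 1;
	/* compiler barrier only; the CPU-level ordering comes from the IPI */
	__asm__ __volatile__("" ::: "memory");
	r1 = x;
	return (void *)r1;
}

int main(void)
{
	pthread_t t;
	void *r1;
	int r2;

	if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
		return 1;

	pthread_create(&t, NULL, reader, NULL);

	x = 1;
	sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0);	/* a, b, c */
	r2 = y;

	pthread_join(t, &r1);
	/* The documented guarantee: r1 == 0 && r2 == 0 is impossible. */
	printf("r1=%ld r2=%d\n", (long)r1, r2);
	return 0;
}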
|  | @ -89,8 +89,8 @@ void init_rt_rq(struct rt_rq *rt_rq) | ||||||
| 	__set_bit(MAX_RT_PRIO, array->bitmap); | 	__set_bit(MAX_RT_PRIO, array->bitmap); | ||||||
| 
 | 
 | ||||||
| #if defined CONFIG_SMP | #if defined CONFIG_SMP | ||||||
| 	rt_rq->highest_prio.curr = MAX_RT_PRIO; | 	rt_rq->highest_prio.curr = MAX_RT_PRIO-1; | ||||||
| 	rt_rq->highest_prio.next = MAX_RT_PRIO; | 	rt_rq->highest_prio.next = MAX_RT_PRIO-1; | ||||||
| 	rt_rq->rt_nr_migratory = 0; | 	rt_rq->rt_nr_migratory = 0; | ||||||
| 	rt_rq->overloaded = 0; | 	rt_rq->overloaded = 0; | ||||||
| 	plist_head_init(&rt_rq->pushable_tasks); | 	plist_head_init(&rt_rq->pushable_tasks); | ||||||
|  | @ -161,7 +161,7 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | ||||||
| { | { | ||||||
| 	struct rq *rq = cpu_rq(cpu); | 	struct rq *rq = cpu_rq(cpu); | ||||||
| 
 | 
 | ||||||
| 	rt_rq->highest_prio.curr = MAX_RT_PRIO; | 	rt_rq->highest_prio.curr = MAX_RT_PRIO-1; | ||||||
| 	rt_rq->rt_nr_boosted = 0; | 	rt_rq->rt_nr_boosted = 0; | ||||||
| 	rt_rq->rq = rq; | 	rt_rq->rq = rq; | ||||||
| 	rt_rq->tg = tg; | 	rt_rq->tg = tg; | ||||||
|  | @ -265,7 +265,7 @@ static void pull_rt_task(struct rq *this_rq); | ||||||
| static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) | static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) | ||||||
| { | { | ||||||
| 	/* Try to pull RT tasks here if we lower this rq's prio */ | 	/* Try to pull RT tasks here if we lower this rq's prio */ | ||||||
| 	return rq->rt.highest_prio.curr > prev->prio; | 	return rq->online && rq->rt.highest_prio.curr > prev->prio; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline int rt_overloaded(struct rq *rq) | static inline int rt_overloaded(struct rq *rq) | ||||||
|  | @ -393,8 +393,9 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) | ||||||
| 		p = plist_first_entry(&rq->rt.pushable_tasks, | 		p = plist_first_entry(&rq->rt.pushable_tasks, | ||||||
| 				      struct task_struct, pushable_tasks); | 				      struct task_struct, pushable_tasks); | ||||||
| 		rq->rt.highest_prio.next = p->prio; | 		rq->rt.highest_prio.next = p->prio; | ||||||
| 	} else | 	} else { | ||||||
| 		rq->rt.highest_prio.next = MAX_RT_PRIO; | 		rq->rt.highest_prio.next = MAX_RT_PRIO-1; | ||||||
|  | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #else | #else | ||||||
|  | @ -1147,8 +1148,9 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) | ||||||
| 				sched_find_first_bit(array->bitmap); | 				sched_find_first_bit(array->bitmap); | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
| 	} else | 	} else { | ||||||
| 		rt_rq->highest_prio.curr = MAX_RT_PRIO; | 		rt_rq->highest_prio.curr = MAX_RT_PRIO-1; | ||||||
|  | 	} | ||||||
| 
 | 
 | ||||||
| 	dec_rt_prio_smp(rt_rq, prio, prev_prio); | 	dec_rt_prio_smp(rt_rq, prio, prev_prio); | ||||||
| } | } | ||||||
|  | @ -1428,14 +1430,14 @@ static void yield_task_rt(struct rq *rq) | ||||||
| static int find_lowest_rq(struct task_struct *task); | static int find_lowest_rq(struct task_struct *task); | ||||||
| 
 | 
 | ||||||
| static int | static int | ||||||
| select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) | select_task_rq_rt(struct task_struct *p, int cpu, int flags) | ||||||
| { | { | ||||||
| 	struct task_struct *curr; | 	struct task_struct *curr; | ||||||
| 	struct rq *rq; | 	struct rq *rq; | ||||||
| 	bool test; | 	bool test; | ||||||
| 
 | 
 | ||||||
| 	/* For anything but wake ups, just return the task_cpu */ | 	/* For anything but wake ups, just return the task_cpu */ | ||||||
| 	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | 	if (!(flags & (WF_TTWU | WF_FORK))) | ||||||
| 		goto out; | 		goto out; | ||||||
| 
 | 
 | ||||||
| 	rq = cpu_rq(cpu); | 	rq = cpu_rq(cpu); | ||||||
|  | @ -1658,7 +1660,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | ||||||
| static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | ||||||
| { | { | ||||||
| 	if (!task_running(rq, p) && | 	if (!task_running(rq, p) && | ||||||
| 	    cpumask_test_cpu(cpu, p->cpus_ptr)) | 	    cpumask_test_cpu(cpu, &p->cpus_mask)) | ||||||
| 		return 1; | 		return 1; | ||||||
| 
 | 
 | ||||||
| 	return 0; | 	return 0; | ||||||
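pick_rt_task() (and find_lock_lowest_rq() below) now tests p->cpus_mask instead of p->cpus_ptr. As set up elsewhere in this series, ->cpus_ptr can be narrowed to a single CPU while a task has migration disabled, whereas ->cpus_mask keeps the user-requested affinity, so push/pull decisions have to consult the latter. A stand-alone model of the distinction (plain bitmasks, not kernel code; the field names only mirror task_struct):

#include <stdbool.h>
#include <stdio.h>

struct task_model {
	unsigned long cpus_mask;	/* user-visible affinity, e.g. CPUs 0-3 */
	unsigned long cpus_ptr;		/* effective mask; may be one CPU only  */
};

static bool can_push_to(const struct task_model *p, int cpu)
{
	return p->cpus_mask & (1UL << cpu);	/* the new test */
}

int main(void)
{
	/* affinity 0-3, but currently narrowed to CPU 1 by migrate_disable() */
	struct task_model p = { .cpus_mask = 0xf, .cpus_ptr = 1UL << 1 };

	printf("push to CPU 2 allowed (new test): %d\n", can_push_to(&p, 2));
	printf("old test on cpus_ptr would say:   %d\n", !!(p.cpus_ptr & (1UL << 2)));
	return 0;
}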
|  | @ -1752,8 +1754,8 @@ static int find_lowest_rq(struct task_struct *task) | ||||||
| 				return this_cpu; | 				return this_cpu; | ||||||
| 			} | 			} | ||||||
| 
 | 
 | ||||||
| 			best_cpu = cpumask_first_and(lowest_mask, | 			best_cpu = cpumask_any_and_distribute(lowest_mask, | ||||||
| 						     sched_domain_span(sd)); | 							      sched_domain_span(sd)); | ||||||
| 			if (best_cpu < nr_cpu_ids) { | 			if (best_cpu < nr_cpu_ids) { | ||||||
| 				rcu_read_unlock(); | 				rcu_read_unlock(); | ||||||
| 				return best_cpu; | 				return best_cpu; | ||||||
|  | @ -1770,7 +1772,7 @@ static int find_lowest_rq(struct task_struct *task) | ||||||
| 	if (this_cpu != -1) | 	if (this_cpu != -1) | ||||||
| 		return this_cpu; | 		return this_cpu; | ||||||
| 
 | 
 | ||||||
| 	cpu = cpumask_any(lowest_mask); | 	cpu = cpumask_any_distribute(lowest_mask); | ||||||
| 	if (cpu < nr_cpu_ids) | 	if (cpu < nr_cpu_ids) | ||||||
| 		return cpu; | 		return cpu; | ||||||
| 
 | 
 | ||||||
|  | @ -1811,7 +1813,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | ||||||
| 			 * Also make sure that it wasn't scheduled on its rq. | 			 * Also make sure that it wasn't scheduled on its rq. | ||||||
| 			 */ | 			 */ | ||||||
| 			if (unlikely(task_rq(task) != rq || | 			if (unlikely(task_rq(task) != rq || | ||||||
| 				     !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || | 				     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || | ||||||
| 				     task_running(rq, task) || | 				     task_running(rq, task) || | ||||||
| 				     !rt_task(task) || | 				     !rt_task(task) || | ||||||
| 				     !task_on_rq_queued(task))) { | 				     !task_on_rq_queued(task))) { | ||||||
|  | @ -1859,7 +1861,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | ||||||
|  * running task can migrate over to a CPU that is running a task |  * running task can migrate over to a CPU that is running a task | ||||||
|  * of lesser priority. |  * of lesser priority. | ||||||
|  */ |  */ | ||||||
| static int push_rt_task(struct rq *rq) | static int push_rt_task(struct rq *rq, bool pull) | ||||||
| { | { | ||||||
| 	struct task_struct *next_task; | 	struct task_struct *next_task; | ||||||
| 	struct rq *lowest_rq; | 	struct rq *lowest_rq; | ||||||
|  | @ -1873,6 +1875,34 @@ static int push_rt_task(struct rq *rq) | ||||||
| 		return 0; | 		return 0; | ||||||
| 
 | 
 | ||||||
| retry: | retry: | ||||||
|  | 	if (is_migration_disabled(next_task)) { | ||||||
|  | 		struct task_struct *push_task = NULL; | ||||||
|  | 		int cpu; | ||||||
|  | 
 | ||||||
|  | 		if (!pull || rq->push_busy) | ||||||
|  | 			return 0; | ||||||
|  | 
 | ||||||
|  | 		cpu = find_lowest_rq(rq->curr); | ||||||
|  | 		if (cpu == -1 || cpu == rq->cpu) | ||||||
|  | 			return 0; | ||||||
|  | 
 | ||||||
|  | 		/*
 | ||||||
|  | 		 * We found a CPU with lower priority than @next_task, so | ||||||
|  | 		 * @next_task ought to be running somewhere. Since it is | ||||||
|  | 		 * migration disabled and cannot be moved to that CPU, attempt | ||||||
|  | 		 * instead to push away the task currently running on this CPU. | ||||||
|  | 		 */ | ||||||
|  | 		push_task = get_push_task(rq); | ||||||
|  | 		if (push_task) { | ||||||
|  | 			raw_spin_unlock(&rq->lock); | ||||||
|  | 			stop_one_cpu_nowait(rq->cpu, push_cpu_stop, | ||||||
|  | 					    push_task, &rq->push_work); | ||||||
|  | 			raw_spin_lock(&rq->lock); | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		return 0; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	if (WARN_ON(next_task == rq->curr)) | 	if (WARN_ON(next_task == rq->curr)) | ||||||
| 		return 0; | 		return 0; | ||||||
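The new early block in push_rt_task() handles a migration-disabled task at the head of the pushable list: the pinned task itself is never moved; at most, the pull-driven caller (push_rt_task(rq, true), see rto_push_irq_work_func() below) hands this CPU's current task to the stopper via push_cpu_stop() so the pinned task can run locally. A stand-alone model of that decision (illustration only):

#include <stdbool.h>
#include <stdio.h>

enum action { DO_NOTHING, PUSH_CURR_AWAY, MIGRATE_NEXT_TASK };

static enum action decide(bool migration_disabled, bool pull, bool push_busy,
			  bool lower_prio_cpu_exists)
{
	if (!migration_disabled)
		return MIGRATE_NEXT_TASK;	/* normal push path */
	if (!pull || push_busy || !lower_prio_cpu_exists)
		return DO_NOTHING;
	return PUSH_CURR_AWAY;			/* defer to push_cpu_stop() */
}

int main(void)
{
	/* pinned task, pull-side caller, stopper idle, lower-prio CPU exists */
	printf("%d\n", decide(true, true, false, true));	/* PUSH_CURR_AWAY */
	/* pinned task but plain push path: leave it alone */
	printf("%d\n", decide(true, false, false, true));	/* DO_NOTHING */
	/* not pinned: fall through to the normal migration path */
	printf("%d\n", decide(false, true, false, true));	/* MIGRATE_NEXT_TASK */
	return 0;
}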
| 
 | 
 | ||||||
|  | @ -1927,12 +1957,10 @@ static int push_rt_task(struct rq *rq) | ||||||
| 	deactivate_task(rq, next_task, 0); | 	deactivate_task(rq, next_task, 0); | ||||||
| 	set_task_cpu(next_task, lowest_rq->cpu); | 	set_task_cpu(next_task, lowest_rq->cpu); | ||||||
| 	activate_task(lowest_rq, next_task, 0); | 	activate_task(lowest_rq, next_task, 0); | ||||||
|  | 	resched_curr(lowest_rq); | ||||||
| 	ret = 1; | 	ret = 1; | ||||||
| 
 | 
 | ||||||
| 	resched_curr(lowest_rq); |  | ||||||
| 
 |  | ||||||
| 	double_unlock_balance(rq, lowest_rq); | 	double_unlock_balance(rq, lowest_rq); | ||||||
| 
 |  | ||||||
| out: | out: | ||||||
| 	put_task_struct(next_task); | 	put_task_struct(next_task); | ||||||
| 
 | 
 | ||||||
|  | @ -1942,7 +1970,7 @@ static int push_rt_task(struct rq *rq) | ||||||
| static void push_rt_tasks(struct rq *rq) | static void push_rt_tasks(struct rq *rq) | ||||||
| { | { | ||||||
| 	/* push_rt_task will return true if it moved an RT */ | 	/* push_rt_task will return true if it moved an RT */ | ||||||
| 	while (push_rt_task(rq)) | 	while (push_rt_task(rq, false)) | ||||||
| 		; | 		; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -2095,7 +2123,8 @@ void rto_push_irq_work_func(struct irq_work *work) | ||||||
| 	 */ | 	 */ | ||||||
| 	if (has_pushable_tasks(rq)) { | 	if (has_pushable_tasks(rq)) { | ||||||
| 		raw_spin_lock(&rq->lock); | 		raw_spin_lock(&rq->lock); | ||||||
| 		push_rt_tasks(rq); | 		while (push_rt_task(rq, true)) | ||||||
|  | 			; | ||||||
| 		raw_spin_unlock(&rq->lock); | 		raw_spin_unlock(&rq->lock); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | @ -2120,7 +2149,7 @@ static void pull_rt_task(struct rq *this_rq) | ||||||
| { | { | ||||||
| 	int this_cpu = this_rq->cpu, cpu; | 	int this_cpu = this_rq->cpu, cpu; | ||||||
| 	bool resched = false; | 	bool resched = false; | ||||||
| 	struct task_struct *p; | 	struct task_struct *p, *push_task; | ||||||
| 	struct rq *src_rq; | 	struct rq *src_rq; | ||||||
| 	int rt_overload_count = rt_overloaded(this_rq); | 	int rt_overload_count = rt_overloaded(this_rq); | ||||||
| 
 | 
 | ||||||
|  | @ -2167,6 +2196,7 @@ static void pull_rt_task(struct rq *this_rq) | ||||||
| 		 * double_lock_balance, and another CPU could | 		 * double_lock_balance, and another CPU could | ||||||
| 		 * alter this_rq | 		 * alter this_rq | ||||||
| 		 */ | 		 */ | ||||||
|  | 		push_task = NULL; | ||||||
| 		double_lock_balance(this_rq, src_rq); | 		double_lock_balance(this_rq, src_rq); | ||||||
| 
 | 
 | ||||||
| 		/*
 | 		/*
 | ||||||
|  | @ -2194,11 +2224,14 @@ static void pull_rt_task(struct rq *this_rq) | ||||||
| 			if (p->prio < src_rq->curr->prio) | 			if (p->prio < src_rq->curr->prio) | ||||||
| 				goto skip; | 				goto skip; | ||||||
| 
 | 
 | ||||||
| 			resched = true; | 			if (is_migration_disabled(p)) { | ||||||
| 
 | 				push_task = get_push_task(src_rq); | ||||||
| 			deactivate_task(src_rq, p, 0); | 			} else { | ||||||
| 			set_task_cpu(p, this_cpu); | 				deactivate_task(src_rq, p, 0); | ||||||
| 			activate_task(this_rq, p, 0); | 				set_task_cpu(p, this_cpu); | ||||||
|  | 				activate_task(this_rq, p, 0); | ||||||
|  | 				resched = true; | ||||||
|  | 			} | ||||||
| 			/*
 | 			/*
 | ||||||
| 			 * We continue with the search, just in | 			 * We continue with the search, just in | ||||||
| 			 * case there's an even higher prio task | 			 * case there's an even higher prio task | ||||||
|  | @ -2208,6 +2241,13 @@ static void pull_rt_task(struct rq *this_rq) | ||||||
| 		} | 		} | ||||||
| skip: | skip: | ||||||
| 		double_unlock_balance(this_rq, src_rq); | 		double_unlock_balance(this_rq, src_rq); | ||||||
|  | 
 | ||||||
|  | 		if (push_task) { | ||||||
|  | 			raw_spin_unlock(&this_rq->lock); | ||||||
|  | 			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, | ||||||
|  | 					    push_task, &src_rq->push_work); | ||||||
|  | 			raw_spin_lock(&this_rq->lock); | ||||||
|  | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	if (resched) | 	if (resched) | ||||||
|  | @ -2429,8 +2469,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) | ||||||
| 		return 0; | 		return 0; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| const struct sched_class rt_sched_class | DEFINE_SCHED_CLASS(rt) = { | ||||||
| 	__section("__rt_sched_class") = { | 
 | ||||||
| 	.enqueue_task		= enqueue_task_rt, | 	.enqueue_task		= enqueue_task_rt, | ||||||
| 	.dequeue_task		= dequeue_task_rt, | 	.dequeue_task		= dequeue_task_rt, | ||||||
| 	.yield_task		= yield_task_rt, | 	.yield_task		= yield_task_rt, | ||||||
|  | @ -2449,6 +2489,7 @@ const struct sched_class rt_sched_class | ||||||
| 	.rq_offline             = rq_offline_rt, | 	.rq_offline             = rq_offline_rt, | ||||||
| 	.task_woken		= task_woken_rt, | 	.task_woken		= task_woken_rt, | ||||||
| 	.switched_from		= switched_from_rt, | 	.switched_from		= switched_from_rt, | ||||||
|  | 	.find_lock_rq		= find_lock_lowest_rq, | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| 	.task_tick		= task_tick_rt, | 	.task_tick		= task_tick_rt, | ||||||
|  |  | ||||||
|  | @ -67,7 +67,6 @@ | ||||||
| #include <linux/tsacct_kern.h> | #include <linux/tsacct_kern.h> | ||||||
| 
 | 
 | ||||||
| #include <asm/tlb.h> | #include <asm/tlb.h> | ||||||
| #include <asm-generic/vmlinux.lds.h> |  | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_PARAVIRT | #ifdef CONFIG_PARAVIRT | ||||||
| # include <asm/paravirt.h> | # include <asm/paravirt.h> | ||||||
|  | @ -257,30 +256,6 @@ struct rt_bandwidth { | ||||||
| 
 | 
 | ||||||
| void __dl_clear_params(struct task_struct *p); | void __dl_clear_params(struct task_struct *p); | ||||||
| 
 | 
 | ||||||
| /*
 |  | ||||||
|  * To keep the bandwidth of -deadline tasks and groups under control |  | ||||||
|  * we need some place where: |  | ||||||
|  *  - store the maximum -deadline bandwidth of the system (the group); |  | ||||||
|  *  - cache the fraction of that bandwidth that is currently allocated. |  | ||||||
|  * |  | ||||||
|  * This is all done in the data structure below. It is similar to the |  | ||||||
|  * one used for RT-throttling (rt_bandwidth), with the main difference |  | ||||||
|  * that, since here we are only interested in admission control, we |  | ||||||
|  * do not decrease any runtime while the group "executes", neither we |  | ||||||
|  * need a timer to replenish it. |  | ||||||
|  * |  | ||||||
|  * With respect to SMP, the bandwidth is given on a per-CPU basis, |  | ||||||
|  * meaning that: |  | ||||||
|  *  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; |  | ||||||
|  *  - dl_total_bw array contains, in the i-eth element, the currently |  | ||||||
|  *    allocated bandwidth on the i-eth CPU. |  | ||||||
|  * Moreover, groups consume bandwidth on each CPU, while tasks only |  | ||||||
|  * consume bandwidth on the CPU they're running on. |  | ||||||
|  * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw |  | ||||||
|  * that will be shown the next time the proc or cgroup controls will |  | ||||||
|  * be red. It on its turn can be changed by writing on its own |  | ||||||
|  * control. |  | ||||||
|  */ |  | ||||||
| struct dl_bandwidth { | struct dl_bandwidth { | ||||||
| 	raw_spinlock_t		dl_runtime_lock; | 	raw_spinlock_t		dl_runtime_lock; | ||||||
| 	u64			dl_runtime; | 	u64			dl_runtime; | ||||||
|  | @ -292,6 +267,24 @@ static inline int dl_bandwidth_enabled(void) | ||||||
| 	return sysctl_sched_rt_runtime >= 0; | 	return sysctl_sched_rt_runtime >= 0; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * To keep the bandwidth of -deadline tasks under control | ||||||
|  |  * we need a place where we: | ||||||
|  |  *  - store the maximum -deadline bandwidth of each cpu; | ||||||
|  |  *  - cache the fraction of bandwidth that is currently allocated in | ||||||
|  |  *    each root domain; | ||||||
|  |  * | ||||||
|  |  * This is all done in the data structure below. It is similar to the | ||||||
|  |  * one used for RT-throttling (rt_bandwidth), with the main difference | ||||||
|  |  * that, since here we are only interested in admission control, we | ||||||
|  |  * do not decrease any runtime while the group "executes", nor do we | ||||||
|  |  * need a timer to replenish it. | ||||||
|  |  * | ||||||
|  |  * With respect to SMP, bandwidth is given on a per root domain basis, | ||||||
|  |  * meaning that: | ||||||
|  |  *  - bw (< 100%) is the deadline bandwidth of each CPU; | ||||||
|  |  *  - total_bw is the currently allocated bandwidth in each root domain; | ||||||
|  |  */ | ||||||
| struct dl_bw { | struct dl_bw { | ||||||
| 	raw_spinlock_t		lock; | 	raw_spinlock_t		lock; | ||||||
| 	u64			bw; | 	u64			bw; | ||||||
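The relocated comment above describes per-root-domain deadline admission control: bw is the per-CPU limit and total_bw the bandwidth already reserved in the root domain. A minimal sketch of the test this structure supports; the helper name is invented, and the real in-kernel check additionally accounts for CPU capacity and for any reservation being replaced:

#include <stdint.h>

typedef uint64_t u64;

/*
 * Sketch only: a new -deadline reservation fits if the bandwidth already
 * allocated in the root domain plus the new request stays within the
 * per-CPU limit times the number of CPUs spanned by that root domain.
 */
static inline int dl_bw_fits(u64 bw_per_cpu, u64 total_bw, int cpus, u64 new_bw)
{
	return total_bw + new_bw <= bw_per_cpu * (u64)cpus;
}

/* e.g. with bw = 95% per CPU and 4 CPUs, there is 380% to hand out in total */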
|  | @ -801,6 +794,15 @@ struct root_domain { | ||||||
| 	struct dl_bw		dl_bw; | 	struct dl_bw		dl_bw; | ||||||
| 	struct cpudl		cpudl; | 	struct cpudl		cpudl; | ||||||
| 
 | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * Indicate whether a root_domain's dl_bw has been checked or | ||||||
|  | 	 * updated. It is a monotonically increasing value. | ||||||
|  | 	 * | ||||||
|  | 	 * Wrap-around is a theoretical corner case, but a u64 is big | ||||||
|  | 	 * enough that it is not a practical concern. | ||||||
|  | 	 */ | ||||||
|  | 	u64 visit_gen; | ||||||
|  | 
 | ||||||
| #ifdef HAVE_RT_PUSH_IPI | #ifdef HAVE_RT_PUSH_IPI | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * For IPI pull requests, loop across the rto_mask. | 	 * For IPI pull requests, loop across the rto_mask. | ||||||
|  | @ -973,6 +975,7 @@ struct rq { | ||||||
| 	unsigned long		cpu_capacity_orig; | 	unsigned long		cpu_capacity_orig; | ||||||
| 
 | 
 | ||||||
| 	struct callback_head	*balance_callback; | 	struct callback_head	*balance_callback; | ||||||
|  | 	unsigned char		balance_flags; | ||||||
| 
 | 
 | ||||||
| 	unsigned char		nohz_idle_balance; | 	unsigned char		nohz_idle_balance; | ||||||
| 	unsigned char		idle_balance; | 	unsigned char		idle_balance; | ||||||
|  | @ -1003,6 +1006,10 @@ struct rq { | ||||||
| 
 | 
 | ||||||
| 	/* This is used to determine avg_idle's max value */ | 	/* This is used to determine avg_idle's max value */ | ||||||
| 	u64			max_idle_balance_cost; | 	u64			max_idle_balance_cost; | ||||||
|  | 
 | ||||||
|  | #ifdef CONFIG_HOTPLUG_CPU | ||||||
|  | 	struct rcuwait		hotplug_wait; | ||||||
|  | #endif | ||||||
| #endif /* CONFIG_SMP */ | #endif /* CONFIG_SMP */ | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_IRQ_TIME_ACCOUNTING | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||||||
|  | @ -1048,6 +1055,12 @@ struct rq { | ||||||
| 	/* Must be inspected within a rcu lock section */ | 	/* Must be inspected within a rcu lock section */ | ||||||
| 	struct cpuidle_state	*idle_state; | 	struct cpuidle_state	*idle_state; | ||||||
| #endif | #endif | ||||||
|  | 
 | ||||||
|  | #ifdef CONFIG_SMP | ||||||
|  | 	unsigned int		nr_pinned; | ||||||
|  | #endif | ||||||
|  | 	unsigned int		push_busy; | ||||||
|  | 	struct cpu_stop_work	push_work; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_FAIR_GROUP_SCHED | #ifdef CONFIG_FAIR_GROUP_SCHED | ||||||
|  | @ -1075,6 +1088,16 @@ static inline int cpu_of(struct rq *rq) | ||||||
| #endif | #endif | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | #define MDF_PUSH	0x01 | ||||||
|  | 
 | ||||||
|  | static inline bool is_migration_disabled(struct task_struct *p) | ||||||
|  | { | ||||||
|  | #ifdef CONFIG_SMP | ||||||
|  | 	return p->migration_disabled; | ||||||
|  | #else | ||||||
|  | 	return false; | ||||||
|  | #endif | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_SCHED_SMT | #ifdef CONFIG_SCHED_SMT | ||||||
| extern void __update_idle_core(struct rq *rq); | extern void __update_idle_core(struct rq *rq); | ||||||
|  | @ -1221,6 +1244,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) | ||||||
| 	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); | 	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); | ||||||
| 	rf->clock_update_flags = 0; | 	rf->clock_update_flags = 0; | ||||||
| #endif | #endif | ||||||
|  | #ifdef CONFIG_SMP | ||||||
|  | 	SCHED_WARN_ON(rq->balance_callback); | ||||||
|  | #endif | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) | static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) | ||||||
|  | @ -1382,6 +1408,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_SMP | #ifdef CONFIG_SMP | ||||||
| 
 | 
 | ||||||
|  | #define BALANCE_WORK	0x01 | ||||||
|  | #define BALANCE_PUSH	0x02 | ||||||
|  | 
 | ||||||
| static inline void | static inline void | ||||||
| queue_balance_callback(struct rq *rq, | queue_balance_callback(struct rq *rq, | ||||||
| 		       struct callback_head *head, | 		       struct callback_head *head, | ||||||
|  | @ -1389,12 +1418,13 @@ queue_balance_callback(struct rq *rq, | ||||||
| { | { | ||||||
| 	lockdep_assert_held(&rq->lock); | 	lockdep_assert_held(&rq->lock); | ||||||
| 
 | 
 | ||||||
| 	if (unlikely(head->next)) | 	if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH))) | ||||||
| 		return; | 		return; | ||||||
| 
 | 
 | ||||||
| 	head->func = (void (*)(struct callback_head *))func; | 	head->func = (void (*)(struct callback_head *))func; | ||||||
| 	head->next = rq->balance_callback; | 	head->next = rq->balance_callback; | ||||||
| 	rq->balance_callback = head; | 	rq->balance_callback = head; | ||||||
|  | 	rq->balance_flags |= BALANCE_WORK; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #define rcu_dereference_check_sched_domain(p) \ | #define rcu_dereference_check_sched_domain(p) \ | ||||||
|  | @ -1714,13 +1744,20 @@ static inline int task_on_rq_migrating(struct task_struct *p) | ||||||
| 	return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; | 	return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /*
 | /* Wake flags. The first three directly map to some SD flag value */ | ||||||
|  * wake flags | #define WF_EXEC     0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ | ||||||
|  */ | #define WF_FORK     0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ | ||||||
| #define WF_SYNC			0x01		/* Waker goes to sleep after wakeup */ | #define WF_TTWU     0x08 /* Wakeup;            maps to SD_BALANCE_WAKE */ | ||||||
| #define WF_FORK			0x02		/* Child wakeup after fork */ | 
 | ||||||
| #define WF_MIGRATED		0x04		/* Internal use, task got migrated */ | #define WF_SYNC     0x10 /* Waker goes to sleep after wakeup */ | ||||||
| #define WF_ON_CPU		0x08		/* Wakee is on_cpu */ | #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ | ||||||
|  | #define WF_ON_CPU   0x40 /* Wakee is on_cpu */ | ||||||
|  | 
 | ||||||
|  | #ifdef CONFIG_SMP | ||||||
|  | static_assert(WF_EXEC == SD_BALANCE_EXEC); | ||||||
|  | static_assert(WF_FORK == SD_BALANCE_FORK); | ||||||
|  | static_assert(WF_TTWU == SD_BALANCE_WAKE); | ||||||
|  | #endif | ||||||
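The wake flags are renumbered so the first three coincide bit-for-bit with the SD_BALANCE_* flags they correspond to, which the static_asserts above pin down; that equality is what lets select_task_rq_*() take a single flags word instead of a separate sd_flag argument. A stand-alone model of the trick (the numeric values are taken from this hunk, everything else is illustrative):

#include <assert.h>
#include <stdio.h>

/* illustrative copies; in the kernel both sets come from the headers */
#define SD_BALANCE_EXEC	0x02
#define SD_BALANCE_FORK	0x04
#define SD_BALANCE_WAKE	0x08

#define WF_EXEC		0x02
#define WF_FORK		0x04
#define WF_TTWU		0x08

/* if the values ever drift apart, translation code would be needed again */
static_assert(WF_TTWU == SD_BALANCE_WAKE, "wake flags must mirror SD flags");

int main(void)
{
	int wake_flags = WF_TTWU;

	/* a wake flag can be tested directly where an SD balance bit is expected */
	printf("%d\n", !!(wake_flags & SD_BALANCE_WAKE));
	return 0;
}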
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * To aid in avoiding the subversion of "niceness" due to uneven distribution |  * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||||||
|  | @ -1796,16 +1833,19 @@ struct sched_class { | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_SMP | #ifdef CONFIG_SMP | ||||||
| 	int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); | 	int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); | ||||||
| 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); | 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); | ||||||
| 	void (*migrate_task_rq)(struct task_struct *p, int new_cpu); | 	void (*migrate_task_rq)(struct task_struct *p, int new_cpu); | ||||||
| 
 | 
 | ||||||
| 	void (*task_woken)(struct rq *this_rq, struct task_struct *task); | 	void (*task_woken)(struct rq *this_rq, struct task_struct *task); | ||||||
| 
 | 
 | ||||||
| 	void (*set_cpus_allowed)(struct task_struct *p, | 	void (*set_cpus_allowed)(struct task_struct *p, | ||||||
| 				 const struct cpumask *newmask); | 				 const struct cpumask *newmask, | ||||||
|  | 				 u32 flags); | ||||||
| 
 | 
 | ||||||
| 	void (*rq_online)(struct rq *rq); | 	void (*rq_online)(struct rq *rq); | ||||||
| 	void (*rq_offline)(struct rq *rq); | 	void (*rq_offline)(struct rq *rq); | ||||||
|  | 
 | ||||||
|  | 	struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| 	void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); | 	void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); | ||||||
|  | @ -1833,7 +1873,7 @@ struct sched_class { | ||||||
| #ifdef CONFIG_FAIR_GROUP_SCHED | #ifdef CONFIG_FAIR_GROUP_SCHED | ||||||
| 	void (*task_change_group)(struct task_struct *p, int type); | 	void (*task_change_group)(struct task_struct *p, int type); | ||||||
| #endif | #endif | ||||||
| } __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */ | }; | ||||||
| 
 | 
 | ||||||
| static inline void put_prev_task(struct rq *rq, struct task_struct *prev) | static inline void put_prev_task(struct rq *rq, struct task_struct *prev) | ||||||
| { | { | ||||||
|  | @ -1847,6 +1887,20 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) | ||||||
| 	next->sched_class->set_next_task(rq, next, false); | 	next->sched_class->set_next_task(rq, next, false); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Helper to define a sched_class instance; each one is placed in a separate | ||||||
|  |  * section which is ordered by the linker script: | ||||||
|  |  * | ||||||
|  |  *   include/asm-generic/vmlinux.lds.h | ||||||
|  |  * | ||||||
|  |  * Also enforce alignment on the instance, not the type, to guarantee layout. | ||||||
|  |  */ | ||||||
|  | #define DEFINE_SCHED_CLASS(name) \ | ||||||
|  | const struct sched_class name##_sched_class \ | ||||||
|  | 	__aligned(__alignof__(struct sched_class)) \ | ||||||
|  | 	__section("__" #name "_sched_class") | ||||||
|  | 
 | ||||||
| /* Defined in include/asm-generic/vmlinux.lds.h */ | /* Defined in include/asm-generic/vmlinux.lds.h */ | ||||||
| extern struct sched_class __begin_sched_classes[]; | extern struct sched_class __begin_sched_classes[]; | ||||||
| extern struct sched_class __end_sched_classes[]; | extern struct sched_class __end_sched_classes[]; | ||||||
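DEFINE_SCHED_CLASS() replaces the open-coded __section() plus type-level __aligned(STRUCT_ALIGNMENT) pattern: the instance itself is aligned and dropped into its own "__<name>_sched_class" section, and the linker script collects those sections in priority order between __begin_sched_classes and __end_sched_classes so the core can walk them like an array. A paraphrased sketch of that walk, assuming helpers of roughly this shape sit next to the externs above (the real macros may differ in detail):

/* paraphrased sketch, not a verbatim copy of the in-tree helpers */
#define sched_class_lowest	(__begin_sched_classes)
#define sched_class_highest	(__end_sched_classes - 1)

#define for_each_class(class) \
	for (class = sched_class_highest; class >= sched_class_lowest; class--)

/* iteration starts at the highest-priority class (stop) and walks down to idle */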
|  | @ -1889,13 +1943,35 @@ static inline bool sched_fair_runnable(struct rq *rq) | ||||||
| extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); | extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); | ||||||
| extern struct task_struct *pick_next_task_idle(struct rq *rq); | extern struct task_struct *pick_next_task_idle(struct rq *rq); | ||||||
| 
 | 
 | ||||||
|  | #define SCA_CHECK		0x01 | ||||||
|  | #define SCA_MIGRATE_DISABLE	0x02 | ||||||
|  | #define SCA_MIGRATE_ENABLE	0x04 | ||||||
|  | 
 | ||||||
| #ifdef CONFIG_SMP | #ifdef CONFIG_SMP | ||||||
| 
 | 
 | ||||||
| extern void update_group_capacity(struct sched_domain *sd, int cpu); | extern void update_group_capacity(struct sched_domain *sd, int cpu); | ||||||
| 
 | 
 | ||||||
| extern void trigger_load_balance(struct rq *rq); | extern void trigger_load_balance(struct rq *rq); | ||||||
| 
 | 
 | ||||||
| extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); | extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); | ||||||
|  | 
 | ||||||
|  | static inline struct task_struct *get_push_task(struct rq *rq) | ||||||
|  | { | ||||||
|  | 	struct task_struct *p = rq->curr; | ||||||
|  | 
 | ||||||
|  | 	lockdep_assert_held(&rq->lock); | ||||||
|  | 
 | ||||||
|  | 	if (rq->push_busy) | ||||||
|  | 		return NULL; | ||||||
|  | 
 | ||||||
|  | 	if (p->nr_cpus_allowed == 1) | ||||||
|  | 		return NULL; | ||||||
|  | 
 | ||||||
|  | 	rq->push_busy = true; | ||||||
|  | 	return get_task_struct(p); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | extern int push_cpu_stop(void *arg); | ||||||
| 
 | 
 | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -11,7 +11,7 @@ | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_SMP | #ifdef CONFIG_SMP | ||||||
| static int | static int | ||||||
| select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) | select_task_rq_stop(struct task_struct *p, int cpu, int flags) | ||||||
| { | { | ||||||
| 	return task_cpu(p); /* stop tasks as never migrate */ | 	return task_cpu(p); /* stop tasks as never migrate */ | ||||||
| } | } | ||||||
|  | @ -109,8 +109,7 @@ static void update_curr_stop(struct rq *rq) | ||||||
| /*
 | /*
 | ||||||
|  * Simple, special scheduling class for the per-CPU stop tasks: |  * Simple, special scheduling class for the per-CPU stop tasks: | ||||||
|  */ |  */ | ||||||
| const struct sched_class stop_sched_class | DEFINE_SCHED_CLASS(stop) = { | ||||||
| 	__section("__stop_sched_class") = { |  | ||||||
| 
 | 
 | ||||||
| 	.enqueue_task		= enqueue_task_stop, | 	.enqueue_task		= enqueue_task_stop, | ||||||
| 	.dequeue_task		= dequeue_task_stop, | 	.dequeue_task		= dequeue_task_stop, | ||||||
|  |  | ||||||
|  | @ -211,6 +211,15 @@ unsigned int sysctl_sched_energy_aware = 1; | ||||||
| DEFINE_MUTEX(sched_energy_mutex); | DEFINE_MUTEX(sched_energy_mutex); | ||||||
| bool sched_energy_update; | bool sched_energy_update; | ||||||
| 
 | 
 | ||||||
|  | void rebuild_sched_domains_energy(void) | ||||||
|  | { | ||||||
|  | 	mutex_lock(&sched_energy_mutex); | ||||||
|  | 	sched_energy_update = true; | ||||||
|  | 	rebuild_sched_domains(); | ||||||
|  | 	sched_energy_update = false; | ||||||
|  | 	mutex_unlock(&sched_energy_mutex); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| #ifdef CONFIG_PROC_SYSCTL | #ifdef CONFIG_PROC_SYSCTL | ||||||
| int sched_energy_aware_handler(struct ctl_table *table, int write, | int sched_energy_aware_handler(struct ctl_table *table, int write, | ||||||
| 		void *buffer, size_t *lenp, loff_t *ppos) | 		void *buffer, size_t *lenp, loff_t *ppos) | ||||||
|  | @ -223,13 +232,8 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, | ||||||
| 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||||||
| 	if (!ret && write) { | 	if (!ret && write) { | ||||||
| 		state = static_branch_unlikely(&sched_energy_present); | 		state = static_branch_unlikely(&sched_energy_present); | ||||||
| 		if (state != sysctl_sched_energy_aware) { | 		if (state != sysctl_sched_energy_aware) | ||||||
| 			mutex_lock(&sched_energy_mutex); | 			rebuild_sched_domains_energy(); | ||||||
| 			sched_energy_update = 1; |  | ||||||
| 			rebuild_sched_domains(); |  | ||||||
| 			sched_energy_update = 0; |  | ||||||
| 			mutex_unlock(&sched_energy_mutex); |  | ||||||
| 		} |  | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	return ret; | 	return ret; | ||||||
|  | @ -324,6 +328,7 @@ static void sched_energy_set(bool has_eas) | ||||||
|  *    3. no SMT is detected. |  *    3. no SMT is detected. | ||||||
|  *    4. the EM complexity is low enough to keep scheduling overheads low; |  *    4. the EM complexity is low enough to keep scheduling overheads low; | ||||||
|  *    5. schedutil is driving the frequency of all CPUs of the rd; |  *    5. schedutil is driving the frequency of all CPUs of the rd; | ||||||
|  |  *    6. frequency invariance support is present; | ||||||
|  * |  * | ||||||
|  * The complexity of the Energy Model is defined as: |  * The complexity of the Energy Model is defined as: | ||||||
|  * |  * | ||||||
|  | @ -372,6 +377,14 @@ static bool build_perf_domains(const struct cpumask *cpu_map) | ||||||
| 		goto free; | 		goto free; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	if (!arch_scale_freq_invariant()) { | ||||||
|  | 		if (sched_debug()) { | ||||||
|  | 			pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported", | ||||||
|  | 				cpumask_pr_args(cpu_map)); | ||||||
|  | 		} | ||||||
|  | 		goto free; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	for_each_cpu(i, cpu_map) { | 	for_each_cpu(i, cpu_map) { | ||||||
| 		/* Skip already covered CPUs. */ | 		/* Skip already covered CPUs. */ | ||||||
| 		if (find_pd(pd, i)) | 		if (find_pd(pd, i)) | ||||||
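Condition 6 in the list above is enforced by the new check: without arch_scale_freq_invariant(), the utilization signals the Energy Model consumes are not comparable across frequencies, so EAS stays disabled for the root domain. A stand-alone illustration of what frequency invariance means for those signals (not kernel code; SCHED_CAPACITY_SCALE matches the kernel's 1024):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024

static unsigned long freq_scale(unsigned long cur_khz, unsigned long max_khz)
{
	return cur_khz * SCHED_CAPACITY_SCALE / max_khz;
}

static unsigned long invariant_util(unsigned long raw_util,
				    unsigned long cur_khz, unsigned long max_khz)
{
	return raw_util * freq_scale(cur_khz, max_khz) / SCHED_CAPACITY_SCALE;
}

int main(void)
{
	/* 50% busy at half frequency is roughly 25% of the CPU's full capacity */
	printf("%lu\n", invariant_util(512, 1000000, 2000000));
	return 0;
}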
|  | @ -516,6 +529,7 @@ static int init_rootdomain(struct root_domain *rd) | ||||||
| 	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); | 	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  | 	rd->visit_gen = 0; | ||||||
| 	init_dl_bw(&rd->dl_bw); | 	init_dl_bw(&rd->dl_bw); | ||||||
| 	if (cpudl_init(&rd->cpudl) != 0) | 	if (cpudl_init(&rd->cpudl) != 0) | ||||||
| 		goto free_rto_mask; | 		goto free_rto_mask; | ||||||
|  | @ -674,6 +688,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | ||||||
| { | { | ||||||
| 	struct rq *rq = cpu_rq(cpu); | 	struct rq *rq = cpu_rq(cpu); | ||||||
| 	struct sched_domain *tmp; | 	struct sched_domain *tmp; | ||||||
|  | 	int numa_distance = 0; | ||||||
| 
 | 
 | ||||||
| 	/* Remove the sched domains which do not contribute to scheduling. */ | 	/* Remove the sched domains which do not contribute to scheduling. */ | ||||||
| 	for (tmp = sd; tmp; ) { | 	for (tmp = sd; tmp; ) { | ||||||
|  | @ -705,6 +720,38 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | ||||||
| 			sd->child = NULL; | 			sd->child = NULL; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	for (tmp = sd; tmp; tmp = tmp->parent) | ||||||
|  | 		numa_distance += !!(tmp->flags & SD_NUMA); | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * FIXME: Diameter >=3 is misrepresented. | ||||||
|  | 	 * | ||||||
|  | 	 * Smallest diameter=3 topology is: | ||||||
|  | 	 * | ||||||
|  | 	 *   node   0   1   2   3 | ||||||
|  | 	 *     0:  10  20  30  40 | ||||||
|  | 	 *     1:  20  10  20  30 | ||||||
|  | 	 *     2:  30  20  10  20 | ||||||
|  | 	 *     3:  40  30  20  10 | ||||||
|  | 	 * | ||||||
|  | 	 *   0 --- 1 --- 2 --- 3 | ||||||
|  | 	 * | ||||||
|  | 	 * NUMA-3	0-3		N/A		N/A		0-3 | ||||||
|  | 	 *  groups:	{0-2},{1-3}					{1-3},{0-2} | ||||||
|  | 	 * | ||||||
|  | 	 * NUMA-2	0-2		0-3		0-3		1-3 | ||||||
|  | 	 *  groups:	{0-1},{1-3}	{0-2},{2-3}	{1-3},{0-1}	{2-3},{0-2} | ||||||
|  | 	 * | ||||||
|  | 	 * NUMA-1	0-1		0-2		1-3		2-3 | ||||||
|  | 	 *  groups:	{0},{1}		{1},{2},{0}	{2},{3},{1}	{3},{2} | ||||||
|  | 	 * | ||||||
|  | 	 * NUMA-0	0		1		2		3 | ||||||
|  | 	 * | ||||||
|  | 	 * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the | ||||||
|  | 	 * group span isn't a subset of the domain span. | ||||||
|  | 	 */ | ||||||
|  | 	WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n"); | ||||||
|  | 
 | ||||||
| 	sched_domain_debug(sd, cpu); | 	sched_domain_debug(sd, cpu); | ||||||
| 
 | 
 | ||||||
| 	rq_attach_root(rq, rd); | 	rq_attach_root(rq, rd); | ||||||
|  |  | ||||||
							
								
								
									
kernel/smp.c (52 changed lines)
							|  | @ -27,7 +27,7 @@ | ||||||
| #include "smpboot.h" | #include "smpboot.h" | ||||||
| #include "sched/smp.h" | #include "sched/smp.h" | ||||||
| 
 | 
 | ||||||
| #define CSD_TYPE(_csd)	((_csd)->flags & CSD_FLAG_TYPE_MASK) | #define CSD_TYPE(_csd)	((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK) | ||||||
| 
 | 
 | ||||||
| struct call_function_data { | struct call_function_data { | ||||||
| 	call_single_data_t	__percpu *csd; | 	call_single_data_t	__percpu *csd; | ||||||
|  | @ -130,7 +130,7 @@ static __always_inline int csd_lock_wait_getcpu(call_single_data_t *csd) | ||||||
| 
 | 
 | ||||||
| 	csd_type = CSD_TYPE(csd); | 	csd_type = CSD_TYPE(csd); | ||||||
| 	if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC) | 	if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC) | ||||||
| 		return csd->dst; /* Other CSD_TYPE_ values might not have ->dst. */ | 		return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */ | ||||||
| 	return -1; | 	return -1; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -146,7 +146,7 @@ static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 t | ||||||
| 	bool firsttime; | 	bool firsttime; | ||||||
| 	u64 ts2, ts_delta; | 	u64 ts2, ts_delta; | ||||||
| 	call_single_data_t *cpu_cur_csd; | 	call_single_data_t *cpu_cur_csd; | ||||||
| 	unsigned int flags = READ_ONCE(csd->flags); | 	unsigned int flags = READ_ONCE(csd->node.u_flags); | ||||||
| 
 | 
 | ||||||
| 	if (!(flags & CSD_FLAG_LOCK)) { | 	if (!(flags & CSD_FLAG_LOCK)) { | ||||||
| 		if (!unlikely(*bug_id)) | 		if (!unlikely(*bug_id)) | ||||||
|  | @ -224,14 +224,14 @@ static void csd_lock_record(call_single_data_t *csd) | ||||||
| 
 | 
 | ||||||
| static __always_inline void csd_lock_wait(call_single_data_t *csd) | static __always_inline void csd_lock_wait(call_single_data_t *csd) | ||||||
| { | { | ||||||
| 	smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); | 	smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| static __always_inline void csd_lock(call_single_data_t *csd) | static __always_inline void csd_lock(call_single_data_t *csd) | ||||||
| { | { | ||||||
| 	csd_lock_wait(csd); | 	csd_lock_wait(csd); | ||||||
| 	csd->flags |= CSD_FLAG_LOCK; | 	csd->node.u_flags |= CSD_FLAG_LOCK; | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * prevent CPU from reordering the above assignment | 	 * prevent CPU from reordering the above assignment | ||||||
|  | @ -243,12 +243,12 @@ static __always_inline void csd_lock(call_single_data_t *csd) | ||||||
| 
 | 
 | ||||||
| static __always_inline void csd_unlock(call_single_data_t *csd) | static __always_inline void csd_unlock(call_single_data_t *csd) | ||||||
| { | { | ||||||
| 	WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); | 	WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK)); | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * ensure we're all done before releasing data: | 	 * ensure we're all done before releasing data: | ||||||
| 	 */ | 	 */ | ||||||
| 	smp_store_release(&csd->flags, 0); | 	smp_store_release(&csd->node.u_flags, 0); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); | static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); | ||||||
|  | @ -300,7 +300,7 @@ static int generic_exec_single(int cpu, call_single_data_t *csd) | ||||||
| 		return -ENXIO; | 		return -ENXIO; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	__smp_call_single_queue(cpu, &csd->llist); | 	__smp_call_single_queue(cpu, &csd->node.llist); | ||||||
| 
 | 
 | ||||||
| 	return 0; | 	return 0; | ||||||
| } | } | ||||||
|  | @ -353,7 +353,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) | ||||||
| 		 * We don't have to use the _safe() variant here | 		 * We don't have to use the _safe() variant here | ||||||
| 		 * because we are not invoking the IPI handlers yet. | 		 * because we are not invoking the IPI handlers yet. | ||||||
| 		 */ | 		 */ | ||||||
| 		llist_for_each_entry(csd, entry, llist) { | 		llist_for_each_entry(csd, entry, node.llist) { | ||||||
| 			switch (CSD_TYPE(csd)) { | 			switch (CSD_TYPE(csd)) { | ||||||
| 			case CSD_TYPE_ASYNC: | 			case CSD_TYPE_ASYNC: | ||||||
| 			case CSD_TYPE_SYNC: | 			case CSD_TYPE_SYNC: | ||||||
|  | @ -378,16 +378,16 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) | ||||||
| 	 * First; run all SYNC callbacks, people are waiting for us. | 	 * First; run all SYNC callbacks, people are waiting for us. | ||||||
| 	 */ | 	 */ | ||||||
| 	prev = NULL; | 	prev = NULL; | ||||||
| 	llist_for_each_entry_safe(csd, csd_next, entry, llist) { | 	llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { | ||||||
| 		/* Do we wait until *after* callback? */ | 		/* Do we wait until *after* callback? */ | ||||||
| 		if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { | 		if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { | ||||||
| 			smp_call_func_t func = csd->func; | 			smp_call_func_t func = csd->func; | ||||||
| 			void *info = csd->info; | 			void *info = csd->info; | ||||||
| 
 | 
 | ||||||
| 			if (prev) { | 			if (prev) { | ||||||
| 				prev->next = &csd_next->llist; | 				prev->next = &csd_next->node.llist; | ||||||
| 			} else { | 			} else { | ||||||
| 				entry = &csd_next->llist; | 				entry = &csd_next->node.llist; | ||||||
| 			} | 			} | ||||||
| 
 | 
 | ||||||
| 			csd_lock_record(csd); | 			csd_lock_record(csd); | ||||||
|  | @ -395,7 +395,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) | ||||||
| 			csd_unlock(csd); | 			csd_unlock(csd); | ||||||
| 			csd_lock_record(NULL); | 			csd_lock_record(NULL); | ||||||
| 		} else { | 		} else { | ||||||
| 			prev = &csd->llist; | 			prev = &csd->node.llist; | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | @ -406,14 +406,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) | ||||||
| 	 * Second; run all !SYNC callbacks. | 	 * Second; run all !SYNC callbacks. | ||||||
| 	 */ | 	 */ | ||||||
| 	prev = NULL; | 	prev = NULL; | ||||||
| 	llist_for_each_entry_safe(csd, csd_next, entry, llist) { | 	llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { | ||||||
| 		int type = CSD_TYPE(csd); | 		int type = CSD_TYPE(csd); | ||||||
| 
 | 
 | ||||||
| 		if (type != CSD_TYPE_TTWU) { | 		if (type != CSD_TYPE_TTWU) { | ||||||
| 			if (prev) { | 			if (prev) { | ||||||
| 				prev->next = &csd_next->llist; | 				prev->next = &csd_next->node.llist; | ||||||
| 			} else { | 			} else { | ||||||
| 				entry = &csd_next->llist; | 				entry = &csd_next->node.llist; | ||||||
| 			} | 			} | ||||||
| 
 | 
 | ||||||
| 			if (type == CSD_TYPE_ASYNC) { | 			if (type == CSD_TYPE_ASYNC) { | ||||||
|  | @ -429,7 +429,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) | ||||||
| 			} | 			} | ||||||
| 
 | 
 | ||||||
| 		} else { | 		} else { | ||||||
| 			prev = &csd->llist; | 			prev = &csd->node.llist; | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | @ -465,7 +465,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, | ||||||
| { | { | ||||||
| 	call_single_data_t *csd; | 	call_single_data_t *csd; | ||||||
| 	call_single_data_t csd_stack = { | 	call_single_data_t csd_stack = { | ||||||
| 		.flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, | 		.node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }, | ||||||
| 	}; | 	}; | ||||||
| 	int this_cpu; | 	int this_cpu; | ||||||
| 	int err; | 	int err; | ||||||
|  | @ -502,8 +502,8 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, | ||||||
| 	csd->func = func; | 	csd->func = func; | ||||||
| 	csd->info = info; | 	csd->info = info; | ||||||
| #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG | #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG | ||||||
| 	csd->src = smp_processor_id(); | 	csd->node.src = smp_processor_id(); | ||||||
| 	csd->dst = cpu; | 	csd->node.dst = cpu; | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| 	err = generic_exec_single(cpu, csd); | 	err = generic_exec_single(cpu, csd); | ||||||
|  | @ -544,12 +544,12 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) | ||||||
| 
 | 
 | ||||||
| 	preempt_disable(); | 	preempt_disable(); | ||||||
| 
 | 
 | ||||||
| 	if (csd->flags & CSD_FLAG_LOCK) { | 	if (csd->node.u_flags & CSD_FLAG_LOCK) { | ||||||
| 		err = -EBUSY; | 		err = -EBUSY; | ||||||
| 		goto out; | 		goto out; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	csd->flags = CSD_FLAG_LOCK; | 	csd->node.u_flags = CSD_FLAG_LOCK; | ||||||
| 	smp_wmb(); | 	smp_wmb(); | ||||||
| 
 | 
 | ||||||
| 	err = generic_exec_single(cpu, csd); | 	err = generic_exec_single(cpu, csd); | ||||||
|  | @ -667,14 +667,14 @@ static void smp_call_function_many_cond(const struct cpumask *mask, | ||||||
| 
 | 
 | ||||||
| 		csd_lock(csd); | 		csd_lock(csd); | ||||||
| 		if (wait) | 		if (wait) | ||||||
| 			csd->flags |= CSD_TYPE_SYNC; | 			csd->node.u_flags |= CSD_TYPE_SYNC; | ||||||
| 		csd->func = func; | 		csd->func = func; | ||||||
| 		csd->info = info; | 		csd->info = info; | ||||||
| #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG | #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG | ||||||
| 		csd->src = smp_processor_id(); | 		csd->node.src = smp_processor_id(); | ||||||
| 		csd->dst = cpu; | 		csd->node.dst = cpu; | ||||||
| #endif | #endif | ||||||
| 		if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) | 		if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) | ||||||
| 			__cpumask_set_cpu(cpu, cfd->cpumask_ipi); | 			__cpumask_set_cpu(cpu, cfd->cpumask_ipi); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -42,11 +42,27 @@ struct cpu_stopper { | ||||||
| 	struct list_head	works;		/* list of pending works */ | 	struct list_head	works;		/* list of pending works */ | ||||||
| 
 | 
 | ||||||
| 	struct cpu_stop_work	stop_work;	/* for stop_cpus */ | 	struct cpu_stop_work	stop_work;	/* for stop_cpus */ | ||||||
|  | 	unsigned long		caller; | ||||||
|  | 	cpu_stop_fn_t		fn; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); | ||||||
| static bool stop_machine_initialized = false; | static bool stop_machine_initialized = false; | ||||||
| 
 | 
 | ||||||
|  | void print_stop_info(const char *log_lvl, struct task_struct *task) | ||||||
|  | { | ||||||
|  | 	/*
 | ||||||
|  | 	 * If @task is a stopper task, it cannot migrate and task_cpu() is | ||||||
|  | 	 * stable. | ||||||
|  | 	 */ | ||||||
|  | 	struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task)); | ||||||
|  | 
 | ||||||
|  | 	if (task != stopper->thread) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| /* static data for stop_cpus */ | /* static data for stop_cpus */ | ||||||
| static DEFINE_MUTEX(stop_cpus_mutex); | static DEFINE_MUTEX(stop_cpus_mutex); | ||||||
| static bool stop_cpus_in_progress; | static bool stop_cpus_in_progress; | ||||||
|  | @ -123,7 +139,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) | ||||||
| int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) | int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) | ||||||
| { | { | ||||||
| 	struct cpu_stop_done done; | 	struct cpu_stop_done done; | ||||||
| 	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; | 	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ }; | ||||||
| 
 | 
 | ||||||
| 	cpu_stop_init_done(&done, 1); | 	cpu_stop_init_done(&done, 1); | ||||||
| 	if (!cpu_stop_queue_work(cpu, &work)) | 	if (!cpu_stop_queue_work(cpu, &work)) | ||||||
|  | @ -331,7 +347,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * | ||||||
| 	work1 = work2 = (struct cpu_stop_work){ | 	work1 = work2 = (struct cpu_stop_work){ | ||||||
| 		.fn = multi_cpu_stop, | 		.fn = multi_cpu_stop, | ||||||
| 		.arg = &msdata, | 		.arg = &msdata, | ||||||
| 		.done = &done | 		.done = &done, | ||||||
|  | 		.caller = _RET_IP_, | ||||||
| 	}; | 	}; | ||||||
| 
 | 
 | ||||||
| 	cpu_stop_init_done(&done, 2); | 	cpu_stop_init_done(&done, 2); | ||||||
|  | @ -367,7 +384,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * | ||||||
| bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, | bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, | ||||||
| 			struct cpu_stop_work *work_buf) | 			struct cpu_stop_work *work_buf) | ||||||
| { | { | ||||||
| 	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; | 	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, }; | ||||||
| 	return cpu_stop_queue_work(cpu, work_buf); | 	return cpu_stop_queue_work(cpu, work_buf); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -487,6 +504,8 @@ static void cpu_stopper_thread(unsigned int cpu) | ||||||
| 		int ret; | 		int ret; | ||||||
| 
 | 
 | ||||||
| 		/* cpu stop callbacks must not sleep, make in_atomic() == T */ | 		/* cpu stop callbacks must not sleep, make in_atomic() == T */ | ||||||
|  | 		stopper->caller = work->caller; | ||||||
|  | 		stopper->fn = fn; | ||||||
| 		preempt_count_inc(); | 		preempt_count_inc(); | ||||||
| 		ret = fn(arg); | 		ret = fn(arg); | ||||||
| 		if (done) { | 		if (done) { | ||||||
|  | @ -495,6 +514,8 @@ static void cpu_stopper_thread(unsigned int cpu) | ||||||
| 			cpu_stop_signal_done(done); | 			cpu_stop_signal_done(done); | ||||||
| 		} | 		} | ||||||
| 		preempt_count_dec(); | 		preempt_count_dec(); | ||||||
|  | 		stopper->fn = NULL; | ||||||
|  | 		stopper->caller = 0; | ||||||
| 		WARN_ONCE(preempt_count(), | 		WARN_ONCE(preempt_count(), | ||||||
| 			  "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); | 			  "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); | ||||||
| 		goto repeat; | 		goto repeat; | ||||||
|  |  | ||||||
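cpu_stop_work and cpu_stopper now carry the callback and the caller's return address (_RET_IP_), and print_stop_info() exposes them; combined with the dump_stack_print_info() hook further down, a stack dump taken on a stopper thread says which callback is running and who queued it. A kernel-style sketch of what that records (my_stop_fn and example are invented names; the printed line is only of the indicated form):

#include <linux/stop_machine.h>

static int my_stop_fn(void *arg)
{
	/* runs with preemption disabled on the target CPU's stopper thread */
	return 0;
}

static void example(void)
{
	stop_one_cpu(2, my_stop_fn, NULL);
	/*
	 * While my_stop_fn() runs, cpu_stopper.fn == my_stop_fn and
	 * cpu_stopper.caller == _RET_IP_ of this stop_one_cpu() call, so a
	 * dump_stack() on that stopper thread includes a line of the form
	 * "Stopper: my_stop_fn+0x0/0x... <- example+0x...".
	 */
}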
|  | @ -293,10 +293,8 @@ static void nohz_full_kick_func(struct irq_work *work) | ||||||
| 	/* Empty, the tick restart happens on tick_nohz_irq_exit() */ | 	/* Empty, the tick restart happens on tick_nohz_irq_exit() */ | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { | static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = | ||||||
| 	.func = nohz_full_kick_func, | 	IRQ_WORK_INIT_HARD(nohz_full_kick_func); | ||||||
| 	.flags = ATOMIC_INIT(IRQ_WORK_HARD_IRQ), |  | ||||||
| }; |  | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * Kick this CPU if it's full dynticks in order to force it to |  * Kick this CPU if it's full dynticks in order to force it to | ||||||
|  |  | ||||||
|  | @ -1096,7 +1096,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) | ||||||
| 			return -EINVAL; | 			return -EINVAL; | ||||||
| 
 | 
 | ||||||
| 		work = this_cpu_ptr(&send_signal_work); | 		work = this_cpu_ptr(&send_signal_work); | ||||||
| 		if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) | 		if (irq_work_is_busy(&work->irq_work)) | ||||||
| 			return -EBUSY; | 			return -EBUSY; | ||||||
| 
 | 
 | ||||||
| 		/* Add the current task, which is the target of sending signal,
 | 		/* Add the current task, which is the target of sending signal,
 | ||||||
|  |  | ||||||
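Both call sites above switch to helpers from the irq_work cleanup in this merge: IRQ_WORK_INIT_HARD() statically initializes a hard-irq work item without poking at ->flags by hand, and irq_work_is_busy() replaces open-coded atomic_read() of the flags. A usage sketch, assuming the helpers keep these shapes (my_irq_work_fn, my_work and kick are invented names):

#include <linux/irq_work.h>

static void my_irq_work_fn(struct irq_work *work)
{
	/* runs from hard-irq context; the _HARD variant requests that even on RT */
}

/* static initialization without touching work->flags directly */
static struct irq_work my_work = IRQ_WORK_INIT_HARD(my_irq_work_fn);

static void kick(void)
{
	/* the accessor replaces open-coded atomic_read(&work->flags) checks */
	if (!irq_work_is_busy(&my_work))
		irq_work_queue(&my_work);
}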
|  | @ -4908,6 +4908,10 @@ static void unbind_workers(int cpu) | ||||||
| 		pool->flags |= POOL_DISASSOCIATED; | 		pool->flags |= POOL_DISASSOCIATED; | ||||||
| 
 | 
 | ||||||
| 		raw_spin_unlock_irq(&pool->lock); | 		raw_spin_unlock_irq(&pool->lock); | ||||||
|  | 
 | ||||||
|  | 		for_each_pool_worker(worker, pool) | ||||||
|  | 			WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0); | ||||||
|  | 
 | ||||||
| 		mutex_unlock(&wq_pool_attach_mutex); | 		mutex_unlock(&wq_pool_attach_mutex); | ||||||
| 
 | 
 | ||||||
| 		/*
 | 		/*
 | ||||||
|  |  | ||||||
|  | @ -267,3 +267,21 @@ int cpumask_any_and_distribute(const struct cpumask *src1p, | ||||||
| 	return next; | 	return next; | ||||||
| } | } | ||||||
| EXPORT_SYMBOL(cpumask_any_and_distribute); | EXPORT_SYMBOL(cpumask_any_and_distribute); | ||||||
|  | 
 | ||||||
|  | int cpumask_any_distribute(const struct cpumask *srcp) | ||||||
|  | { | ||||||
|  | 	int next, prev; | ||||||
|  | 
 | ||||||
|  | 	/* NOTE: our first selection will skip 0. */ | ||||||
|  | 	prev = __this_cpu_read(distribute_cpu_mask_prev); | ||||||
|  | 
 | ||||||
|  | 	next = cpumask_next(prev, srcp); | ||||||
|  | 	if (next >= nr_cpu_ids) | ||||||
|  | 		next = cpumask_first(srcp); | ||||||
|  | 
 | ||||||
|  | 	if (next < nr_cpu_ids) | ||||||
|  | 		__this_cpu_write(distribute_cpu_mask_prev, next); | ||||||
|  | 
 | ||||||
|  | 	return next; | ||||||
|  | } | ||||||
|  | EXPORT_SYMBOL(cpumask_any_distribute); | ||||||
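cpumask_any_distribute() mirrors cpumask_any_and_distribute() above: a per-CPU cursor remembers the previous pick, so successive callers spread over the mask instead of all returning the first set CPU, which is what the RT push/pull paths now rely on. A stand-alone model of the selection policy (plain C; a single global cursor stands in for the per-CPU variable):

#include <stdio.h>

#define NR_CPUS 8

static int prev = -1;	/* models the per-CPU distribute_cpu_mask_prev */

static int any_distribute(unsigned long mask)
{
	for (int i = 1; i <= NR_CPUS; i++) {
		int cpu = (prev + i) % NR_CPUS;
		if (mask & (1UL << cpu)) {
			prev = cpu;
			return cpu;
		}
	}
	return NR_CPUS;		/* "nr_cpu_ids": nothing set */
}

int main(void)
{
	unsigned long mask = 0x0b;	/* CPUs 0, 1 and 3 */

	/* successive calls rotate over the set instead of always returning 0 */
	for (int i = 0; i < 4; i++)
		printf("%d ", any_distribute(mask));
	printf("\n");			/* prints: 0 1 3 0 */
	return 0;
}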
|  |  | ||||||
|  | @ -12,6 +12,7 @@ | ||||||
| #include <linux/atomic.h> | #include <linux/atomic.h> | ||||||
| #include <linux/kexec.h> | #include <linux/kexec.h> | ||||||
| #include <linux/utsname.h> | #include <linux/utsname.h> | ||||||
|  | #include <linux/stop_machine.h> | ||||||
| 
 | 
 | ||||||
| static char dump_stack_arch_desc_str[128]; | static char dump_stack_arch_desc_str[128]; | ||||||
| 
 | 
 | ||||||
|  | @ -57,6 +58,7 @@ void dump_stack_print_info(const char *log_lvl) | ||||||
| 		       log_lvl, dump_stack_arch_desc_str); | 		       log_lvl, dump_stack_arch_desc_str); | ||||||
| 
 | 
 | ||||||
| 	print_worker_info(log_lvl, current); | 	print_worker_info(log_lvl, current); | ||||||
|  | 	print_stop_info(log_lvl, current); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /**
 | /**
 | ||||||
|  |  | ||||||
|  | @ -26,6 +26,11 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) | ||||||
| 	if (current->nr_cpus_allowed == 1) | 	if (current->nr_cpus_allowed == 1) | ||||||
| 		goto out; | 		goto out; | ||||||
| 
 | 
 | ||||||
|  | #ifdef CONFIG_SMP | ||||||
|  | 	if (current->migration_disabled) | ||||||
|  | 		goto out; | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * It is valid to assume CPU-locality during early bootup: | 	 * It is valid to assume CPU-locality during early bootup: | ||||||
| 	 */ | 	 */ | ||||||
|  |  | ||||||
|  | @ -11179,8 +11179,7 @@ static int __init net_dev_init(void) | ||||||
| 		INIT_LIST_HEAD(&sd->poll_list); | 		INIT_LIST_HEAD(&sd->poll_list); | ||||||
| 		sd->output_queue_tailp = &sd->output_queue; | 		sd->output_queue_tailp = &sd->output_queue; | ||||||
| #ifdef CONFIG_RPS | #ifdef CONFIG_RPS | ||||||
| 		sd->csd.func = rps_trigger_softirq; | 		INIT_CSD(&sd->csd, rps_trigger_softirq, sd); | ||||||
| 		sd->csd.info = sd; |  | ||||||
| 		sd->cpu = i; | 		sd->cpu = i; | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  |  | ||||||