Mirror of https://github.com/torvalds/linux.git — synced 2025-11-04 10:40:15 +02:00
	mm: memcg: move legacy memcg event code into memcontrol-v1.c

Cgroup v1's memory controller contains a pretty complicated event
notifications mechanism which is not used on cgroup v2. Let's move the
corresponding code into memcontrol-v1.c.

Please note that mem_cgroup_event_ratelimit() remains in memcontrol.c,
otherwise it would require exporting too many details on memcg stats
outside of memcontrol.c.

Link: https://lkml.kernel.org/r/20240625005906.106920-7-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:

commit 66d60c428b
parent b9eaacb1db

4 changed files with 709 additions and 694 deletions
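For context, the code being moved implements the legacy cgroup v1 notification interface: userspace registers an eventfd against memory.usage_in_bytes, memory.memsw.usage_in_bytes, memory.oom_control or memory.pressure_level by writing "<event_fd> <control_fd> <args>" to cgroup.event_control, which is what the moved memcg_write_event_control() parses. Below is a minimal, hypothetical userspace sketch of that flow; the cgroup path and group name are assumptions (a v1 memory controller mounted at /sys/fs/cgroup/memory with a group named "demo"), and error handling is kept to a minimum.

```c
/* Sketch of the legacy cgroup v1 memcg threshold notification interface.
 * Assumes the v1 memory controller is mounted at /sys/fs/cgroup/memory
 * and a cgroup "demo" already exists.  Not the kernel code itself; just
 * the userspace side that the moved code services.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	const char *cg = "/sys/fs/cgroup/memory/demo";	/* example path */
	char path[256], cmd[128];
	uint64_t ticks;
	int efd, cfd, ctl;

	efd = eventfd(0, 0);				/* fd we will block on        */
	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", cg);
	cfd = open(path, O_RDONLY);			/* control file being watched */
	snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
	ctl = open(path, O_WRONLY);

	/* "<event_fd> <control_fd> <args>": for usage_in_bytes, args is a
	 * threshold in bytes (here 64M). */
	snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, cfd, 64ULL << 20);
	if (write(ctl, cmd, strlen(cmd)) < 0) {
		perror("register threshold");
		return 1;
	}

	/* Blocks until usage crosses the threshold (or the group goes away). */
	read(efd, &ticks, sizeof(ticks));
	printf("memory threshold crossed %llu time(s)\n",
	       (unsigned long long)ticks);
	return 0;
}
```

None of this exists on cgroup v2, which is why the kernel side can live entirely in memcontrol-v1.c.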
				
			
include/linux/memcontrol.h
@@ -69,18 +69,6 @@ struct mem_cgroup_id {
 	refcount_t ref;
 };
 
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremented by the number of pages. This counter is used
- * to trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
-	MEM_CGROUP_TARGET_THRESH,
-	MEM_CGROUP_TARGET_SOFTLIMIT,
-	MEM_CGROUP_NTARGETS,
-};
-
 struct memcg_vmstats_percpu;
 struct memcg_vmstats;
 struct lruvec_stats_percpu;
mm/memcontrol-v1.c
@@ -6,6 +6,10 @@
 #include <linux/pagewalk.h>
 #include <linux/backing-dev.h>
 #include <linux/swap_cgroup.h>
+#include <linux/eventfd.h>
+#include <linux/poll.h>
+#include <linux/sort.h>
+#include <linux/file.h>
 
 #include "internal.h"
 #include "swap.h"
@@ -60,6 +64,54 @@ static struct move_charge_struct {
 	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 };
 
+/* for OOM */
+struct mem_cgroup_eventfd_list {
+	struct list_head list;
+	struct eventfd_ctx *eventfd;
+};
+
+/*
+ * cgroup_event represents events which userspace want to receive.
+ */
+struct mem_cgroup_event {
+	/*
+	 * memcg which the event belongs to.
+	 */
+	struct mem_cgroup *memcg;
+	/*
+	 * eventfd to signal userspace about the event.
+	 */
+	struct eventfd_ctx *eventfd;
+	/*
+	 * Each of these stored in a list by the cgroup.
+	 */
+	struct list_head list;
+	/*
+	 * register_event() callback will be used to add new userspace
+	 * waiter for changes related to this event.  Use eventfd_signal()
+	 * on eventfd to send notification to userspace.
+	 */
+	int (*register_event)(struct mem_cgroup *memcg,
+			      struct eventfd_ctx *eventfd, const char *args);
+	/*
+	 * unregister_event() callback will be called when userspace closes
+	 * the eventfd or on cgroup removing.  This callback must be set,
+	 * if you want provide notification functionality.
+	 */
+	void (*unregister_event)(struct mem_cgroup *memcg,
+				 struct eventfd_ctx *eventfd);
+	/*
+	 * All fields below needed to unregister event when
+	 * userspace closes eventfd.
+	 */
+	poll_table pt;
+	wait_queue_head_t *wqh;
+	wait_queue_entry_t wait;
+	struct work_struct remove;
+};
+
+extern spinlock_t memcg_oom_lock;
+
 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 					 struct mem_cgroup_tree_per_node *mctz,
 					 unsigned long new_usage_in_excess)
@@ -1306,6 +1358,607 @@ void memcg1_move_task(void)
 }
 #endif
 
+static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
+{
+	struct mem_cgroup_threshold_ary *t;
+	unsigned long usage;
+	int i;
+
+	rcu_read_lock();
+	if (!swap)
+		t = rcu_dereference(memcg->thresholds.primary);
+	else
+		t = rcu_dereference(memcg->memsw_thresholds.primary);
+
+	if (!t)
+		goto unlock;
+
+	usage = mem_cgroup_usage(memcg, swap);
+
+	/*
+	 * current_threshold points to threshold just below or equal to usage.
+	 * If it's not true, a threshold was crossed after last
+	 * call of __mem_cgroup_threshold().
+	 */
+	i = t->current_threshold;
+
+	/*
+	 * Iterate backward over array of thresholds starting from
+	 * current_threshold and check if a threshold is crossed.
+	 * If none of thresholds below usage is crossed, we read
+	 * only one element of the array here.
+	 */
+	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
+		eventfd_signal(t->entries[i].eventfd);
+
+	/* i = current_threshold + 1 */
+	i++;
+
+	/*
+	 * Iterate forward over array of thresholds starting from
+	 * current_threshold+1 and check if a threshold is crossed.
+	 * If none of thresholds above usage is crossed, we read
+	 * only one element of the array here.
+	 */
+	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
+		eventfd_signal(t->entries[i].eventfd);
+
+	/* Update current_threshold */
+	t->current_threshold = i - 1;
+unlock:
+	rcu_read_unlock();
+}
+
+static void mem_cgroup_threshold(struct mem_cgroup *memcg)
+{
+	while (memcg) {
+		__mem_cgroup_threshold(memcg, false);
+		if (do_memsw_account())
+			__mem_cgroup_threshold(memcg, true);
+
+		memcg = parent_mem_cgroup(memcg);
+	}
+}
+
+/*
+ * Check events in order.
+ *
+ */
+void memcg_check_events(struct mem_cgroup *memcg, int nid)
+{
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		return;
+
+	/* threshold event is triggered in finer grain than soft limit */
+	if (unlikely(mem_cgroup_event_ratelimit(memcg,
+						MEM_CGROUP_TARGET_THRESH))) {
+		bool do_softlimit;
+
+		do_softlimit = mem_cgroup_event_ratelimit(memcg,
+						MEM_CGROUP_TARGET_SOFTLIMIT);
+		mem_cgroup_threshold(memcg);
+		if (unlikely(do_softlimit))
+			memcg1_update_tree(memcg, nid);
+	}
+}
+
+static int compare_thresholds(const void *a, const void *b)
+{
+	const struct mem_cgroup_threshold *_a = a;
+	const struct mem_cgroup_threshold *_b = b;
+
+	if (_a->threshold > _b->threshold)
+		return 1;
+
+	if (_a->threshold < _b->threshold)
+		return -1;
+
+	return 0;
+}
+
+static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup_eventfd_list *ev;
+
+	spin_lock(&memcg_oom_lock);
+
+	list_for_each_entry(ev, &memcg->oom_notify, list)
+		eventfd_signal(ev->eventfd);
+
+	spin_unlock(&memcg_oom_lock);
+	return 0;
+}
+
+void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter;
+
+	for_each_mem_cgroup_tree(iter, memcg)
+		mem_cgroup_oom_notify_cb(iter);
+}
+
+static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
+{
+	struct mem_cgroup_thresholds *thresholds;
+	struct mem_cgroup_threshold_ary *new;
+	unsigned long threshold;
+	unsigned long usage;
+	int i, size, ret;
+
+	ret = page_counter_memparse(args, "-1", &threshold);
+	if (ret)
+		return ret;
+
+	mutex_lock(&memcg->thresholds_lock);
+
+	if (type == _MEM) {
+		thresholds = &memcg->thresholds;
+		usage = mem_cgroup_usage(memcg, false);
+	} else if (type == _MEMSWAP) {
+		thresholds = &memcg->memsw_thresholds;
+		usage = mem_cgroup_usage(memcg, true);
+	} else
+		BUG();
+
+	/* Check if a threshold crossed before adding a new one */
+	if (thresholds->primary)
+		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
+
+	/* Allocate memory for new array of thresholds */
+	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
+	if (!new) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+	new->size = size;
+
+	/* Copy thresholds (if any) to new array */
+	if (thresholds->primary)
+		memcpy(new->entries, thresholds->primary->entries,
+		       flex_array_size(new, entries, size - 1));
+
+	/* Add new threshold */
+	new->entries[size - 1].eventfd = eventfd;
+	new->entries[size - 1].threshold = threshold;
+
+	/* Sort thresholds. Registering of new threshold isn't time-critical */
+	sort(new->entries, size, sizeof(*new->entries),
+			compare_thresholds, NULL);
+
+	/* Find current threshold */
+	new->current_threshold = -1;
+	for (i = 0; i < size; i++) {
+		if (new->entries[i].threshold <= usage) {
+			/*
+			 * new->current_threshold will not be used until
+			 * rcu_assign_pointer(), so it's safe to increment
+			 * it here.
+			 */
+			++new->current_threshold;
+		} else
+			break;
+	}
+
+	/* Free old spare buffer and save old primary buffer as spare */
+	kfree(thresholds->spare);
+	thresholds->spare = thresholds->primary;
+
+	rcu_assign_pointer(thresholds->primary, new);
+
+	/* To be sure that nobody uses thresholds */
+	synchronize_rcu();
+
+unlock:
+	mutex_unlock(&memcg->thresholds_lock);
+
+	return ret;
+}
+
+static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+	struct eventfd_ctx *eventfd, const char *args)
+{
+	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
+}
+
+static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
+	struct eventfd_ctx *eventfd, const char *args)
+{
+	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
+}
+
+static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+	struct eventfd_ctx *eventfd, enum res_type type)
+{
+	struct mem_cgroup_thresholds *thresholds;
+	struct mem_cgroup_threshold_ary *new;
+	unsigned long usage;
+	int i, j, size, entries;
+
+	mutex_lock(&memcg->thresholds_lock);
+
+	if (type == _MEM) {
+		thresholds = &memcg->thresholds;
+		usage = mem_cgroup_usage(memcg, false);
+	} else if (type == _MEMSWAP) {
+		thresholds = &memcg->memsw_thresholds;
+		usage = mem_cgroup_usage(memcg, true);
+	} else
+		BUG();
+
+	if (!thresholds->primary)
+		goto unlock;
+
+	/* Check if a threshold crossed before removing */
+	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+	/* Calculate new number of threshold */
+	size = entries = 0;
+	for (i = 0; i < thresholds->primary->size; i++) {
+		if (thresholds->primary->entries[i].eventfd != eventfd)
+			size++;
+		else
+			entries++;
+	}
+
+	new = thresholds->spare;
+
+	/* If no items related to eventfd have been cleared, nothing to do */
+	if (!entries)
+		goto unlock;
+
+	/* Set thresholds array to NULL if we don't have thresholds */
+	if (!size) {
+		kfree(new);
+		new = NULL;
+		goto swap_buffers;
+	}
+
+	new->size = size;
+
+	/* Copy thresholds and find current threshold */
+	new->current_threshold = -1;
+	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
+		if (thresholds->primary->entries[i].eventfd == eventfd)
+			continue;
+
+		new->entries[j] = thresholds->primary->entries[i];
+		if (new->entries[j].threshold <= usage) {
+			/*
+			 * new->current_threshold will not be used
+			 * until rcu_assign_pointer(), so it's safe to increment
+			 * it here.
+			 */
+			++new->current_threshold;
+		}
+		j++;
+	}
+
+swap_buffers:
+	/* Swap primary and spare array */
+	thresholds->spare = thresholds->primary;
+
+	rcu_assign_pointer(thresholds->primary, new);
+
+	/* To be sure that nobody uses thresholds */
+	synchronize_rcu();
+
+	/* If all events are unregistered, free the spare array */
+	if (!new) {
+		kfree(thresholds->spare);
+		thresholds->spare = NULL;
+	}
+unlock:
+	mutex_unlock(&memcg->thresholds_lock);
+}
+
+static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+	struct eventfd_ctx *eventfd)
+{
+	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
+}
+
+static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+	struct eventfd_ctx *eventfd)
+{
+	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
+}
+
+static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
+	struct eventfd_ctx *eventfd, const char *args)
+{
+	struct mem_cgroup_eventfd_list *event;
+
+	event = kmalloc(sizeof(*event),	GFP_KERNEL);
+	if (!event)
+		return -ENOMEM;
+
+	spin_lock(&memcg_oom_lock);
+
+	event->eventfd = eventfd;
+	list_add(&event->list, &memcg->oom_notify);
+
+	/* already in OOM ? */
+	if (memcg->under_oom)
+		eventfd_signal(eventfd);
+	spin_unlock(&memcg_oom_lock);
+
+	return 0;
+}
+
+static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
+	struct eventfd_ctx *eventfd)
+{
+	struct mem_cgroup_eventfd_list *ev, *tmp;
+
+	spin_lock(&memcg_oom_lock);
+
+	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
+		if (ev->eventfd == eventfd) {
+			list_del(&ev->list);
+			kfree(ev);
+		}
+	}
+
+	spin_unlock(&memcg_oom_lock);
+}
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * "cgroup.event_control" implementation.
+ *
+ * This is way over-engineered.  It tries to support fully configurable
+ * events for each user.  Such level of flexibility is completely
+ * unnecessary especially in the light of the planned unified hierarchy.
+ *
+ * Please deprecate this and replace with something simpler if at all
+ * possible.
+ */
+
+/*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void memcg_event_remove(struct work_struct *work)
+{
+	struct mem_cgroup_event *event =
+		container_of(work, struct mem_cgroup_event, remove);
+	struct mem_cgroup *memcg = event->memcg;
+
+	remove_wait_queue(event->wqh, &event->wait);
+
+	event->unregister_event(memcg, event->eventfd);
+
+	/* Notify userspace the event is going away. */
+	eventfd_signal(event->eventfd);
+
+	eventfd_ctx_put(event->eventfd);
+	kfree(event);
+	css_put(&memcg->css);
+}
+
+/*
+ * Gets called on EPOLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
+			    int sync, void *key)
+{
+	struct mem_cgroup_event *event =
+		container_of(wait, struct mem_cgroup_event, wait);
+	struct mem_cgroup *memcg = event->memcg;
+	__poll_t flags = key_to_poll(key);
+
+	if (flags & EPOLLHUP) {
+		/*
+		 * If the event has been detached at cgroup removal, we
+		 * can simply return knowing the other side will cleanup
+		 * for us.
+		 *
+		 * We can't race against event freeing since the other
+		 * side will require wqh->lock via remove_wait_queue(),
+		 * which we hold.
+		 */
+		spin_lock(&memcg->event_list_lock);
+		if (!list_empty(&event->list)) {
+			list_del_init(&event->list);
+			/*
+			 * We are in atomic context, but cgroup_event_remove()
+			 * may sleep, so we have to call it in workqueue.
+			 */
+			schedule_work(&event->remove);
+		}
+		spin_unlock(&memcg->event_list_lock);
+	}
+
+	return 0;
+}
+
+static void memcg_event_ptable_queue_proc(struct file *file,
+		wait_queue_head_t *wqh, poll_table *pt)
+{
+	struct mem_cgroup_event *event =
+		container_of(pt, struct mem_cgroup_event, pt);
+
+	event->wqh = wqh;
+	add_wait_queue(wqh, &event->wait);
+}
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+ssize_t memcg_write_event_control(struct kernfs_open_file *of,
+				  char *buf, size_t nbytes, loff_t off)
+{
+	struct cgroup_subsys_state *css = of_css(of);
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+	struct mem_cgroup_event *event;
+	struct cgroup_subsys_state *cfile_css;
+	unsigned int efd, cfd;
+	struct fd efile;
+	struct fd cfile;
+	struct dentry *cdentry;
+	const char *name;
+	char *endp;
+	int ret;
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		return -EOPNOTSUPP;
+
+	buf = strstrip(buf);
+
+	efd = simple_strtoul(buf, &endp, 10);
+	if (*endp != ' ')
+		return -EINVAL;
+	buf = endp + 1;
+
+	cfd = simple_strtoul(buf, &endp, 10);
+	if ((*endp != ' ') && (*endp != '\0'))
+		return -EINVAL;
+	buf = endp + 1;
+
+	event = kzalloc(sizeof(*event), GFP_KERNEL);
+	if (!event)
+		return -ENOMEM;
+
+	event->memcg = memcg;
+	INIT_LIST_HEAD(&event->list);
+	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
+	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
+	INIT_WORK(&event->remove, memcg_event_remove);
+
+	efile = fdget(efd);
+	if (!efile.file) {
+		ret = -EBADF;
+		goto out_kfree;
+	}
+
+	event->eventfd = eventfd_ctx_fileget(efile.file);
+	if (IS_ERR(event->eventfd)) {
+		ret = PTR_ERR(event->eventfd);
+		goto out_put_efile;
+	}
+
+	cfile = fdget(cfd);
+	if (!cfile.file) {
+		ret = -EBADF;
+		goto out_put_eventfd;
+	}
+
+	/* the process need read permission on control file */
+	/* AV: shouldn't we check that it's been opened for read instead? */
+	ret = file_permission(cfile.file, MAY_READ);
+	if (ret < 0)
+		goto out_put_cfile;
+
+	/*
+	 * The control file must be a regular cgroup1 file. As a regular cgroup
+	 * file can't be renamed, it's safe to access its name afterwards.
+	 */
+	cdentry = cfile.file->f_path.dentry;
+	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
+		ret = -EINVAL;
+		goto out_put_cfile;
+	}
+
+	/*
+	 * Determine the event callbacks and set them in @event.  This used
+	 * to be done via struct cftype but cgroup core no longer knows
+	 * about these events.  The following is crude but the whole thing
+	 * is for compatibility anyway.
+	 *
+	 * DO NOT ADD NEW FILES.
+	 */
+	name = cdentry->d_name.name;
+
+	if (!strcmp(name, "memory.usage_in_bytes")) {
+		event->register_event = mem_cgroup_usage_register_event;
+		event->unregister_event = mem_cgroup_usage_unregister_event;
+	} else if (!strcmp(name, "memory.oom_control")) {
+		event->register_event = mem_cgroup_oom_register_event;
+		event->unregister_event = mem_cgroup_oom_unregister_event;
+	} else if (!strcmp(name, "memory.pressure_level")) {
+		event->register_event = vmpressure_register_event;
+		event->unregister_event = vmpressure_unregister_event;
+	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
+		event->register_event = memsw_cgroup_usage_register_event;
+		event->unregister_event = memsw_cgroup_usage_unregister_event;
+	} else {
+		ret = -EINVAL;
+		goto out_put_cfile;
+	}
+
+	/*
+	 * Verify @cfile should belong to @css.  Also, remaining events are
+	 * automatically removed on cgroup destruction but the removal is
+	 * asynchronous, so take an extra ref on @css.
+	 */
+	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
+					       &memory_cgrp_subsys);
+	ret = -EINVAL;
+	if (IS_ERR(cfile_css))
+		goto out_put_cfile;
+	if (cfile_css != css) {
+		css_put(cfile_css);
+		goto out_put_cfile;
+	}
+
+	ret = event->register_event(memcg, event->eventfd, buf);
+	if (ret)
+		goto out_put_css;
+
+	vfs_poll(efile.file, &event->pt);
+
+	spin_lock_irq(&memcg->event_list_lock);
+	list_add(&event->list, &memcg->event_list);
+	spin_unlock_irq(&memcg->event_list_lock);
+
+	fdput(cfile);
+	fdput(efile);
+
+	return nbytes;
+
+out_put_css:
+	css_put(css);
+out_put_cfile:
+	fdput(cfile);
+out_put_eventfd:
+	eventfd_ctx_put(event->eventfd);
+out_put_efile:
+	fdput(efile);
+out_kfree:
+	kfree(event);
+
+	return ret;
+}
+
+void memcg1_css_offline(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup_event *event, *tmp;
+
+	/*
+	 * Unregister events and notify userspace.
+	 * Notify userspace about cgroup removing only after rmdir of cgroup
+	 * directory to avoid race between userspace and kernelspace.
+	 */
+	spin_lock_irq(&memcg->event_list_lock);
+	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
+		list_del_init(&event->list);
+		schedule_work(&event->remove);
+	}
+	spin_unlock_irq(&memcg->event_list_lock);
+}
+
 static int __init memcg1_init(void)
 {
 	int node;
mm/memcontrol-v1.h
@@ -41,4 +41,55 @@ u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
 int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
 				 struct cftype *cft, u64 val);
 
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremented by the number of pages. This counter is used
+ * to trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ */
+enum mem_cgroup_events_target {
+	MEM_CGROUP_TARGET_THRESH,
+	MEM_CGROUP_TARGET_SOFTLIMIT,
+	MEM_CGROUP_NTARGETS,
+};
+
+/* Whether legacy memory+swap accounting is active */
+static bool do_memsw_account(void)
+{
+	return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
+}
+
+/*
+ * Iteration constructs for visiting all cgroups (under a tree).  If
+ * loops are exited prematurely (break), mem_cgroup_iter_break() must
+ * be used for reference counting.
+ */
+#define for_each_mem_cgroup_tree(iter, root)		\
+	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
+	     iter != NULL;				\
+	     iter = mem_cgroup_iter(root, iter, NULL))
+
+#define for_each_mem_cgroup(iter)			\
+	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
+	     iter != NULL;				\
+	     iter = mem_cgroup_iter(NULL, iter, NULL))
+
+void memcg1_css_offline(struct mem_cgroup *memcg);
+
+/* for encoding cft->private value on file */
+enum res_type {
+	_MEM,
+	_MEMSWAP,
+	_KMEM,
+	_TCP,
+};
+
+bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
+				enum mem_cgroup_events_target target);
+unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
+void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
+ssize_t memcg_write_event_control(struct kernfs_open_file *of,
+				  char *buf, size_t nbytes, loff_t off);
+
+
 #endif	/* __MM_MEMCONTROL_V1_H */
mm/memcontrol.c (685 changed lines)
@@ -46,9 +46,6 @@
 #include <linux/slab.h>
 #include <linux/swapops.h>
 #include <linux/spinlock.h>
-#include <linux/eventfd.h>
-#include <linux/poll.h>
-#include <linux/sort.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/vmpressure.h>
@@ -58,7 +55,6 @@
 #include <linux/cpu.h>
 #include <linux/oom.h>
 #include <linux/lockdep.h>
-#include <linux/file.h>
 #include <linux/resume_user_mode.h>
 #include <linux/psi.h>
 #include <linux/seq_buf.h>
@@ -96,91 +92,13 @@ static bool cgroup_memory_nobpf __ro_after_init;
 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
 #endif
 
-/* Whether legacy memory+swap accounting is active */
-static bool do_memsw_account(void)
-{
-	return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
-}
-
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 
-/* for OOM */
-struct mem_cgroup_eventfd_list {
-	struct list_head list;
-	struct eventfd_ctx *eventfd;
-};
-
-/*
- * cgroup_event represents events which userspace want to receive.
- */
-struct mem_cgroup_event {
-	/*
-	 * memcg which the event belongs to.
-	 */
-	struct mem_cgroup *memcg;
-	/*
-	 * eventfd to signal userspace about the event.
-	 */
-	struct eventfd_ctx *eventfd;
-	/*
-	 * Each of these stored in a list by the cgroup.
-	 */
-	struct list_head list;
-	/*
-	 * register_event() callback will be used to add new userspace
-	 * waiter for changes related to this event.  Use eventfd_signal()
-	 * on eventfd to send notification to userspace.
-	 */
-	int (*register_event)(struct mem_cgroup *memcg,
-			      struct eventfd_ctx *eventfd, const char *args);
-	/*
-	 * unregister_event() callback will be called when userspace closes
-	 * the eventfd or on cgroup removing.  This callback must be set,
-	 * if you want provide notification functionality.
-	 */
-	void (*unregister_event)(struct mem_cgroup *memcg,
-				 struct eventfd_ctx *eventfd);
-	/*
-	 * All fields below needed to unregister event when
-	 * userspace closes eventfd.
-	 */
-	poll_table pt;
-	wait_queue_head_t *wqh;
-	wait_queue_entry_t wait;
-	struct work_struct remove;
-};
-
-static void mem_cgroup_threshold(struct mem_cgroup *memcg);
-static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
-
-/* for encoding cft->private value on file */
-enum res_type {
-	_MEM,
-	_MEMSWAP,
-	_KMEM,
-	_TCP,
-};
-
 #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
 #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
 #define MEMFILE_ATTR(val)	((val) & 0xffff)
 
-/*
- * Iteration constructs for visiting all cgroups (under a tree).  If
- * loops are exited prematurely (break), mem_cgroup_iter_break() must
- * be used for reference counting.
- */
-#define for_each_mem_cgroup_tree(iter, root)		\
-	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
-	     iter != NULL;				\
-	     iter = mem_cgroup_iter(root, iter, NULL))
-
-#define for_each_mem_cgroup(iter)			\
-	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
-	     iter != NULL;				\
-	     iter = mem_cgroup_iter(NULL, iter, NULL))
-
 static inline bool task_is_dying(void)
 {
 	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
@@ -939,7 +857,7 @@ void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
 	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
 }
 
-static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
+bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 				enum mem_cgroup_events_target target)
 {
 	unsigned long val, next;
@@ -964,28 +882,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 	return false;
 }
 
-/*
- * Check events in order.
- *
- */
-void memcg_check_events(struct mem_cgroup *memcg, int nid)
-{
-	if (IS_ENABLED(CONFIG_PREEMPT_RT))
-		return;
-
-	/* threshold event is triggered in finer grain than soft limit */
-	if (unlikely(mem_cgroup_event_ratelimit(memcg,
-						MEM_CGROUP_TARGET_THRESH))) {
-		bool do_softlimit;
-
-		do_softlimit = mem_cgroup_event_ratelimit(memcg,
-						MEM_CGROUP_TARGET_SOFTLIMIT);
-		mem_cgroup_threshold(memcg);
-		if (unlikely(do_softlimit))
-			memcg1_update_tree(memcg, nid);
-	}
-}
-
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 {
 	/*
@@ -1725,7 +1621,7 @@ static struct lockdep_map memcg_oom_lock_dep_map = {
 };
 #endif
 
-static DEFINE_SPINLOCK(memcg_oom_lock);
+DEFINE_SPINLOCK(memcg_oom_lock);
 
 /*
  * Check OOM-Killer is already running under our hierarchy.
@@ -3543,7 +3439,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
 	return -EINVAL;
 }
 
-static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
 	unsigned long val;
 
| 
						 | 
					@ -4044,331 +3940,6 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
 | 
				
			||||||
	return 0;
 | 
						return 0;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
	struct mem_cgroup_threshold_ary *t;
 | 
					 | 
				
			||||||
	unsigned long usage;
 | 
					 | 
				
			||||||
	int i;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	rcu_read_lock();
 | 
					 | 
				
			||||||
	if (!swap)
 | 
					 | 
				
			||||||
		t = rcu_dereference(memcg->thresholds.primary);
 | 
					 | 
				
			||||||
	else
 | 
					 | 
				
			||||||
		t = rcu_dereference(memcg->memsw_thresholds.primary);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	if (!t)
 | 
					 | 
				
			||||||
		goto unlock;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	usage = mem_cgroup_usage(memcg, swap);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	/*
 | 
					 | 
				
			||||||
	 * current_threshold points to threshold just below or equal to usage.
 | 
					 | 
				
			||||||
	 * If it's not true, a threshold was crossed after last
 | 
					 | 
				
			||||||
	 * call of __mem_cgroup_threshold().
 | 
					 | 
				
			||||||
	 */
 | 
					 | 
				
			||||||
	i = t->current_threshold;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	/*
 | 
					 | 
				
			||||||
	 * Iterate backward over array of thresholds starting from
 | 
					 | 
				
			||||||
	 * current_threshold and check if a threshold is crossed.
 | 
					 | 
				
			||||||
	 * If none of thresholds below usage is crossed, we read
 | 
					 | 
				
			||||||
	 * only one element of the array here.
 | 
					 | 
				
			||||||
	 */
 | 
					 | 
				
			||||||
	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
 | 
					 | 
				
			||||||
		eventfd_signal(t->entries[i].eventfd);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	/* i = current_threshold + 1 */
 | 
					 | 
				
			||||||
	i++;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	/*
 | 
					 | 
				
			||||||
	 * Iterate forward over array of thresholds starting from
 | 
					 | 
				
			||||||
	 * current_threshold+1 and check if a threshold is crossed.
 | 
					 | 
				
			||||||
	 * If none of thresholds above usage is crossed, we read
 | 
					 | 
				
			||||||
	 * only one element of the array here.
 | 
					 | 
				
			||||||
	 */
 | 
					 | 
				
			||||||
	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
 | 
					 | 
				
			||||||
		eventfd_signal(t->entries[i].eventfd);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	/* Update current_threshold */
 | 
					 | 
				
			||||||
	t->current_threshold = i - 1;
 | 
					 | 
				
			||||||
unlock:
 | 
					 | 
				
			||||||
	rcu_read_unlock();
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||

static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
	while (memcg) {
		__mem_cgroup_threshold(memcg, false);
		if (do_memsw_account())
			__mem_cgroup_threshold(memcg, true);

		memcg = parent_mem_cgroup(memcg);
	}
}

static int compare_thresholds(const void *a, const void *b)
{
	const struct mem_cgroup_threshold *_a = a;
	const struct mem_cgroup_threshold *_b = b;

	if (_a->threshold > _b->threshold)
		return 1;

	if (_a->threshold < _b->threshold)
		return -1;

	return 0;
}

static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
{
	struct mem_cgroup_eventfd_list *ev;

	spin_lock(&memcg_oom_lock);

	list_for_each_entry(ev, &memcg->oom_notify, list)
		eventfd_signal(ev->eventfd);

	spin_unlock(&memcg_oom_lock);
	return 0;
}

static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		mem_cgroup_oom_notify_cb(iter);
}

static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long threshold;
	unsigned long usage;
	int i, size, ret;

	ret = page_counter_memparse(args, "-1", &threshold);
	if (ret)
		return ret;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	/* Check if a threshold crossed before adding a new one */
	if (thresholds->primary)
		__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	size = thresholds->primary ? thresholds->primary->size + 1 : 1;

	/* Allocate memory for new array of thresholds */
	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
	if (!new) {
		ret = -ENOMEM;
		goto unlock;
	}
	new->size = size;

	/* Copy thresholds (if any) to new array */
	if (thresholds->primary)
		memcpy(new->entries, thresholds->primary->entries,
		       flex_array_size(new, entries, size - 1));

	/* Add new threshold */
	new->entries[size - 1].eventfd = eventfd;
	new->entries[size - 1].threshold = threshold;

	/* Sort thresholds. Registering of new threshold isn't time-critical */
	sort(new->entries, size, sizeof(*new->entries),
			compare_thresholds, NULL);

	/* Find current threshold */
	new->current_threshold = -1;
	for (i = 0; i < size; i++) {
		if (new->entries[i].threshold <= usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		} else
			break;
	}

	/* Free old spare buffer and save old primary buffer as spare */
	kfree(thresholds->spare);
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

unlock:
	mutex_unlock(&memcg->thresholds_lock);

	return ret;
}

static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
}

static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
}

static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long usage;
	int i, j, size, entries;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	if (!thresholds->primary)
		goto unlock;

	/* Check if a threshold crossed before removing */
	__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	/* Calculate new number of threshold */
	size = entries = 0;
	for (i = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd != eventfd)
			size++;
		else
			entries++;
	}

	new = thresholds->spare;

	/* If no items related to eventfd have been cleared, nothing to do */
	if (!entries)
		goto unlock;

	/* Set thresholds array to NULL if we don't have thresholds */
	if (!size) {
		kfree(new);
		new = NULL;
		goto swap_buffers;
	}

	new->size = size;

	/* Copy thresholds and find current threshold */
	new->current_threshold = -1;
	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd == eventfd)
			continue;

		new->entries[j] = thresholds->primary->entries[i];
		if (new->entries[j].threshold <= usage) {
			/*
			 * new->current_threshold will not be used
			 * until rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		}
		j++;
	}

swap_buffers:
	/* Swap primary and spare array */
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

	/* If all events are unregistered, free the spare array */
	if (!new) {
		kfree(thresholds->spare);
		thresholds->spare = NULL;
	}
unlock:
	mutex_unlock(&memcg->thresholds_lock);
}

static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd)
{
	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
}

static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd)
{
	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
}

static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	struct mem_cgroup_eventfd_list *event;

	event = kmalloc(sizeof(*event), GFP_KERNEL);
	if (!event)
		return -ENOMEM;

	spin_lock(&memcg_oom_lock);

	event->eventfd = eventfd;
	list_add(&event->list, &memcg->oom_notify);

	/* already in OOM ? */
	if (memcg->under_oom)
		eventfd_signal(eventfd);
	spin_unlock(&memcg_oom_lock);

	return 0;
}

static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd)
{
	struct mem_cgroup_eventfd_list *ev, *tmp;

	spin_lock(&memcg_oom_lock);

	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
		if (ev->eventfd == eventfd) {
			list_del(&ev->list);
			kfree(ev);
		}
	}

	spin_unlock(&memcg_oom_lock);
}
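The OOM registration/unregistration pair above is driven from user space through the cgroup v1 "cgroup.event_control" interface handled by memcg_write_event_control() further down. A minimal sketch of the user-space side, assuming a v1 memory controller mounted at /sys/fs/cgroup/memory and an existing group named "demo" (both paths are assumptions, adjust to the local setup):

/* Hedged example: wait for OOM notifications from a v1 memcg. */
#include <sys/eventfd.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
	const char *grp = "/sys/fs/cgroup/memory/demo";	/* assumed path */
	char path[256], line[64];
	int efd, cfd, ctrl;
	uint64_t cnt;

	efd = eventfd(0, 0);					/* <event_fd> */
	snprintf(path, sizeof(path), "%s/memory.oom_control", grp);
	cfd = open(path, O_RDONLY);				/* <control_fd> */
	snprintf(path, sizeof(path), "%s/cgroup.event_control", grp);
	ctrl = open(path, O_WRONLY);
	if (efd < 0 || cfd < 0 || ctrl < 0)
		return 1;

	/* "<event_fd> <control_fd>": oom_control takes no extra args */
	snprintf(line, sizeof(line), "%d %d", efd, cfd);
	if (write(ctrl, line, strlen(line)) < 0)
		return 1;

	read(efd, &cnt, sizeof(cnt));		/* blocks until the group hits OOM */
	printf("memcg reported OOM %llu time(s)\n", (unsigned long long)cnt);
	return 0;
}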

static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
@@ -4609,243 +4180,6 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)

#endif	/* CONFIG_CGROUP_WRITEBACK */

/*
 * DO NOT USE IN NEW FILES.
 *
 * "cgroup.event_control" implementation.
 *
 * This is way over-engineered.  It tries to support fully configurable
 * events for each user.  Such level of flexibility is completely
 * unnecessary especially in the light of the planned unified hierarchy.
 *
 * Please deprecate this and replace with something simpler if at all
 * possible.
 */

/*
 * Unregister event and free resources.
 *
 * Gets called from workqueue.
 */
static void memcg_event_remove(struct work_struct *work)
{
	struct mem_cgroup_event *event =
		container_of(work, struct mem_cgroup_event, remove);
	struct mem_cgroup *memcg = event->memcg;

	remove_wait_queue(event->wqh, &event->wait);

	event->unregister_event(memcg, event->eventfd);

	/* Notify userspace the event is going away. */
	eventfd_signal(event->eventfd);

	eventfd_ctx_put(event->eventfd);
	kfree(event);
	css_put(&memcg->css);
}

/*
 * Gets called on EPOLLHUP on eventfd when user closes it.
 *
 * Called with wqh->lock held and interrupts disabled.
 */
static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
			    int sync, void *key)
{
	struct mem_cgroup_event *event =
		container_of(wait, struct mem_cgroup_event, wait);
	struct mem_cgroup *memcg = event->memcg;
	__poll_t flags = key_to_poll(key);

	if (flags & EPOLLHUP) {
		/*
		 * If the event has been detached at cgroup removal, we
		 * can simply return knowing the other side will cleanup
		 * for us.
		 *
		 * We can't race against event freeing since the other
		 * side will require wqh->lock via remove_wait_queue(),
		 * which we hold.
		 */
		spin_lock(&memcg->event_list_lock);
		if (!list_empty(&event->list)) {
			list_del_init(&event->list);
			/*
			 * We are in atomic context, but cgroup_event_remove()
			 * may sleep, so we have to call it in workqueue.
			 */
			schedule_work(&event->remove);
		}
		spin_unlock(&memcg->event_list_lock);
	}

	return 0;
}

static void memcg_event_ptable_queue_proc(struct file *file,
		wait_queue_head_t *wqh, poll_table *pt)
{
	struct mem_cgroup_event *event =
		container_of(pt, struct mem_cgroup_event, pt);

	event->wqh = wqh;
	add_wait_queue(wqh, &event->wait);
}

/*
 * DO NOT USE IN NEW FILES.
 *
 * Parse input and register new cgroup event handler.
 *
 * Input must be in format '<event_fd> <control_fd> <args>'.
 * Interpretation of args is defined by control file implementation.
 */
static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
					 char *buf, size_t nbytes, loff_t off)
{
	struct cgroup_subsys_state *css = of_css(of);
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup_event *event;
	struct cgroup_subsys_state *cfile_css;
	unsigned int efd, cfd;
	struct fd efile;
	struct fd cfile;
	struct dentry *cdentry;
	const char *name;
	char *endp;
	int ret;

	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		return -EOPNOTSUPP;

	buf = strstrip(buf);

	efd = simple_strtoul(buf, &endp, 10);
	if (*endp != ' ')
		return -EINVAL;
	buf = endp + 1;

	cfd = simple_strtoul(buf, &endp, 10);
	if ((*endp != ' ') && (*endp != '\0'))
		return -EINVAL;
	buf = endp + 1;

	event = kzalloc(sizeof(*event), GFP_KERNEL);
	if (!event)
		return -ENOMEM;

	event->memcg = memcg;
	INIT_LIST_HEAD(&event->list);
	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
	INIT_WORK(&event->remove, memcg_event_remove);

	efile = fdget(efd);
	if (!efile.file) {
		ret = -EBADF;
		goto out_kfree;
	}

	event->eventfd = eventfd_ctx_fileget(efile.file);
	if (IS_ERR(event->eventfd)) {
		ret = PTR_ERR(event->eventfd);
		goto out_put_efile;
	}

	cfile = fdget(cfd);
	if (!cfile.file) {
		ret = -EBADF;
		goto out_put_eventfd;
	}

	/* the process need read permission on control file */
	/* AV: shouldn't we check that it's been opened for read instead? */
	ret = file_permission(cfile.file, MAY_READ);
	if (ret < 0)
		goto out_put_cfile;

	/*
	 * The control file must be a regular cgroup1 file. As a regular cgroup
	 * file can't be renamed, it's safe to access its name afterwards.
	 */
	cdentry = cfile.file->f_path.dentry;
	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
		ret = -EINVAL;
		goto out_put_cfile;
	}

	/*
	 * Determine the event callbacks and set them in @event.  This used
	 * to be done via struct cftype but cgroup core no longer knows
	 * about these events.  The following is crude but the whole thing
	 * is for compatibility anyway.
	 *
	 * DO NOT ADD NEW FILES.
	 */
	name = cdentry->d_name.name;

	if (!strcmp(name, "memory.usage_in_bytes")) {
		event->register_event = mem_cgroup_usage_register_event;
		event->unregister_event = mem_cgroup_usage_unregister_event;
	} else if (!strcmp(name, "memory.oom_control")) {
		event->register_event = mem_cgroup_oom_register_event;
		event->unregister_event = mem_cgroup_oom_unregister_event;
	} else if (!strcmp(name, "memory.pressure_level")) {
		event->register_event = vmpressure_register_event;
		event->unregister_event = vmpressure_unregister_event;
	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
		event->register_event = memsw_cgroup_usage_register_event;
		event->unregister_event = memsw_cgroup_usage_unregister_event;
	} else {
		ret = -EINVAL;
		goto out_put_cfile;
	}

	/*
	 * Verify @cfile should belong to @css.  Also, remaining events are
	 * automatically removed on cgroup destruction but the removal is
	 * asynchronous, so take an extra ref on @css.
	 */
	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
					       &memory_cgrp_subsys);
	ret = -EINVAL;
	if (IS_ERR(cfile_css))
		goto out_put_cfile;
	if (cfile_css != css) {
		css_put(cfile_css);
		goto out_put_cfile;
	}

	ret = event->register_event(memcg, event->eventfd, buf);
	if (ret)
		goto out_put_css;

	vfs_poll(efile.file, &event->pt);

	spin_lock_irq(&memcg->event_list_lock);
	list_add(&event->list, &memcg->event_list);
	spin_unlock_irq(&memcg->event_list_lock);

	fdput(cfile);
	fdput(efile);

	return nbytes;

out_put_css:
	css_put(css);
out_put_cfile:
	fdput(cfile);
out_put_eventfd:
	eventfd_ctx_put(event->eventfd);
out_put_efile:
	fdput(efile);
out_kfree:
	kfree(event);

	return ret;
}
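For completeness, the same "<event_fd> <control_fd> <args>" protocol parsed above can be exercised from user space to register a usage threshold. A minimal sketch; the mount point, the group name and the 50M threshold are assumptions, not part of this patch:

/* Hedged example: get notified when memory.usage_in_bytes crosses 50M. */
#include <sys/eventfd.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
	const char *grp = "/sys/fs/cgroup/memory/demo";	/* assumed path */
	char path[256], line[80];
	int efd, ufd, ctrl;
	uint64_t ticks;

	efd = eventfd(0, 0);
	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", grp);
	ufd = open(path, O_RDONLY);
	snprintf(path, sizeof(path), "%s/cgroup.event_control", grp);
	ctrl = open(path, O_WRONLY);
	if (efd < 0 || ufd < 0 || ctrl < 0)
		return 1;

	/* "<event_fd> <control_fd> <args>": args is the threshold in bytes */
	snprintf(line, sizeof(line), "%d %d %llu", efd, ufd, 50ULL << 20);
	if (write(ctrl, line, strlen(line)) < 0)
		return 1;

	while (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
		printf("threshold crossed (%llu pending notification(s))\n",
		       (unsigned long long)ticks);
	return 0;
}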

#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG)
static int mem_cgroup_slab_show(struct seq_file *m, void *p)
{
@@ -5312,19 +4646,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-	struct mem_cgroup_event *event, *tmp;
 
-	/*
-	 * Unregister events and notify userspace.
-	 * Notify userspace about cgroup removing only after rmdir of cgroup
-	 * directory to avoid race between userspace and kernelspace.
-	 */
-	spin_lock_irq(&memcg->event_list_lock);
-	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
-		list_del_init(&event->list);
-		schedule_work(&event->remove);
-	}
-	spin_unlock_irq(&memcg->event_list_lock);
+	memcg1_css_offline(memcg);
 
 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);