per-zone and reclaim enhancements for memory controller: modifies vmscan.c for isolate globa/cgroup lru activity

When using memory controller, there are 2 levels of memory reclaim. 1. zone memory reclaim because of system/zone memory shortage. 2. memory cgroup memory reclaim because of hitting limit. These two can be distinguished by sc->mem_cgroup parameter. (scan_global_lru() macro) This patch tries to make memory cgroup reclaim routine avoid affecting system/zone memory reclaim. This patch inserts if (scan_global_lru()) and hook to memory_cgroup reclaim support functions. This patch can be a help for isolating system lru activity and group lru activity and shows what additional functions are necessary. * mem_cgroup_calc_mapped_ratio() ... calculate mapped ratio for cgroup. * mem_cgroup_reclaim_imbalance() ... calculate active/inactive balance in cgroup. * mem_cgroup_calc_reclaim_active() ... calculate the number of active pages to be scanned in this priority in mem_cgroup. * mem_cgroup_calc_reclaim_inactive() ... calculate the number of inactive pages to be scanned in this priority in mem_cgroup. * mem_cgroup_all_unreclaimable() .. checks cgroup's page is all unreclaimable or not. * mem_cgroup_get_reclaim_priority() ... * mem_cgroup_note_reclaim_priority() ... record reclaim priority (temporal) * mem_cgroup_remember_reclaim_priority() .... record reclaim priority as zone->prev_priority. This value is used for calc reclaim_mapped. [akpm@linux-foundation.org: fix unused var warning] Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: David Rientjes <rientjes@google.com> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: Kirill Korotaev <dev@sw.ru> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Paul Menage <menage@google.com> Cc: Pavel Emelianov <xemul@openvz.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> Cc: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-07 00:14:37 -08:00 · 2008-02-07 00:14:37 -08:00 · 1cfb419b39
parent cc38108e1b
commit 1cfb419b39
1 changed files with 201 additions and 131 deletions
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@ -856,6 +856,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		__mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
 		__mod_zone_page_state(zone, NR_INACTIVE,
 						-(nr_taken - nr_active));
+		if (scan_global_lru(sc))
 			zone->pages_scanned += nr_scan;
 		spin_unlock_irq(&zone->lru_lock);

@ -888,8 +889,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		if (current_is_kswapd()) {
 			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
 			__count_vm_events(KSWAPD_STEAL, nr_freed);
-		} else
+		} else if (scan_global_lru(sc))
 			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
+
 		__count_zone_vm_events(PGSTEAL, zone, nr_freed);

 		if (nr_taken == 0)
@ -943,49 +945,31 @@ static inline int zone_is_near_oom(struct zone *zone)
 }

 /*
- * This moves pages from the active list to the inactive list.
- *
- * We move them the other way if the page is referenced by one or more
- * processes, from rmap.
- *
- * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone->lru_lock across the whole operation.  But if
- * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone->lru_lock around each page.  It's impossible to balance
- * this, so instead we remove the pages from the LRU while processing them.
- * It is safe to rely on PG_active against the non-LRU pages in here because
- * nobody will play with that bit on a non-LRU page.
- *
- * The downside is that we have to touch page->_count against each page.
- * But we had to alter page->flags anyway.
+ * Determine we should try to reclaim mapped pages.
+ * This is called only when sc->mem_cgroup is NULL.
 */
-static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
-				struct scan_control *sc, int priority)
+static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
+				int priority)
 {
-	unsigned long pgmoved;
-	int pgdeactivate = 0;
-	unsigned long pgscanned;
-	LIST_HEAD(l_hold);	/* The pages which were snipped off */
-	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
-	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
-	struct page *page;
-	struct pagevec pvec;
-	int reclaim_mapped = 0;
-
-	if (sc->may_swap) {
 	long mapped_ratio;
 	long distress;
 	long swap_tendency;
 	long imbalance;
+	int reclaim_mapped = 0;
+	int prev_priority;

-		if (zone_is_near_oom(zone))
-			goto force_reclaim_mapped;
-
+	if (scan_global_lru(sc) && zone_is_near_oom(zone))
+		return 1;
 	/*
 	 * `distress' is a measure of how much trouble we're having
 	 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
 	 */
-		distress = 100 >> min(zone->prev_priority, priority);
+	if (scan_global_lru(sc))
+		prev_priority = zone->prev_priority;
+	else
+		prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
+
+	distress = 100 >> min(prev_priority, priority);

 	/*
 	 * The point of this algorithm is to decide when to start
@ -993,9 +977,12 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	 * how much memory
 	 * is mapped.
 	 */
+	if (scan_global_lru(sc))
 		mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
 				global_page_state(NR_ANON_PAGES)) * 100) /
 					vm_total_pages;
+	else
+		mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);

 	/*
 	 * Now decide how much we really want to unmap some pages.  The
@ -1023,8 +1010,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	 * Avoid div by zero with nr_inactive+1, and max resulting
 	 * value is vm_total_pages.
 	 */
+	if (scan_global_lru(sc)) {
 		imbalance  = zone_page_state(zone, NR_ACTIVE);
 		imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
+	} else
+		imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);

 	/*
 	 * Reduce the effect of imbalance if swappiness is low,
@ -1056,16 +1046,58 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	 * memory onto the inactive list.
 	 */
 	if (swap_tendency >= 100)
-force_reclaim_mapped:
 		reclaim_mapped = 1;
-	}
+
+	return reclaim_mapped;
+}
+
+/*
+ * This moves pages from the active list to the inactive list.
+ *
+ * We move them the other way if the page is referenced by one or more
+ * processes, from rmap.
+ *
+ * If the pages are mostly unmapped, the processing is fast and it is
+ * appropriate to hold zone->lru_lock across the whole operation.  But if
+ * the pages are mapped, the processing is slow (page_referenced()) so we
+ * should drop zone->lru_lock around each page.  It's impossible to balance
+ * this, so instead we remove the pages from the LRU while processing them.
+ * It is safe to rely on PG_active against the non-LRU pages in here because
+ * nobody will play with that bit on a non-LRU page.
+ *
+ * The downside is that we have to touch page->_count against each page.
+ * But we had to alter page->flags anyway.
+ */
+
+
+static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
+				struct scan_control *sc, int priority)
+{
+	unsigned long pgmoved;
+	int pgdeactivate = 0;
+	unsigned long pgscanned;
+	LIST_HEAD(l_hold);	/* The pages which were snipped off */
+	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
+	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
+	struct page *page;
+	struct pagevec pvec;
+	int reclaim_mapped = 0;
+
+	if (sc->may_swap)
+		reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);

 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
 	pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
 					ISOLATE_ACTIVE, zone,
 					sc->mem_cgroup, 1);
+	/*
+	 * zone->pages_scanned is used for detect zone's oom
+	 * mem_cgroup remembers nr_scan by itself.
+	 */
+	if (scan_global_lru(sc))
 		zone->pages_scanned += pgscanned;
+
 	__mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
 	spin_unlock_irq(&zone->lru_lock);

@ -1155,18 +1187,14 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 	unsigned long nr_to_scan;
 	unsigned long nr_reclaimed = 0;

+	if (scan_global_lru(sc)) {
 		/*
-	 * Add one to `nr_to_scan' just to make sure that the kernel will
-	 * slowly sift through the active list.
+		 * Add one to nr_to_scan just to make sure that the kernel
+		 * will slowly sift through the active list.
 		 */
 		zone->nr_scan_active +=
 			(zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
 		nr_active = zone->nr_scan_active;
-	if (nr_active >= sc->swap_cluster_max)
-		zone->nr_scan_active = 0;
-	else
-		nr_active = 0;
-
 		zone->nr_scan_inactive +=
 			(zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
 		nr_inactive = zone->nr_scan_inactive;
@ -1175,6 +1203,24 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 		else
 			nr_inactive = 0;

+		if (nr_active >= sc->swap_cluster_max)
+			zone->nr_scan_active = 0;
+		else
+			nr_active = 0;
+	} else {
+		/*
+		 * This reclaim occurs not because zone memory shortage but
+		 * because memory controller hits its limit.
+		 * Then, don't modify zone reclaim related data.
+		 */
+		nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
+					zone, priority);
+
+		nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
+					zone, priority);
+	}
+
+
 	while (nr_active || nr_inactive) {
 		if (nr_active) {
 			nr_to_scan = min(nr_active,
@ -1218,25 +1264,39 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 	unsigned long nr_reclaimed = 0;
 	int i;

+
 	sc->all_unreclaimable = 1;
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];

 		if (!populated_zone(zone))
 			continue;
-
+		/*
+		 * Take care memory controller reclaiming has small influence
+		 * to global LRU.
+		 */
+		if (scan_global_lru(sc)) {
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
-
 			note_zone_scanning_priority(zone, priority);

-		if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY)
+			if (zone_is_all_unreclaimable(zone) &&
+						priority != DEF_PRIORITY)
 				continue;	/* Let kswapd poll it */
-
 			sc->all_unreclaimable = 0;
+		} else {
+			/*
+			 * Ignore cpuset limitation here. We just want to reduce
+			 * # of used pages by us regardless of memory shortage.
+			 */
+			sc->all_unreclaimable = 0;
+			mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
+							priority);
+		}

 		nr_reclaimed += shrink_zone(priority, zone, sc);
 	}
+
 	return nr_reclaimed;
 }
 
@ -1264,8 +1324,12 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
 	unsigned long lru_pages = 0;
 	int i;

+	if (scan_global_lru(sc))
 		count_vm_event(ALLOCSTALL);
-
+	/*
+	 * mem_cgroup will not do shrink_slab.
+	 */
+	if (scan_global_lru(sc)) {
 		for (i = 0; zones[i] != NULL; i++) {
 			struct zone *zone = zones[i];

@ -1275,6 +1339,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
 			lru_pages += zone_page_state(zone, NR_ACTIVE)
 					+ zone_page_state(zone, NR_INACTIVE);
 		}
+	}

 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		sc->nr_scanned = 0;
@ -1330,6 +1395,8 @@ out:
 	 */
 	if (priority < 0)
 		priority = 0;
+
+	if (scan_global_lru(sc)) {
 		for (i = 0; zones[i] != NULL; i++) {
 			struct zone *zone = zones[i];

@ -1338,6 +1405,9 @@ out:

 			zone->prev_priority = priority;
 		}
+	} else
+		mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
+
 	return ret;
 }