0001:   /*
0002:    *  linux/mm/vmscan.c
0003:    *
0004:    *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
0005:    *
0006:    *  Swap reorganised 29.12.95, Stephen Tweedie.
0007:    *  kswapd added: 7.1.96  sct
0008:    *  Removed kswapd_ctl limits, and swap out as many pages as needed
0009:    *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
0010:    *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
0011:    *  Multiqueue VM started 5.8.00, Rik van Riel.
0012:    */
0013:   
0014:   #include <linux/mm.h>
0015:   #include <linux/module.h>
0016:   #include <linux/gfp.h>
0017:   #include <linux/kernel_stat.h>
0018:   #include <linux/swap.h>
0019:   #include <linux/pagemap.h>
0020:   #include <linux/init.h>
0021:   #include <linux/highmem.h>
0022:   #include <linux/vmpressure.h>
0023:   #include <linux/vmstat.h>
0024:   #include <linux/file.h>
0025:   #include <linux/writeback.h>
0026:   #include <linux/blkdev.h>
0027:   #include <linux/buffer_head.h>  /* for try_to_release_page(),
0028:                                           buffer_heads_over_limit */
0029:   #include <linux/mm_inline.h>
0030:   #include <linux/backing-dev.h>
0031:   #include <linux/rmap.h>
0032:   #include <linux/topology.h>
0033:   #include <linux/cpu.h>
0034:   #include <linux/cpuset.h>
0035:   #include <linux/compaction.h>
0036:   #include <linux/notifier.h>
0037:   #include <linux/rwsem.h>
0038:   #include <linux/delay.h>
0039:   #include <linux/kthread.h>
0040:   #include <linux/freezer.h>
0041:   #include <linux/memcontrol.h>
0042:   #include <linux/delayacct.h>
0043:   #include <linux/sysctl.h>
0044:   #include <linux/oom.h>
0045:   #include <linux/prefetch.h>
0046:   
0047:   #include <asm/tlbflush.h>
0048:   #include <asm/div64.h>
0049:   
0050:   #include <linux/swapops.h>
0051:   #include <linux/balloon_compaction.h>
0052:   
0053:   #include "internal.h"
0054:   
0055:   #define CREATE_TRACE_POINTS
0056:   #include <trace/events/vmscan.h>
0057:   
0058:   struct scan_control {
0059:           /* Incremented by the number of inactive pages that were scanned */
0060:           unsigned long nr_scanned;
0061:   
0062:           /* Number of pages freed so far during a call to shrink_zones() */
0063:           unsigned long nr_reclaimed;
0064:   
0065:           /* How many pages shrink_list() should reclaim */
0066:           unsigned long nr_to_reclaim;
0067:   
                /* Set for hibernation image allocation (see shrink_all_memory()) */
0068:           unsigned long hibernation_mode;
0069:   
0070:           /* This context's GFP mask */
0071:           gfp_t gfp_mask;
0072:   
                /* Can dirty pages be written back to disk during reclaim? */
0073:           int may_writepage;
0074:   
0075:           /* Can mapped pages be reclaimed? */
0076:           int may_unmap;
0077:   
0078:           /* Can pages be swapped as part of reclaim? */
0079:           int may_swap;
0080:   
                /* Order of the allocation that triggered this reclaim */
0081:           int order;
0082:   
0083:           /* Scan (total_size >> priority) pages at once */
0084:           int priority;
0085:   
0086:           /*
0087:            * The memory cgroup that hit its limit and as a result is the
0088:            * primary target of this reclaim invocation.
0089:            */
0090:           struct mem_cgroup *target_mem_cgroup;
0091:   
0092:           /*
0093:            * Nodemask of nodes allowed by the caller. If NULL, all nodes
0094:            * are scanned.
0095:            */
0096:           nodemask_t      *nodemask;
0097:   };
0098:   
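        /* The page at the tail of an LRU list; reclaim scans lists from the tail. */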
0099:   #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
0100:   
0101:   #ifdef ARCH_HAS_PREFETCH
0102:   #define prefetch_prev_lru_page(_page, _base, _field)                    \
0103:           do {                                                            \
0104:                   if ((_page)->lru.prev != _base) {                       \
0105:                           struct page *prev;                              \
0106:                                                                           \
0107:                           prev = lru_to_page(&(_page->lru));              \
0108:                           prefetch(&prev->_field);                        \
0109:                   }                                                       \
0110:           } while (0)
0111:   #else
0112:   #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
0113:   #endif
0114:   
0115:   #ifdef ARCH_HAS_PREFETCHW
0116:   #define prefetchw_prev_lru_page(_page, _base, _field)                   \
0117:           do {                                                            \
0118:                   if ((_page)->lru.prev != _base) {                       \
0119:                           struct page *prev;                              \
0120:                                                                           \
0121:                           prev = lru_to_page(&(_page->lru));              \
0122:                           prefetchw(&prev->_field);                       \
0123:                   }                                                       \
0124:           } while (0)
0125:   #else
0126:   #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
0127:   #endif
0128:   
0129:   /*
0130:    * From 0 .. 100.  Higher means more swappy.
0131:    */
0132:   int vm_swappiness = 60;
0133:   unsigned long vm_total_pages;   /* The total number of pages which the VM controls */
0134:   
0135:   static LIST_HEAD(shrinker_list);
0136:   static DECLARE_RWSEM(shrinker_rwsem);
0137:   
0138:   #ifdef CONFIG_MEMCG
0139:   static bool global_reclaim(struct scan_control *sc)
0140:   {
0141:           return !sc->target_mem_cgroup;
0142:   }
0143:   #else
0144:   static bool global_reclaim(struct scan_control *sc)
0145:   {
0146:           return true;
0147:   }
0148:   #endif
0149:   
0150:   static unsigned long zone_reclaimable_pages(struct zone *zone)
0151:   {
0152:           int nr;
0153:   
0154:           nr = zone_page_state(zone, NR_ACTIVE_FILE) +
0155:                zone_page_state(zone, NR_INACTIVE_FILE);
0156:   
0157:           if (get_nr_swap_pages() > 0)
0158:                   nr += zone_page_state(zone, NR_ACTIVE_ANON) +
0159:                         zone_page_state(zone, NR_INACTIVE_ANON);
0160:   
0161:           return nr;
0162:   }
0163:   
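        /*
         * Consider a zone reclaimable as long as fewer than six times its
         * reclaimable pages have been scanned.
         */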
0164:   bool zone_reclaimable(struct zone *zone)
0165:   {
0166:           return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
0167:   }
0168:   
0169:   static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
0170:   {
0171:           if (!mem_cgroup_disabled())
0172:                   return mem_cgroup_get_lru_size(lruvec, lru);
0173:   
0174:           return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
0175:   }
0176:   
0177:   /*
0178:    * Add a shrinker callback to be called from the vm.
0179:    */
0180:   int register_shrinker(struct shrinker *shrinker)
0181:   {
0182:           size_t size = sizeof(*shrinker->nr_deferred);
0183:   
0184:           /*
0185:            * If we only have one possible node in the system anyway, save
0186:            * ourselves the trouble and disable NUMA aware behavior. This way we
0187:            * will save memory and some small loop time later.
0188:            */
0189:           if (nr_node_ids == 1)
0190:                   shrinker->flags &= ~SHRINKER_NUMA_AWARE;
0191:   
0192:           if (shrinker->flags & SHRINKER_NUMA_AWARE)
0193:                   size *= nr_node_ids;
0194:   
0195:           shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
0196:           if (!shrinker->nr_deferred)
0197:                   return -ENOMEM;
0198:   
0199:           down_write(&shrinker_rwsem);
0200:           list_add_tail(&shrinker->list, &shrinker_list);
0201:           up_write(&shrinker_rwsem);
0202:           return 0;
0203:   }
0204:   EXPORT_SYMBOL(register_shrinker);
0205:   
0206:   /*
0207:    * Remove a shrinker from the list and free its deferred-work counters.
0208:    */
0209:   void unregister_shrinker(struct shrinker *shrinker)
0210:   {
0211:           down_write(&shrinker_rwsem);
0212:           list_del(&shrinker->list);
0213:           up_write(&shrinker_rwsem);
0214:           kfree(shrinker->nr_deferred);
0215:   }
0216:   EXPORT_SYMBOL(unregister_shrinker);
0217:   
0218:   #define SHRINK_BATCH 128
0219:   
0220:   static unsigned long
0221:   shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
0222:                    unsigned long nr_pages_scanned, unsigned long lru_pages)
0223:   {
0224:           unsigned long freed = 0;
0225:           unsigned long long delta;
0226:           long total_scan;
0227:           long max_pass;
0228:           long nr;
0229:           long new_nr;
0230:           int nid = shrinkctl->nid;
0231:           long batch_size = shrinker->batch ? shrinker->batch
0232:                                             : SHRINK_BATCH;
0233:   
0234:           max_pass = shrinker->count_objects(shrinker, shrinkctl);
0235:           if (max_pass == 0)
0236:                   return 0;
0237:   
0238:           /*
0239:            * copy the current shrinker scan count into a local variable
0240:            * and zero it so that other concurrent shrinker invocations
0241:            * don't also do this scanning work.
0242:            */
0243:           nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
0244:   
0245:           total_scan = nr;
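                /*
                 * Scan objects in proportion to the fraction of LRU pages just
                 * scanned, weighted by the shrinker's relative cost ("seeks"):
                 * delta ~= (nr_pages_scanned / lru_pages) * max_pass * (4 / seeks).
                 */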
0246:           delta = (4 * nr_pages_scanned) / shrinker->seeks;
0247:           delta *= max_pass;
0248:           do_div(delta, lru_pages + 1);
0249:           total_scan += delta;
0250:           if (total_scan < 0) {
0251:                   printk(KERN_ERR
0252:                   "shrink_slab: %pF negative objects to delete nr=%ld\n",
0253:                          shrinker->scan_objects, total_scan);
0254:                   total_scan = max_pass;
0255:           }
0256:   
0257:           /*
0258:            * We need to avoid excessive windup on filesystem shrinkers
0259:            * due to large numbers of GFP_NOFS allocations causing the
0260:            * shrinkers to return -1 all the time. This results in a large
0261:            * nr being built up so when a shrink that can do some work
0262:            * comes along it empties the entire cache due to nr >>>
0263:            * max_pass.  This is bad for sustaining a working set in
0264:            * memory.
0265:            *
0266:            * Hence only allow the shrinker to scan the entire cache when
0267:            * a large delta change is calculated directly.
0268:            */
0269:           if (delta < max_pass / 4)
0270:                   total_scan = min(total_scan, max_pass / 2);
0271:   
0272:           /*
0273:            * Avoid risking looping forever due to too large nr value:
0274:    * never try to free more than twice the estimated number of
0275:            * freeable entries.
0276:            */
0277:           if (total_scan > max_pass * 2)
0278:                   total_scan = max_pass * 2;
0279:   
0280:           trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
0281:                                   nr_pages_scanned, lru_pages,
0282:                                   max_pass, delta, total_scan);
0283:   
0284:           while (total_scan >= batch_size) {
0285:                   unsigned long ret;
0286:   
0287:                   shrinkctl->nr_to_scan = batch_size;
0288:                   ret = shrinker->scan_objects(shrinker, shrinkctl);
0289:                   if (ret == SHRINK_STOP)
0290:                           break;
0291:                   freed += ret;
0292:   
0293:                   count_vm_events(SLABS_SCANNED, batch_size);
0294:                   total_scan -= batch_size;
0295:   
0296:                   cond_resched();
0297:           }
0298:   
0299:           /*
0300:            * move the unused scan count back into the shrinker in a
0301:            * manner that handles concurrent updates. If we exhausted the
0302:            * scan, there is no need to do an update.
0303:            */
0304:           if (total_scan > 0)
0305:                   new_nr = atomic_long_add_return(total_scan,
0306:                                                   &shrinker->nr_deferred[nid]);
0307:           else
0308:                   new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
0309:   
0310:           trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
0311:           return freed;
0312:   }
0313:   
0314:   /*
0315:    * Call the shrink functions to age shrinkable caches
0316:    *
0317:    * Here we assume it costs one seek to replace a lru page and that it also
0318:    * takes a seek to recreate a cache object.  With this in mind we age equal
0319:    * percentages of the lru and ageable caches.  This should balance the seeks
0320:    * generated by these structures.
0321:    *
0322:    * If the vm encountered mapped pages on the LRU, it increases the pressure on
0323:    * slab to avoid swapping.
0324:    *
0325:    * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
0326:    *
0327:    * `lru_pages' represents the number of on-LRU pages in all the zones which
0328:    * are eligible for the caller's allocation attempt.  It is used for balancing
0329:    * slab reclaim versus page reclaim.
0330:    *
0331:    * Returns the number of slab objects which we shrunk.
0332:    */
0333:   unsigned long shrink_slab(struct shrink_control *shrinkctl,
0334:                             unsigned long nr_pages_scanned,
0335:                             unsigned long lru_pages)
0336:   {
0337:           struct shrinker *shrinker;
0338:           unsigned long freed = 0;
0339:   
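                /* Apply a minimum amount of slab pressure even if no LRU pages were scanned. */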
0340:           if (nr_pages_scanned == 0)
0341:                   nr_pages_scanned = SWAP_CLUSTER_MAX;
0342:   
0343:           if (!down_read_trylock(&shrinker_rwsem)) {
0344:                   /*
0345:                    * If we would return 0, our callers would understand that we
0346:                    * have nothing else to shrink and give up trying. By returning
0347:                    * 1 we keep it going and assume we'll be able to shrink next
0348:                    * time.
0349:                    */
0350:                   freed = 1;
0351:                   goto out;
0352:           }
0353:   
0354:           list_for_each_entry(shrinker, &shrinker_list, list) {
0355:                   for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
0356:                           if (!node_online(shrinkctl->nid))
0357:                                   continue;
0358:   
0359:                           if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
0360:                               (shrinkctl->nid != 0))
0361:                                   break;
0362:   
0363:                           freed += shrink_slab_node(shrinkctl, shrinker,
0364:                                    nr_pages_scanned, lru_pages);
0365:   
0366:                   }
0367:           }
0368:           up_read(&shrinker_rwsem);
0369:   out:
0370:           cond_resched();
0371:           return freed;
0372:   }
0373:   
0374:   static inline int is_page_cache_freeable(struct page *page)
0375:   {
0376:           /*
0377:            * A freeable page cache page is referenced only by the caller
0378:            * that isolated the page, the page cache radix tree and
0379:            * optional buffer heads at page->private.
0380:            */
0381:           return page_count(page) - page_has_private(page) == 2;
0382:   }
0383:   
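        /*
         * Writing to a device queue is allowed if the reclaimer has PF_SWAPWRITE
         * (e.g. kswapd), if the queue is not write-congested, or if the queue
         * belongs to the device the current task is already writing to.
         */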
0384:   static int may_write_to_queue(struct backing_dev_info *bdi,
0385:                                 struct scan_control *sc)
0386:   {
0387:           if (current->flags & PF_SWAPWRITE)
0388:                   return 1;
0389:           if (!bdi_write_congested(bdi))
0390:                   return 1;
0391:           if (bdi == current->backing_dev_info)
0392:                   return 1;
0393:           return 0;
0394:   }
0395:   
0396:   /*
0397:    * We detected a synchronous write error writing a page out.  Probably
0398:    * -ENOSPC.  We need to propagate that into the address_space for a subsequent
0399:    * fsync(), msync() or close().
0400:    *
0401:    * The tricky part is that after writepage we cannot touch the mapping: nothing
0402:    * prevents it from being freed up.  But we have a ref on the page and once
0403:    * that page is locked, the mapping is pinned.
0404:    *
0405:    * We're allowed to run sleeping lock_page() here because we know the caller has
0406:    * __GFP_FS.
0407:    */
0408:   static void handle_write_error(struct address_space *mapping,
0409:                                   struct page *page, int error)
0410:   {
0411:           lock_page(page);
0412:           if (page_mapping(page) == mapping)
0413:                   mapping_set_error(mapping, error);
0414:           unlock_page(page);
0415:   }
0416:   
0417:   /* possible outcome of pageout() */
0418:   typedef enum {
0419:           /* failed to write page out, page is locked */
0420:           PAGE_KEEP,
0421:           /* move page to the active list, page is locked */
0422:           PAGE_ACTIVATE,
0423:           /* page has been sent to the disk successfully, page is unlocked */
0424:           PAGE_SUCCESS,
0425:           /* page is clean and locked */
0426:           PAGE_CLEAN,
0427:   } pageout_t;
0428:   
0429:   /*
0430:    * pageout is called by shrink_page_list() for each dirty page.
0431:    * Calls ->writepage().
0432:    */
0433:   static pageout_t pageout(struct page *page, struct address_space *mapping,
0434:                            struct scan_control *sc)
0435:   {
0436:           /*
0437:            * If the page is dirty, only perform writeback if that write
0438:            * will be non-blocking, to prevent this allocation from being
0439:            * stalled by pagecache activity.  But note that there may be
0440:            * stalls if we need to run get_block().  We could test
0441:            * PagePrivate for that.
0442:            *
0443:            * If this process is currently in __generic_file_aio_write() against
0444:            * this page's queue, we can perform writeback even if that
0445:            * will block.
0446:            *
0447:            * If the page is swapcache, write it back even if that would
0448:            * block, for some throttling. This happens by accident, because
0449:            * swap_backing_dev_info is bust: it doesn't reflect the
0450:            * congestion state of the swapdevs.  Easy to fix, if needed.
0451:            */
0452:           if (!is_page_cache_freeable(page))
0453:                   return PAGE_KEEP;
0454:           if (!mapping) {
0455:                   /*
0456:                    * Some data journaling orphaned pages can have
0457:                    * page->mapping == NULL while being dirty with clean buffers.
0458:                    */
0459:                   if (page_has_private(page)) {
0460:                           if (try_to_free_buffers(page)) {
0461:                                   ClearPageDirty(page);
0462:                                   printk("%s: orphaned page\n", __func__);
0463:                                   return PAGE_CLEAN;
0464:                           }
0465:                   }
0466:                   return PAGE_KEEP;
0467:           }
0468:           if (mapping->a_ops->writepage == NULL)
0469:                   return PAGE_ACTIVATE;
0470:           if (!may_write_to_queue(mapping->backing_dev_info, sc))
0471:                   return PAGE_KEEP;
0472:   
0473:           if (clear_page_dirty_for_io(page)) {
0474:                   int res;
0475:                   struct writeback_control wbc = {
0476:                           .sync_mode = WB_SYNC_NONE,
0477:                           .nr_to_write = SWAP_CLUSTER_MAX,
0478:                           .range_start = 0,
0479:                           .range_end = LLONG_MAX,
0480:                           .for_reclaim = 1,
0481:                   };
0482:   
0483:                   SetPageReclaim(page);
0484:                   res = mapping->a_ops->writepage(page, &wbc);
0485:                   if (res < 0)
0486:                           handle_write_error(mapping, page, res);
0487:                   if (res == AOP_WRITEPAGE_ACTIVATE) {
0488:                           ClearPageReclaim(page);
0489:                           return PAGE_ACTIVATE;
0490:                   }
0491:   
0492:                   if (!PageWriteback(page)) {
0493:                           /* synchronous write or broken a_ops? */
0494:                           ClearPageReclaim(page);
0495:                   }
0496:                   trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
0497:                   inc_zone_page_state(page, NR_VMSCAN_WRITE);
0498:                   return PAGE_SUCCESS;
0499:           }
0500:   
0501:           return PAGE_CLEAN;
0502:   }
0503:   
0504:   /*
0505:    * Same as remove_mapping, but if the page is removed from the mapping, it
0506:    * gets returned with a refcount of 0.
0507:    */
0508:   static int __remove_mapping(struct address_space *mapping, struct page *page)
0509:   {
0510:           BUG_ON(!PageLocked(page));
0511:           BUG_ON(mapping != page_mapping(page));
0512:   
0513:           spin_lock_irq(&mapping->tree_lock);
0514:           /*
0515:            * The non-racy check for a busy page.
0516:            *
0517:            * Must be careful with the order of the tests. When someone has
0518:            * a ref to the page, it may be possible that they dirty it then
0519:            * drop the reference. So if PageDirty is tested before page_count
0520:            * here, then the following race may occur:
0521:            *
0522:            * get_user_pages(&page);
0523:            * [user mapping goes away]
0524:            * write_to(page);
0525:            *                              !PageDirty(page)    [good]
0526:            * SetPageDirty(page);
0527:            * put_page(page);
0528:            *                              !page_count(page)   [good, discard it]
0529:            *
0530:            * [oops, our write_to data is lost]
0531:            *
0532:            * Reversing the order of the tests ensures such a situation cannot
0533:            * escape unnoticed. The smp_rmb is needed to ensure the page->flags
0534:            * load is not satisfied before that of page->_count.
0535:            *
0536:            * Note that if SetPageDirty is always performed via set_page_dirty,
0537:            * and thus under tree_lock, then this ordering is not required.
0538:            */
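                /* Expect a refcount of 2: one from the isolating caller, one from the page cache. */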
0539:           if (!page_freeze_refs(page, 2))
0540:                   goto cannot_free;
0541:           /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
0542:           if (unlikely(PageDirty(page))) {
0543:                   page_unfreeze_refs(page, 2);
0544:                   goto cannot_free;
0545:           }
0546:   
0547:           if (PageSwapCache(page)) {
0548:                   swp_entry_t swap = { .val = page_private(page) };
0549:                   __delete_from_swap_cache(page);
0550:                   spin_unlock_irq(&mapping->tree_lock);
0551:                   swapcache_free(swap, page);
0552:           } else {
0553:                   void (*freepage)(struct page *);
0554:   
0555:                   freepage = mapping->a_ops->freepage;
0556:   
0557:                   __delete_from_page_cache(page);
0558:                   spin_unlock_irq(&mapping->tree_lock);
0559:                   mem_cgroup_uncharge_cache_page(page);
0560:   
0561:                   if (freepage != NULL)
0562:                           freepage(page);
0563:           }
0564:   
0565:           return 1;
0566:   
0567:   cannot_free:
0568:           spin_unlock_irq(&mapping->tree_lock);
0569:           return 0;
0570:   }
0571:   
0572:   /*
0573:    * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
0574:    * someone else has a ref on the page, abort and return 0.  If it was
0575:    * successfully detached, return 1.  Assumes the caller has a single ref on
0576:    * this page.
0577:    */
0578:   int remove_mapping(struct address_space *mapping, struct page *page)
0579:   {
0580:           if (__remove_mapping(mapping, page)) {
0581:                   /*
0582:                    * Unfreezing the refcount with 1 rather than 2 effectively
0583:                    * drops the pagecache ref for us without requiring another
0584:                    * atomic operation.
0585:                    */
0586:                   page_unfreeze_refs(page, 1);
0587:                   return 1;
0588:           }
0589:           return 0;
0590:   }
0591:   
0592:   /**
0593:    * putback_lru_page - put previously isolated page onto appropriate LRU list
0594:    * @page: page to be put back to appropriate lru list
0595:    *
0596:    * Add previously isolated @page to appropriate LRU list.
0597:    * Page may still be unevictable for other reasons.
0598:    *
0599:    * lru_lock must not be held, interrupts must be enabled.
0600:    */
0601:   void putback_lru_page(struct page *page)
0602:   {
0603:           bool is_unevictable;
0604:           int was_unevictable = PageUnevictable(page);
0605:   
0606:           VM_BUG_ON(PageLRU(page));
0607:   
0608:   redo:
0609:           ClearPageUnevictable(page);
0610:   
0611:           if (page_evictable(page)) {
0612:                   /*
0613:                    * For evictable pages, we can use the cache.
0614:                    * In the event of a race, the worst case is that we end up
0615:                    * with an unevictable page on the [in]active list.
0616:                    * We know how to handle that.
0617:                    */
0618:                   is_unevictable = false;
0619:                   lru_cache_add(page);
0620:           } else {
0621:                   /*
0622:                    * Put unevictable pages directly on zone's unevictable
0623:                    * list.
0624:                    */
0625:                   is_unevictable = true;
0626:                   add_page_to_unevictable_list(page);
0627:                   /*
0628:                    * When racing with an mlock or AS_UNEVICTABLE clearing
0629:                    * (page is unlocked) make sure that if the other thread
0630:                    * does not observe our setting of PG_lru and fails
0631:                    * isolation/check_move_unevictable_pages,
0632:                    * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
0633:                    * the page back to the evictable list.
0634:                    *
0635:                    * The other side is TestClearPageMlocked() or shmem_lock().
0636:                    */
0637:                   smp_mb();
0638:           }
0639:   
0640:           /*
0641:            * The page's status can change while we move it among the LRU lists.
0642:            * If an evictable page ends up on the unevictable list, it will never
0643:            * be freed. To avoid that, check again after adding it to the list.
0644:            */
0645:           if (is_unevictable && page_evictable(page)) {
0646:                   if (!isolate_lru_page(page)) {
0647:                           put_page(page);
0648:                           goto redo;
0649:                   }
0650:                   /* This means someone else dropped this page from the LRU.
0651:                    * It will be freed or put back on the LRU again, so there
0652:                    * is nothing to do here.
0653:                    */
0654:           }
0655:   
0656:           if (was_unevictable && !is_unevictable)
0657:                   count_vm_event(UNEVICTABLE_PGRESCUED);
0658:           else if (!was_unevictable && is_unevictable)
0659:                   count_vm_event(UNEVICTABLE_PGCULLED);
0660:   
0661:           put_page(page);         /* drop ref from isolate */
0662:   }
0663:   
0664:   enum page_references {
0665:           PAGEREF_RECLAIM,
0666:           PAGEREF_RECLAIM_CLEAN,
0667:           PAGEREF_KEEP,
0668:           PAGEREF_ACTIVATE,
0669:   };
0670:   
0671:   static enum page_references page_check_references(struct page *page,
0672:                                                     struct scan_control *sc)
0673:   {
0674:           int referenced_ptes, referenced_page;
0675:           unsigned long vm_flags;
0676:   
0677:           referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
0678:                                             &vm_flags);
0679:           referenced_page = TestClearPageReferenced(page);
0680:   
0681:           /*
0682:            * Mlock lost the isolation race with us.  Let try_to_unmap()
0683:            * move the page to the unevictable list.
0684:            */
0685:           if (vm_flags & VM_LOCKED)
0686:                   return PAGEREF_RECLAIM;
0687:   
0688:           if (referenced_ptes) {
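                        /* A referenced swap-backed (anon/shmem) page goes straight back to the active list. */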
0689:                   if (PageSwapBacked(page))
0690:                           return PAGEREF_ACTIVATE;
0691:                   /*
0692:                    * All mapped pages start out with page table
0693:                    * references from the instantiating fault, so we need
0694:                    * to look twice if a mapped file page is used more
0695:                    * than once.
0696:                    *
0697:                    * Mark it and spare it for another trip around the
0698:                    * inactive list.  Another page table reference will
0699:                    * lead to its activation.
0700:                    *
0701:                    * Note: the mark is set for activated pages as well
0702:                    * so that recently deactivated but used pages are
0703:                    * quickly recovered.
0704:                    */
0705:                   SetPageReferenced(page);
0706:   
0707:                   if (referenced_page || referenced_ptes > 1)
0708:                           return PAGEREF_ACTIVATE;
0709:   
0710:                   /*
0711:                    * Activate file-backed executable pages after first usage.
0712:                    */
0713:                   if (vm_flags & VM_EXEC)
0714:                           return PAGEREF_ACTIVATE;
0715:   
0716:                   return PAGEREF_KEEP;
0717:           }
0718:   
0719:           /* Reclaim if clean, defer dirty pages to writeback */
0720:           if (referenced_page && !PageSwapBacked(page))
0721:                   return PAGEREF_RECLAIM_CLEAN;
0722:   
0723:           return PAGEREF_RECLAIM;
0724:   }
0725:   
0726:   /* Check if a page is dirty or under writeback */
0727:   static void page_check_dirty_writeback(struct page *page,
0728:                                          bool *dirty, bool *writeback)
0729:   {
0730:           struct address_space *mapping;
0731:   
0732:           /*
0733:            * Anonymous pages are not handled by flushers and must be written
0734:            * from reclaim context. Do not stall reclaim based on them.
0735:            */
0736:           if (!page_is_file_cache(page)) {
0737:                   *dirty = false;
0738:                   *writeback = false;
0739:                   return;
0740:           }
0741:   
0742:           /* By default assume that the page flags are accurate */
0743:           *dirty = PageDirty(page);
0744:           *writeback = PageWriteback(page);
0745:   
0746:           /* Verify dirty/writeback state if the filesystem supports it */
0747:           if (!page_has_private(page))
0748:                   return;
0749:   
0750:           mapping = page_mapping(page);
0751:           if (mapping && mapping->a_ops->is_dirty_writeback)
0752:                   mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
0753:   }
0754:   
0755:   /*
0756:    * shrink_page_list() returns the number of reclaimed pages
0757:    */
0758:   static unsigned long shrink_page_list(struct list_head *page_list,
0759:                                         struct zone *zone,
0760:                                         struct scan_control *sc,
0761:                                         enum ttu_flags ttu_flags,
0762:                                         unsigned long *ret_nr_dirty,
0763:                                         unsigned long *ret_nr_unqueued_dirty,
0764:                                         unsigned long *ret_nr_congested,
0765:                                         unsigned long *ret_nr_writeback,
0766:                                         unsigned long *ret_nr_immediate,
0767:                                         bool force_reclaim)
0768:   {
0769:           LIST_HEAD(ret_pages);
0770:           LIST_HEAD(free_pages);
0771:           int pgactivate = 0;
0772:           unsigned long nr_unqueued_dirty = 0;
0773:           unsigned long nr_dirty = 0;
0774:           unsigned long nr_congested = 0;
0775:           unsigned long nr_reclaimed = 0;
0776:           unsigned long nr_writeback = 0;
0777:           unsigned long nr_immediate = 0;
0778:   
0779:           cond_resched();
0780:   
0781:           mem_cgroup_uncharge_start();
0782:           while (!list_empty(page_list)) {
0783:                   struct address_space *mapping;
0784:                   struct page *page;
0785:                   int may_enter_fs;
0786:                   enum page_references references = PAGEREF_RECLAIM_CLEAN;
0787:                   bool dirty, writeback;
0788:   
0789:                   cond_resched();
0790:   
0791:                   page = lru_to_page(page_list);
0792:                   list_del(&page->lru);
0793:   
0794:                   if (!trylock_page(page))
0795:                           goto keep;
0796:   
0797:                   VM_BUG_ON(PageActive(page));
0798:                   VM_BUG_ON(page_zone(page) != zone);
0799:   
0800:                   sc->nr_scanned++;
0801:   
0802:                   if (unlikely(!page_evictable(page)))
0803:                           goto cull_mlocked;
0804:   
0805:                   if (!sc->may_unmap && page_mapped(page))
0806:                           goto keep_locked;
0807:   
0808:                   /* Double the slab pressure for mapped and swapcache pages */
0809:                   if (page_mapped(page) || PageSwapCache(page))
0810:                           sc->nr_scanned++;
0811:   
0812:                   may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
0813:                           (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
0814:   
0815:                   /*
0816:                    * The number of dirty pages determines if a zone is marked
0817:                    * reclaim_congested which affects wait_iff_congested. kswapd
0818:                    * will stall and start writing pages if the tail of the LRU
0819:                    * is all dirty unqueued pages.
0820:                    */
0821:                   page_check_dirty_writeback(page, &dirty, &writeback);
0822:                   if (dirty || writeback)
0823:                           nr_dirty++;
0824:   
0825:                   if (dirty && !writeback)
0826:                           nr_unqueued_dirty++;
0827:   
0828:                   /*
0829:                    * Treat this page as congested if the underlying BDI is or if
0830:                    * pages are cycling through the LRU so quickly that the
0831:                    * pages marked for immediate reclaim are making it to the
0832:                    * end of the LRU a second time.
0833:                    */
0834:                   mapping = page_mapping(page);
0835:                   if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
0836:                       (writeback && PageReclaim(page)))
0837:                           nr_congested++;
0838:   
0839:                   /*
0840:                    * If a page at the tail of the LRU is under writeback, there
0841:                    * are three cases to consider.
0842:                    *
0843:                    * 1) If reclaim is encountering an excessive number of pages
0844:                    *    under writeback and this page is both under writeback and
0845:                    *    PageReclaim then it indicates that pages are being queued
0846:                    *    for IO but are being recycled through the LRU before the
0847:                    *    IO can complete. Waiting on the page itself risks an
0848:                    *    indefinite stall if it is impossible to writeback the
0849:                    *    page due to IO error or disconnected storage so instead
0850:                    *    note that the LRU is being scanned too quickly and the
0851:                    *    caller can stall after page list has been processed.
0852:                    *
0853:                    * 2) Global reclaim encounters a page, or memcg reclaim
0854:                    *    encounters a page that is not yet marked for immediate
0855:                    *    reclaim, or the caller does not have __GFP_IO. In this
0856:                    *    case, mark the page for immediate reclaim and continue scanning.
0857:                    *
0858:                    *    __GFP_IO is checked because a loop driver thread might
0859:                    *    enter reclaim, and deadlock if it waits on a page for
0860:                    *    which it is needed to do the write (loop masks off
0861:                    *    __GFP_IO|__GFP_FS for this reason); but more thought
0862:                    *    would probably show more reasons.
0863:                    *
0864:                    *    Don't require __GFP_FS, since we're not going into the
0865:                    *    FS, just waiting on its writeback completion. Worryingly,
0866:                    *    ext4 gfs2 and xfs allocate pages with
0867:                    *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
0868:                    *    may_enter_fs here is liable to OOM on them.
0869:                    *
0870:                    * 3) memcg encounters a page that is not already marked
0871:                    *    PageReclaim. memcg does not have any dirty pages
0872:                    *    throttling so we could easily OOM just because too many
0873:                    *    pages are in writeback and there is nothing else to
0874:                    *    reclaim. Wait for the writeback to complete.
0875:                    */
0876:                   if (PageWriteback(page)) {
0877:                           /* Case 1 above */
0878:                           if (current_is_kswapd() &&
0879:                               PageReclaim(page) &&
0880:                               zone_is_reclaim_writeback(zone)) {
0881:                                   nr_immediate++;
0882:                                   goto keep_locked;
0883:   
0884:                           /* Case 2 above */
0885:                           } else if (global_reclaim(sc) ||
0886:                               !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
0887:                                   /*
0888:                                    * This is slightly racy - end_page_writeback()
0889:                                    * might have just cleared PageReclaim, then
0890:                                    * setting PageReclaim here ends up being interpreted
0891:                                    * as PageReadahead - but that does not matter
0892:                                    * enough to care.  What we do want is for this
0893:                                    * page to have PageReclaim set next time memcg
0894:                                    * reclaim reaches the tests above, so it will
0895:                                    * then wait_on_page_writeback() to avoid OOM;
0896:                                    * and it's also appropriate in global reclaim.
0897:                                    */
0898:                                   SetPageReclaim(page);
0899:                                   nr_writeback++;
0900:   
0901:                                   goto keep_locked;
0902:   
0903:                           /* Case 3 above */
0904:                           } else {
0905:                                   wait_on_page_writeback(page);
0906:                           }
0907:                   }
0908:   
0909:                   if (!force_reclaim)
0910:                           references = page_check_references(page, sc);
0911:   
0912:                   switch (references) {
0913:                   case PAGEREF_ACTIVATE:
0914:                           goto activate_locked;
0915:                   case PAGEREF_KEEP:
0916:                           goto keep_locked;
0917:                   case PAGEREF_RECLAIM:
0918:                   case PAGEREF_RECLAIM_CLEAN:
0919:                           ; /* try to reclaim the page below */
0920:                   }
0921:   
0922:                   /*
0923:                    * Anonymous process memory has backing store?
0924:                    * Try to allocate it some swap space here.
0925:                    */
0926:                   if (PageAnon(page) && !PageSwapCache(page)) {
0927:                           if (!(sc->gfp_mask & __GFP_IO))
0928:                                   goto keep_locked;
0929:                           if (!add_to_swap(page, page_list))
0930:                                   goto activate_locked;
0931:                           may_enter_fs = 1;
0932:   
0933:                           /* Adding to swap updated mapping */
0934:                           mapping = page_mapping(page);
0935:                   }
0936:   
0937:                   /*
0938:                    * The page is mapped into the page tables of one or more
0939:                    * processes. Try to unmap it here.
0940:                    */
0941:                   if (page_mapped(page) && mapping) {
0942:                           switch (try_to_unmap(page, ttu_flags)) {
0943:                           case SWAP_FAIL:
0944:                                   goto activate_locked;
0945:                           case SWAP_AGAIN:
0946:                                   goto keep_locked;
0947:                           case SWAP_MLOCK:
0948:                                   goto cull_mlocked;
0949:                           case SWAP_SUCCESS:
0950:                                   ; /* try to free the page below */
0951:                           }
0952:                   }
0953:   
0954:                   if (PageDirty(page)) {
0955:                           /*
0956:                            * Only kswapd can write back filesystem pages, to
0957:                            * avoid the risk of stack overflow, and even then only
0958:                            * when many dirty pages have been encountered.
0959:                            */
0960:                           if (page_is_file_cache(page) &&
0961:                                           (!current_is_kswapd() ||
0962:                                            !zone_is_reclaim_dirty(zone))) {
0963:                                   /*
0964:                                    * Immediately reclaim when written back.
0965:                                    * Similar in principle to deactivate_page(),
0966:                                    * except we already have the page isolated
0967:                                    * and know it's dirty.
0968:                                    */
0969:                                   inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
0970:                                   SetPageReclaim(page);
0971:   
0972:                                   goto keep_locked;
0973:                           }
0974:   
0975:                           if (references == PAGEREF_RECLAIM_CLEAN)
0976:                                   goto keep_locked;
0977:                           if (!may_enter_fs)
0978:                                   goto keep_locked;
0979:                           if (!sc->may_writepage)
0980:                                   goto keep_locked;
0981:   
0982:                           /* Page is dirty, try to write it out here */
0983:                           switch (pageout(page, mapping, sc)) {
0984:                           case PAGE_KEEP:
0985:                                   goto keep_locked;
0986:                           case PAGE_ACTIVATE:
0987:                                   goto activate_locked;
0988:                           case PAGE_SUCCESS:
0989:                                   if (PageWriteback(page))
0990:                                           goto keep;
0991:                                   if (PageDirty(page))
0992:                                           goto keep;
0993:   
0994:                                   /*
0995:                                    * A synchronous write - probably a ramdisk.  Go
0996:                                    * ahead and try to reclaim the page.
0997:                                    */
0998:                                   if (!trylock_page(page))
0999:                                           goto keep;
1000:                                   if (PageDirty(page) || PageWriteback(page))
1001:                                           goto keep_locked;
1002:                                   mapping = page_mapping(page);
1003:                           case PAGE_CLEAN:
1004:                                   ; /* try to free the page below */
1005:                           }
1006:                   }
1007:   
1008:                   /*
1009:                    * If the page has buffers, try to free the buffer mappings
1010:                    * associated with this page. If we succeed we try to free
1011:                    * the page as well.
1012:                    *
1013:                    * We do this even if the page is PageDirty().
1014:                    * try_to_release_page() does not perform I/O, but it is
1015:                    * possible for a page to have PageDirty set, but it is actually
1016:                    * clean (all its buffers are clean).  This happens if the
1017:                    * buffers were written out directly, with submit_bh(). ext3
1018:                    * will do this, as well as the blockdev mapping.
1019:                    * try_to_release_page() will discover that cleanness and will
1020:                    * drop the buffers and mark the page clean - it can be freed.
1021:                    *
1022:                    * Rarely, pages can have buffers and no ->mapping.  These are
1023:                    * the pages which were not successfully invalidated in
1024:                    * truncate_complete_page().  We try to drop those buffers here
1025:                    * and if that worked, and the page is no longer mapped into
1026:                    * process address space (page_count == 1) it can be freed.
1027:                    * Otherwise, leave the page on the LRU so it is swappable.
1028:                    */
1029:                   if (page_has_private(page)) {
1030:                           if (!try_to_release_page(page, sc->gfp_mask))
1031:                                   goto activate_locked;
1032:                           if (!mapping && page_count(page) == 1) {
1033:                                   unlock_page(page);
1034:                                   if (put_page_testzero(page))
1035:                                           goto free_it;
1036:                                   else {
1037:                                           /*
1038:                                            * rare race with speculative reference.
1039:                                            * the speculative reference will free
1040:                                            * this page shortly, so we may
1041:                                            * increment nr_reclaimed here (and
1042:                                            * leave it off the LRU).
1043:                                            */
1044:                                           nr_reclaimed++;
1045:                                           continue;
1046:                                   }
1047:                           }
1048:                   }
1049:   
1050:                   if (!mapping || !__remove_mapping(mapping, page))
1051:                           goto keep_locked;
1052:   
1053:                   /*
1054:                    * At this point, we have no other references and there is
1055:                    * no way to pick any more up (removed from LRU, removed
1056:                    * from pagecache). Can use non-atomic bitops now (and
1057:                    * we obviously don't have to worry about waking up a process
1058:                    * waiting on the page lock, because there are no references).
1059:                    */
1060:                   __clear_page_locked(page);
1061:   free_it:
1062:                   nr_reclaimed++;
1063:   
1064:                   /*
1065:                    * Is there a need to periodically drain the free_pages list?
1066:                    * It would appear not, as the counts should be low.
1067:                    */
1068:                   list_add(&page->lru, &free_pages);
1069:                   continue;
1070:   
1071:   cull_mlocked:
1072:                   if (PageSwapCache(page))
1073:                           try_to_free_swap(page);
1074:                   unlock_page(page);
1075:                   putback_lru_page(page);
1076:                   continue;
1077:   
1078:   activate_locked:
1079:                   /* Not a candidate for swapping, so reclaim swap space. */
1080:                   if (PageSwapCache(page) && vm_swap_full())
1081:                           try_to_free_swap(page);
1082:                   VM_BUG_ON(PageActive(page));
1083:                   SetPageActive(page);
1084:                   pgactivate++;
1085:   keep_locked:
1086:                   unlock_page(page);
1087:   keep:
1088:                   list_add(&page->lru, &ret_pages);
1089:                   VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
1090:           }
1091:   
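                /* Free the reclaimed pages in one batch, treating them as cache-cold. */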
1092:           free_hot_cold_page_list(&free_pages, 1);
1093:   
1094:           list_splice(&ret_pages, page_list);
1095:           count_vm_events(PGACTIVATE, pgactivate);
1096:           mem_cgroup_uncharge_end();
1097:           *ret_nr_dirty += nr_dirty;
1098:           *ret_nr_congested += nr_congested;
1099:           *ret_nr_unqueued_dirty += nr_unqueued_dirty;
1100:           *ret_nr_writeback += nr_writeback;
1101:           *ret_nr_immediate += nr_immediate;
1102:           return nr_reclaimed;
1103:   }
1104:   
1105:   unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1106:                                               struct list_head *page_list)
1107:   {
1108:           struct scan_control sc = {
1109:                   .gfp_mask = GFP_KERNEL,
1110:                   .priority = DEF_PRIORITY,
1111:                   .may_unmap = 1,
1112:           };
1113:           unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
1114:           struct page *page, *next;
1115:           LIST_HEAD(clean_pages);
1116:   
1117:           list_for_each_entry_safe(page, next, page_list, lru) {
1118:                   if (page_is_file_cache(page) && !PageDirty(page) &&
1119:                       !isolated_balloon_page(page)) {
1120:                           ClearPageActive(page);
1121:                           list_move(&page->lru, &clean_pages);
1122:                   }
1123:           }
1124:   
1125:           ret = shrink_page_list(&clean_pages, zone, &sc,
1126:                           TTU_UNMAP|TTU_IGNORE_ACCESS,
1127:                           &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
1128:           list_splice(&clean_pages, page_list);
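                /* The pages reclaimed above are no longer isolated; subtract them from the isolated-file count. */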
1129:           __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
1130:           return ret;
1131:   }
1132:   
1133:   /*
1134:    * Attempt to remove the specified page from its LRU.  Only take this page
1135:    * if it is of the appropriate PageActive status.  Pages which are being
1136:    * freed elsewhere are also ignored.
1137:    *
1138:    * page:        page to consider
1139:    * mode:        one of the LRU isolation modes defined above
1140:    *
1141:    * returns 0 on success, -ve errno on failure.
1142:    */
1143:   int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1144:   {
1145:           int ret = -EINVAL;
1146:   
1147:           /* Only take pages on the LRU. */
1148:           if (!PageLRU(page))
1149:                   return ret;
1150:   
1151:           /* Compaction should not handle unevictable pages but CMA can do so */
1152:           if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1153:                   return ret;
1154:   
1155:           ret = -EBUSY;
1156:   
1157:           /*
1158:            * To minimise LRU disruption, the caller can indicate that it only
1159:            * wants to isolate pages it will be able to operate on without
1160:            * blocking - clean pages for the most part.
1161:            *
1162:            * ISOLATE_CLEAN means that only clean pages should be isolated. This
1163:            * is used by reclaim when it cannot write to backing storage.
1164:            *
1165:            * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
1166:            * that can be migrated without blocking.
1167:            */
1168:           if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
1169:                   /* All the caller can do on PageWriteback is block */
1170:                   if (PageWriteback(page))
1171:                           return ret;
1172:   
1173:                   if (PageDirty(page)) {
1174:                           struct address_space *mapping;
1175:   
1176:                           /* ISOLATE_CLEAN means only clean pages */
1177:                           if (mode & ISOLATE_CLEAN)
1178:                                   return ret;
1179:   
1180:                           /*
1181:                            * Only pages without mappings or that have a
1182:                            * ->migratepage callback are possible to migrate
1183:                            * without blocking
1184:                            */
1185:                           mapping = page_mapping(page);
1186:                           if (mapping && !mapping->a_ops->migratepage)
1187:                                   return ret;
1188:                   }
1189:           }
1190:   
1191:           if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1192:                   return ret;
1193:   
1194:           if (likely(get_page_unless_zero(page))) {
1195:                   /*
1196:                    * Be careful not to clear PageLRU until after we're
1197:                    * sure the page is not being freed elsewhere -- the
1198:                    * page release code relies on it.
1199:                    */
1200:                   ClearPageLRU(page);
1201:                   ret = 0;
1202:           }
1203:   
1204:           return ret;
1205:   }
1206:   
1207:   /*
1208:    * zone->lru_lock is heavily contended.  Some of the functions that
1209:    * shrink the lists perform better by taking out a batch of pages
1210:    * and working on them outside the LRU lock.
1211:    *
1212:    * For pagecache intensive workloads, this function is the hottest
1213:    * spot in the kernel (apart from copy_*_user functions).
1214:    *
1215:    * Appropriate locks must be held before calling this function.
1216:    *
1217:    * @nr_to_scan: The number of pages to look through on the list.
1218:    * @lruvec:     The LRU vector to pull pages from.
1219:    * @dst:        The temp list to put pages on to.
1220:    * @nr_scanned: The number of pages that were scanned.
1221:    * @sc:         The scan_control struct for this reclaim session
1222:    * @mode:       One of the LRU isolation modes
1223:    * @lru:        LRU list id for isolating
1224:    *
1225:    * returns how many pages were moved onto *@dst.
1226:    */
1227:   static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1228:                   struct lruvec *lruvec, struct list_head *dst,
1229:                   unsigned long *nr_scanned, struct scan_control *sc,
1230:                   isolate_mode_t mode, enum lru_list lru)
1231:   {
1232:           struct list_head *src = &lruvec->lists[lru];
1233:           unsigned long nr_taken = 0;
1234:           unsigned long scan;
1235:   
1236:           for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1237:                   struct page *page;
1238:                   int nr_pages;
1239:   
1240:                   page = lru_to_page(src);
1241:                   prefetchw_prev_lru_page(page, src, flags);
1242:   
1243:                   VM_BUG_ON(!PageLRU(page));
1244:   
1245:                   switch (__isolate_lru_page(page, mode)) {
1246:                   case 0:
1247:                           nr_pages = hpage_nr_pages(page);
1248:                           mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
1249:                           list_move(&page->lru, dst);
1250:                           nr_taken += nr_pages;
1251:                           break;
1252:   
1253:                   case -EBUSY:
1254:                           /* else it is being freed elsewhere */
1255:                           list_move(&page->lru, src);
1256:                           continue;
1257:   
1258:                   default:
1259:                           BUG();
1260:                   }
1261:           }
1262:   
1263:           *nr_scanned = scan;
1264:           trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
1265:                                       nr_taken, mode, is_file_lru(lru));
1266:           return nr_taken;
1267:   }
1268:   
1269:   /**
1270:    * isolate_lru_page - tries to isolate a page from its LRU list
1271:    * @page: page to isolate from its LRU list
1272:    *
1273:    * Isolates a @page from an LRU list, clears PageLRU and adjusts the
1274:    * vmstat statistic corresponding to whatever LRU list the page was on.
1275:    *
1276:    * Returns 0 if the page was removed from an LRU list.
1277:    * Returns -EBUSY if the page was not on an LRU list.
1278:    *
1279:    * The returned page will have PageLRU() cleared.  If it was found on
1280:    * the active list, it will have PageActive set.  If it was found on
1281:    * the unevictable list, it will have the PageUnevictable bit set. That flag
1282:    * may need to be cleared by the caller before letting the page go.
1283:    *
1284:    * The vmstat statistic corresponding to the list on which the page was
1285:    * found will be decremented.
1286:    *
1287:    * Restrictions:
1288:    * (1) Must be called with an elevated refcount on the page. This is a
1289:    *     fundamental difference from isolate_lru_pages (which is called
1290:    *     without a stable reference).
1291:    * (2) the lru_lock must not be held.
1292:    * (3) interrupts must be enabled.
1293:    */
1294:   int isolate_lru_page(struct page *page)
1295:   {
1296:           int ret = -EBUSY;
1297:   
1298:           VM_BUG_ON(!page_count(page));
1299:   
1300:           if (PageLRU(page)) {
1301:                   struct zone *zone = page_zone(page);
1302:                   struct lruvec *lruvec;
1303:   
1304:                   spin_lock_irq(&zone->lru_lock);
1305:                   lruvec = mem_cgroup_page_lruvec(page, zone);
1306:                   if (PageLRU(page)) {
1307:                           int lru = page_lru(page);
1308:                           get_page(page);
1309:                           ClearPageLRU(page);
1310:                           del_page_from_lru_list(page, lruvec, lru);
1311:                           ret = 0;
1312:                   }
1313:                   spin_unlock_irq(&zone->lru_lock);
1314:           }
1315:           return ret;
1316:   }
1317:   
1318:   /*
1319:    * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
1320:    * then get rescheduled. When there is a massive number of tasks doing page
1321:    * allocation, such sleeping direct reclaimers may keep piling up on each CPU;
1322:    * the LRU list will shrink and be scanned faster than necessary, leading to
1323:    * unnecessary swapping, thrashing and OOM.
1324:    */
1325:   static int too_many_isolated(struct zone *zone, int file,
1326:                   struct scan_control *sc)
1327:   {
1328:           unsigned long inactive, isolated;
1329:   
1330:           if (current_is_kswapd())
1331:                   return 0;
1332:   
1333:           if (!global_reclaim(sc))
1334:                   return 0;
1335:   
1336:           if (file) {
1337:                   inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1338:                   isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1339:           } else {
1340:                   inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1341:                   isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1342:           }
1343:   
1344:           /*
1345:            * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so that
1346:            * they won't get blocked by normal direct reclaimers and form a
1347:            * circular deadlock.
1348:            */
1349:           if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
1350:                   inactive >>= 3;
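                /* i.e. normal GFP_IOFS callers get throttled once isolated pages exceed inactive/8 */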
1351:   
1352:           return isolated > inactive;
1353:   }
1354:   
1355:   static noinline_for_stack void
1356:   putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1357:   {
1358:           struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1359:           struct zone *zone = lruvec_zone(lruvec);
1360:           LIST_HEAD(pages_to_free);
1361:   
1362:           /*
1363:            * Put back any unfreeable pages.
1364:            */
1365:           while (!list_empty(page_list)) {
1366:                   struct page *page = lru_to_page(page_list);
1367:                   int lru;
1368:   
1369:                   VM_BUG_ON(PageLRU(page));
1370:                   list_del(&page->lru);
1371:                   if (unlikely(!page_evictable(page))) {
1372:                           spin_unlock_irq(&zone->lru_lock);
1373:                           putback_lru_page(page);
1374:                           spin_lock_irq(&zone->lru_lock);
1375:                           continue;
1376:                   }
1377:   
1378:                   lruvec = mem_cgroup_page_lruvec(page, zone);
1379:   
1380:                   SetPageLRU(page);
1381:                   lru = page_lru(page);
1382:                   add_page_to_lru_list(page, lruvec, lru);
1383:   
1384:                   if (is_active_lru(lru)) {
1385:                           int file = is_file_lru(lru);
1386:                           int numpages = hpage_nr_pages(page);
1387:                           reclaim_stat->recent_rotated[file] += numpages;
1388:                   }
1389:                   if (put_page_testzero(page)) {
1390:                           __ClearPageLRU(page);
1391:                           __ClearPageActive(page);
1392:                           del_page_from_lru_list(page, lruvec, lru);
1393:   
1394:                           if (unlikely(PageCompound(page))) {
1395:                                   spin_unlock_irq(&zone->lru_lock);
1396:                                   (*get_compound_page_dtor(page))(page);
1397:                                   spin_lock_irq(&zone->lru_lock);
1398:                           } else
1399:                                   list_add(&page->lru, &pages_to_free);
1400:                   }
1401:           }
1402:   
1403:           /*
1404:            * To save our caller's stack, now use input list for pages to free.
1405:            */
1406:           list_splice(&pages_to_free, page_list);
1407:   }
1408:   
1409:   /*
1410:    * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
1411:    * of reclaimed pages
1412:    */
1413:   static noinline_for_stack unsigned long
1414:   shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1415:                        struct scan_control *sc, enum lru_list lru)
1416:   {
1417:           LIST_HEAD(page_list);
1418:           unsigned long nr_scanned;
1419:           unsigned long nr_reclaimed = 0;
1420:           unsigned long nr_taken;
1421:           unsigned long nr_dirty = 0;
1422:           unsigned long nr_congested = 0;
1423:           unsigned long nr_unqueued_dirty = 0;
1424:           unsigned long nr_writeback = 0;
1425:           unsigned long nr_immediate = 0;
1426:           isolate_mode_t isolate_mode = 0;
1427:           int file = is_file_lru(lru);
1428:           struct zone *zone = lruvec_zone(lruvec);
1429:           struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1430:   
1431:           while (unlikely(too_many_isolated(zone, file, sc))) {
1432:                   congestion_wait(BLK_RW_ASYNC, HZ/10);
1433:   
1434:                   /* We are about to die and free our memory. Return now. */
1435:                   if (fatal_signal_pending(current))
1436:                           return SWAP_CLUSTER_MAX;
1437:           }
1438:   
1439:           lru_add_drain();
1440:   
1441:           if (!sc->may_unmap)
1442:                   isolate_mode |= ISOLATE_UNMAPPED;
1443:           if (!sc->may_writepage)
1444:                   isolate_mode |= ISOLATE_CLEAN;
1445:   
1446:           spin_lock_irq(&zone->lru_lock);
1447:   
1448:           nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1449:                                        &nr_scanned, sc, isolate_mode, lru);
1450:   
1451:           __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1452:           __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1453:   
1454:           if (global_reclaim(sc)) {
1455:                   zone->pages_scanned += nr_scanned;
1456:                   if (current_is_kswapd())
1457:                           __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
1458:                   else
1459:                           __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
1460:           }
1461:           spin_unlock_irq(&zone->lru_lock);
1462:   
1463:           if (nr_taken == 0)
1464:                   return 0;
1465:   
1466:           nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
1467:                                   &nr_dirty, &nr_unqueued_dirty, &nr_congested,
1468:                                   &nr_writeback, &nr_immediate,
1469:                                   false);
1470:   
1471:           spin_lock_irq(&zone->lru_lock);
1472:   
1473:           reclaim_stat->recent_scanned[file] += nr_taken;
1474:   
1475:           if (global_reclaim(sc)) {
1476:                   if (current_is_kswapd())
1477:                           __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
1478:                                                  nr_reclaimed);
1479:                   else
1480:                           __count_zone_vm_events(PGSTEAL_DIRECT, zone,
1481:                                                  nr_reclaimed);
1482:           }
1483:   
1484:           putback_inactive_pages(lruvec, &page_list);
1485:   
1486:           __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1487:   
1488:           spin_unlock_irq(&zone->lru_lock);
1489:   
1490:           free_hot_cold_page_list(&page_list, 1);
1491:   
1492:           /*
1493:            * If reclaim is isolating dirty pages under writeback, it implies
1494:            * that the long-lived page allocation rate is exceeding the page
1495:            * laundering rate. Either the global limits are not being effective
1496:            * at throttling processes due to the page distribution throughout
1497:            * zones or there is heavy usage of a slow backing device. The
1498:            * only option is to throttle from reclaim context which is not ideal
1499:            * as there is no guarantee the dirtying process is throttled in the
1500:            * same way that balance_dirty_pages() does.
1501:            *
1502:            * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
1503:            * of pages under writeback that are flagged for immediate reclaim
1504:            * and stall if any are encountered in the nr_immediate check below.
1505:            */
1506:           if (nr_writeback && nr_writeback == nr_taken)
1507:                   zone_set_flag(zone, ZONE_WRITEBACK);
1508:   
1509:           /*
1510:            * memcg will stall in page writeback so only consider forcibly
1511:            * stalling for global reclaim
1512:            */
1513:           if (global_reclaim(sc)) {
1514:                   /*
1515:                    * Tag a zone as congested if all the dirty pages scanned were
1516:                    * backed by a congested BDI and wait_iff_congested will stall.
1517:                    */
1518:                   if (nr_dirty && nr_dirty == nr_congested)
1519:                           zone_set_flag(zone, ZONE_CONGESTED);
1520:   
1521:                   /*
1522:                    * If dirty pages are scanned that are not queued for IO, it
1523:                    * implies that flushers are not keeping up. In this case, flag
1524:                    * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
1525:                    * pages from reclaim context. It will forcibly stall in the
1526:                    * next check.
1527:                    */
1528:                   if (nr_unqueued_dirty == nr_taken)
1529:                           zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
1530:   
1531:                   /*
1532:                    * In addition, if kswapd scans pages marked for
1533:                    * immediate reclaim and under writeback (nr_immediate), it
1534:                    * implies that pages are cycling through the LRU faster than
1535:                    * they are written so also forcibly stall.
1536:                    */
1537:                   if (nr_unqueued_dirty == nr_taken || nr_immediate)
1538:                           congestion_wait(BLK_RW_ASYNC, HZ/10);
1539:           }
1540:   
1541:           /*
1542:            * Stall direct reclaim for IO completions if underlying BDIs or zone
1543:            * is congested. Allow kswapd to continue until it starts encountering
1544:            * unqueued dirty pages or cycling through the LRU too quickly.
1545:            */
1546:           if (!sc->hibernation_mode && !current_is_kswapd())
1547:                   wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1548:   
1549:           trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1550:                   zone_idx(zone),
1551:                   nr_scanned, nr_reclaimed,
1552:                   sc->priority,
1553:                   trace_shrink_flags(file));
1554:           return nr_reclaimed;
1555:   }
1556:   
1557:   /*
1558:    * This moves pages from the active list to the inactive list.
1559:    *
1560:    * We move them the other way if the page is referenced by one or more
1561:    * processes, from rmap.
1562:    *
1563:    * If the pages are mostly unmapped, the processing is fast and it is
1564:    * appropriate to hold zone->lru_lock across the whole operation.  But if
1565:    * the pages are mapped, the processing is slow (page_referenced()) so we
1566:    * should drop zone->lru_lock around each page.  It's impossible to balance
1567:    * this, so instead we remove the pages from the LRU while processing them.
1568:    * It is safe to rely on PG_active against the non-LRU pages in here because
1569:    * nobody will play with that bit on a non-LRU page.
1570:    *
1571:    * The downside is that we have to touch page->_count against each page.
1572:    * But we had to alter page->flags anyway.
1573:    */
1574:   
1575:   static void move_active_pages_to_lru(struct lruvec *lruvec,
1576:                                        struct list_head *list,
1577:                                        struct list_head *pages_to_free,
1578:                                        enum lru_list lru)
1579:   {
1580:           struct zone *zone = lruvec_zone(lruvec);
1581:           unsigned long pgmoved = 0;
1582:           struct page *page;
1583:           int nr_pages;
1584:   
1585:           while (!list_empty(list)) {
1586:                   page = lru_to_page(list);
1587:                   lruvec = mem_cgroup_page_lruvec(page, zone);
1588:   
1589:                   VM_BUG_ON(PageLRU(page));
1590:                   SetPageLRU(page);
1591:   
1592:                   nr_pages = hpage_nr_pages(page);
1593:                   mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
1594:                   list_move(&page->lru, &lruvec->lists[lru]);
1595:                   pgmoved += nr_pages;
1596:   
1597:                   if (put_page_testzero(page)) {
1598:                           __ClearPageLRU(page);
1599:                           __ClearPageActive(page);
1600:                           del_page_from_lru_list(page, lruvec, lru);
1601:   
1602:                           if (unlikely(PageCompound(page))) {
1603:                                   spin_unlock_irq(&zone->lru_lock);
1604:                                   (*get_compound_page_dtor(page))(page);
1605:                                   spin_lock_irq(&zone->lru_lock);
1606:                           } else
1607:                                   list_add(&page->lru, pages_to_free);
1608:                   }
1609:           }
1610:           __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1611:           if (!is_active_lru(lru))
1612:                   __count_vm_events(PGDEACTIVATE, pgmoved);
1613:   }
1614:   
1615:   static void shrink_active_list(unsigned long nr_to_scan,
1616:                                  struct lruvec *lruvec,
1617:                                  struct scan_control *sc,
1618:                                  enum lru_list lru)
1619:   {
1620:           unsigned long nr_taken;
1621:           unsigned long nr_scanned;
1622:           unsigned long vm_flags;
1623:           LIST_HEAD(l_hold);      /* The pages which were snipped off */
1624:           LIST_HEAD(l_active);
1625:           LIST_HEAD(l_inactive);
1626:           struct page *page;
1627:           struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1628:           unsigned long nr_rotated = 0;
1629:           isolate_mode_t isolate_mode = 0;
1630:           int file = is_file_lru(lru);
1631:           struct zone *zone = lruvec_zone(lruvec);
1632:   
1633:           lru_add_drain();
1634:   
1635:           if (!sc->may_unmap)
1636:                   isolate_mode |= ISOLATE_UNMAPPED;
1637:           if (!sc->may_writepage)
1638:                   isolate_mode |= ISOLATE_CLEAN;
1639:   
1640:           spin_lock_irq(&zone->lru_lock);
1641:   
1642:           nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
1643:                                        &nr_scanned, sc, isolate_mode, lru);
1644:           if (global_reclaim(sc))
1645:                   zone->pages_scanned += nr_scanned;
1646:   
1647:           reclaim_stat->recent_scanned[file] += nr_taken;
1648:   
1649:           __count_zone_vm_events(PGREFILL, zone, nr_scanned);
1650:           __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1651:           __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1652:           spin_unlock_irq(&zone->lru_lock);
1653:   
1654:           while (!list_empty(&l_hold)) {
1655:                   cond_resched();
1656:                   page = lru_to_page(&l_hold);
1657:                   list_del(&page->lru);
1658:   
1659:                   if (unlikely(!page_evictable(page))) {
1660:                           putback_lru_page(page);
1661:                           continue;
1662:                   }
1663:   
1664:                   if (unlikely(buffer_heads_over_limit)) {
1665:                           if (page_has_private(page) && trylock_page(page)) {
1666:                                   if (page_has_private(page))
1667:                                           try_to_release_page(page, 0);
1668:                                   unlock_page(page);
1669:                           }
1670:                   }
1671:   
1672:                   if (page_referenced(page, 0, sc->target_mem_cgroup,
1673:                                       &vm_flags)) {
1674:                           nr_rotated += hpage_nr_pages(page);
1675:                           /*
1676:                            * Identify referenced, file-backed active pages and
1677:                            * give them one more trip around the active list, so
1678:                            * that executable code gets a better chance to stay in
1679:                            * memory under moderate memory pressure.  Anon pages
1680:                            * are not likely to be evicted by use-once streaming
1681:                            * IO, plus JVM can create lots of anon VM_EXEC pages,
1682:                            * so we ignore them here.
1683:                            */
1684:                           if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
1685:                                   list_add(&page->lru, &l_active);
1686:                                   continue;
1687:                           }
1688:                   }
1689:   
1690:                   ClearPageActive(page);  /* we are de-activating */
1691:                   list_add(&page->lru, &l_inactive);
1692:           }
1693:   
1694:           /*
1695:            * Move pages back to the lru list.
1696:            */
1697:           spin_lock_irq(&zone->lru_lock);
1698:           /*
1699:            * Count referenced pages from currently used mappings as rotated,
1700:            * even though only some of them are actually re-activated.  This
1701:            * helps balance scan pressure between file and anonymous pages in
1702:            * get_scan_count().
1703:            */
1704:           reclaim_stat->recent_rotated[file] += nr_rotated;
1705:   
1706:           move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
1707:           move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
1708:           __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1709:           spin_unlock_irq(&zone->lru_lock);
1710:   
1711:           free_hot_cold_page_list(&l_hold, 1);
1712:   }
1713:   
1714:   #ifdef CONFIG_SWAP
1715:   static int inactive_anon_is_low_global(struct zone *zone)
1716:   {
1717:           unsigned long active, inactive;
1718:   
1719:           active = zone_page_state(zone, NR_ACTIVE_ANON);
1720:           inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1721:   
1722:           if (inactive * zone->inactive_ratio < active)
1723:                   return 1;
1724:   
1725:           return 0;
1726:   }
1727:   
1728:   /**
1729:    * inactive_anon_is_low - check if anonymous pages need to be deactivated
1730:    * @lruvec: LRU vector to check
1731:    *
1732:    * Returns true if the zone does not have enough inactive anon pages,
1733:    * meaning some active anon pages need to be deactivated.
1734:    */
1735:   static int inactive_anon_is_low(struct lruvec *lruvec)
1736:   {
1737:           /*
1738:            * If we don't have swap space, anonymous page deactivation
1739:            * is pointless.
1740:            */
1741:           if (!total_swap_pages)
1742:                   return 0;
1743:   
1744:           if (!mem_cgroup_disabled())
1745:                   return mem_cgroup_inactive_anon_is_low(lruvec);
1746:   
1747:           return inactive_anon_is_low_global(lruvec_zone(lruvec));
1748:   }
1749:   #else
1750:   static inline int inactive_anon_is_low(struct lruvec *lruvec)
1751:   {
1752:           return 0;
1753:   }
1754:   #endif
1755:   
1756:   /**
1757:    * inactive_file_is_low - check if file pages need to be deactivated
1758:    * @lruvec: LRU vector to check
1759:    *
1760:    * When the system is doing streaming IO, memory pressure here
1761:    * ensures that active file pages get deactivated, until more
1762:    * than half of the file pages are on the inactive list.
1763:    *
1764:    * Once we get to that situation, protect the system's working
1765:    * set from being evicted by disabling active file page aging.
1766:    *
1767:    * This uses a different ratio than the anonymous pages, because
1768:    * the page cache uses a use-once replacement algorithm.
1769:    */
1770:   static int inactive_file_is_low(struct lruvec *lruvec)
1771:   {
1772:           unsigned long inactive;
1773:           unsigned long active;
1774:   
1775:           inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
1776:           active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
1777:   
1778:           return active > inactive;
1779:   }
1780:   
1781:   static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
1782:   {
1783:           if (is_file_lru(lru))
1784:                   return inactive_file_is_low(lruvec);
1785:           else
1786:                   return inactive_anon_is_low(lruvec);
1787:   }
1788:   
1789:   static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1790:                                    struct lruvec *lruvec, struct scan_control *sc)
1791:   {
1792:           if (is_active_lru(lru)) {
1793:                   if (inactive_list_is_low(lruvec, lru))
1794:                           shrink_active_list(nr_to_scan, lruvec, sc, lru);
1795:                   return 0;
1796:           }
1797:   
1798:           return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
1799:   }
1800:   
1801:   static int vmscan_swappiness(struct scan_control *sc)
1802:   {
1803:           if (global_reclaim(sc))
1804:                   return vm_swappiness;
1805:           return mem_cgroup_swappiness(sc->target_mem_cgroup);
1806:   }
1807:   
1808:   enum scan_balance {
1809:           SCAN_EQUAL,
1810:           SCAN_FRACT,
1811:           SCAN_ANON,
1812:           SCAN_FILE,
1813:   };
1814:   
1815:   /*
1816:    * Determine how aggressively the anon and file LRU lists should be
1817:    * scanned.  The relative value of each set of LRU lists is determined
1818:    * by looking at the fraction of the pages scanned that we rotated back
1819:    * onto the active list instead of evicting.
1820:    *
1821:    * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
1822:    * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
1823:    */
1824:   static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1825:                              unsigned long *nr)
1826:   {
1827:           struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1828:           u64 fraction[2];
1829:           u64 denominator = 0;    /* gcc */
1830:           struct zone *zone = lruvec_zone(lruvec);
1831:           unsigned long anon_prio, file_prio;
1832:           enum scan_balance scan_balance;
1833:           unsigned long anon, file, free;
1834:           bool force_scan = false;
1835:           unsigned long ap, fp;
1836:           enum lru_list lru;
1837:   
1838:           /*
1839:            * If the zone or memcg is small, nr[l] can be 0.  This
1840:            * results in no scanning on this priority and a potential
1841:            * priority drop.  Global direct reclaim can go to the next
1842:            * zone and tends to have no problems. Global kswapd is for
1843:            * zone balancing and it needs to scan a minimum amount. When
1844:            * reclaiming for a memcg, a priority drop can cause high
1845:            * latencies, so it's better to scan a minimum amount there as
1846:            * well.
1847:            */
1848:           if (current_is_kswapd() && !zone_reclaimable(zone))
1849:                   force_scan = true;
1850:           if (!global_reclaim(sc))
1851:                   force_scan = true;
1852:   
1853:           /* If we have no swap space, do not bother scanning anon pages. */
1854:           if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
1855:                   scan_balance = SCAN_FILE;
1856:                   goto out;
1857:           }
1858:   
1859:           /*
1860:            * Global reclaim will swap to prevent OOM even with no
1861:            * swappiness, but memcg users want to use this knob to
1862:            * disable swapping for individual groups completely when
1863:            * using the memory controller's swap limit feature would be
1864:            * too expensive.
1865:            */
1866:           if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
1867:                   scan_balance = SCAN_FILE;
1868:                   goto out;
1869:           }
1870:   
1871:           /*
1872:            * Do not apply any pressure balancing cleverness when the
1873:            * system is close to OOM, scan both anon and file equally
1874:            * (unless the swappiness setting disagrees with swapping).
1875:            */
1876:           if (!sc->priority && vmscan_swappiness(sc)) {
1877:                   scan_balance = SCAN_EQUAL;
1878:                   goto out;
1879:           }
1880:   
1881:           anon  = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
1882:                   get_lru_size(lruvec, LRU_INACTIVE_ANON);
1883:           file  = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1884:                   get_lru_size(lruvec, LRU_INACTIVE_FILE);
1885:   
1886:           /*
1887:            * If it's foreseeable that reclaiming the file cache won't be
1888:            * enough to get the zone back into a desirable shape, we have
1889:            * to swap.  Better start now and leave the - probably heavily
1890:            * thrashing - remaining file pages alone.
1891:            */
1892:           if (global_reclaim(sc)) {
1893:                   free = zone_page_state(zone, NR_FREE_PAGES);
1894:                   if (unlikely(file + free <= high_wmark_pages(zone))) {
1895:                           scan_balance = SCAN_ANON;
1896:                           goto out;
1897:                   }
1898:           }
1899:   
1900:           /*
1901:            * There is enough inactive page cache, do not reclaim
1902:            * anything from the anonymous working set right now.
1903:            */
1904:           if (!inactive_file_is_low(lruvec)) {
1905:                   scan_balance = SCAN_FILE;
1906:                   goto out;
1907:           }
1908:   
1909:           scan_balance = SCAN_FRACT;
1910:   
1911:           /*
1912:            * With swappiness at 100, anonymous and file have the same priority.
1913:            * This scanning priority is essentially the inverse of IO cost.
1914:            */
1915:           anon_prio = vmscan_swappiness(sc);
1916:           file_prio = 200 - anon_prio;
1917:   
1918:           /*
1919:            * OK, so we have swap space and a fair amount of page cache
1920:            * pages.  We use the recently rotated / recently scanned
1921:            * ratios to determine how valuable each cache is.
1922:            *
1923:            * Because workloads change over time (and to avoid overflow)
1924:            * we keep these statistics as a floating average, which ends
1925:            * up weighing recent references more than old ones.
1926:            *
1927:            * anon in [0], file in [1]
1928:            */
1929:           spin_lock_irq(&zone->lru_lock);
1930:           if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1931:                   reclaim_stat->recent_scanned[0] /= 2;
1932:                   reclaim_stat->recent_rotated[0] /= 2;
1933:           }
1934:   
1935:           if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1936:                   reclaim_stat->recent_scanned[1] /= 2;
1937:                   reclaim_stat->recent_rotated[1] /= 2;
1938:           }
1939:   
1940:           /*
1941:            * The amount of pressure on anon vs file pages is inversely
1942:            * proportional to the fraction of recently scanned pages on
1943:            * each list that were recently referenced and in active use.
1944:            */
1945:           ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
1946:           ap /= reclaim_stat->recent_rotated[0] + 1;
1947:   
1948:           fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
1949:           fp /= reclaim_stat->recent_rotated[1] + 1;
1950:           spin_unlock_irq(&zone->lru_lock);
1951:   
1952:           fraction[0] = ap;
1953:           fraction[1] = fp;
1954:           denominator = ap + fp + 1;
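                /*
                 * Illustrative example with made-up numbers: with the default
                 * vm_swappiness of 60, anon_prio is 60 and file_prio is 140.
                 * If recent_scanned/recent_rotated are 1000/900 for anon and
                 * 1000/100 for file, then ap = 60 * 1001 / 901 = 66 and
                 * fp = 140 * 1001 / 101 = 1387, giving denominator = 1454:
                 * the file LRUs keep about 95% of their baseline scan target
                 * in the SCAN_FRACT case below, while anon keeps only ~4.5%.
                 */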
1955:   out:
1956:           for_each_evictable_lru(lru) {
1957:                   int file = is_file_lru(lru);
1958:                   unsigned long size;
1959:                   unsigned long scan;
1960:   
1961:                   size = get_lru_size(lruvec, lru);
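                        /*
                         * The scan window below is size >> priority; e.g. at
                         * DEF_PRIORITY (assumed to be 12 here), an LRU holding
                         * 4,194,304 pages gives 4194304 >> 12 = 1024 pages per
                         * pass, and each priority drop doubles that window.
                         */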
1962:                   scan = size >> sc->priority;
1963:   
1964:                   if (!scan && force_scan)
1965:                           scan = min(size, SWAP_CLUSTER_MAX);
1966:   
1967:                   switch (scan_balance) {
1968:                   case SCAN_EQUAL:
1969:                           /* Scan lists relative to size */
1970:                           break;
1971:                   case SCAN_FRACT:
1972:                           /*
1973:                            * Scan types proportional to swappiness and
1974:                            * their relative recent reclaim efficiency.
1975:                            */
1976:                           scan = div64_u64(scan * fraction[file], denominator);
1977:                           break;
1978:                   case SCAN_FILE:
1979:                   case SCAN_ANON:
1980:                           /* Scan one type exclusively */
1981:                           if ((scan_balance == SCAN_FILE) != file)
1982:                                   scan = 0;
1983:                           break;
1984:                   default:
1985:                           /* Look ma, no brain */
1986:                           BUG();
1987:                   }
1988:                   nr[lru] = scan;
1989:           }
1990:   }
1991:   
1992:   /*
1993:    * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
1994:    */
1995:   static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1996:   {
1997:           unsigned long nr[NR_LRU_LISTS];
1998:           unsigned long targets[NR_LRU_LISTS];
1999:           unsigned long nr_to_scan;
2000:           enum lru_list lru;
2001:           unsigned long nr_reclaimed = 0;
2002:           unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2003:           struct blk_plug plug;
2004:           bool scan_adjusted = false;
2005:   
2006:           get_scan_count(lruvec, sc, nr);
2007:   
2008:           /* Record the original scan target for proportional adjustments later */
2009:           memcpy(targets, nr, sizeof(nr));
2010:   
2011:           blk_start_plug(&plug);
2012:           while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2013:                                           nr[LRU_INACTIVE_FILE]) {
2014:                   unsigned long nr_anon, nr_file, percentage;
2015:                   unsigned long nr_scanned;
2016:   
2017:                   for_each_evictable_lru(lru) {
2018:                           if (nr[lru]) {
2019:                                   nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
2020:                                   nr[lru] -= nr_to_scan;
2021:   
2022:                                   nr_reclaimed += shrink_list(lru, nr_to_scan,
2023:                                                               lruvec, sc);
2024:                           }
2025:                   }
2026:   
2027:                   if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
2028:                           continue;
2029:   
2030:                   /*
2031:                    * For global direct reclaim, reclaim only the number of pages
2032:                    * requested. Less care is taken to scan proportionally as it
2033:                    * is more important to minimise direct reclaim stall latency
2034:                    * than it is to properly age the LRU lists.
2035:                    */
2036:                   if (global_reclaim(sc) && !current_is_kswapd())
2037:                           break;
2038:   
2039:                   /*
2040:                    * For kswapd and memcg, reclaim at least the number of pages
2041:                    * requested. Ensure that the anon and file LRUs shrink
2042:                    * proportionally to what was requested by get_scan_count(). We
2043:                    * stop reclaiming one LRU and reduce the amount of scanning of
2044:                    * the other proportionally to the original scan target.
2045:                    */
2046:                   nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2047:                   nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2048:   
2049:                   if (nr_file > nr_anon) {
2050:                           unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2051:                                                   targets[LRU_ACTIVE_ANON] + 1;
2052:                           lru = LRU_BASE;
2053:                           percentage = nr_anon * 100 / scan_target;
2054:                   } else {
2055:                           unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2056:                                                   targets[LRU_ACTIVE_FILE] + 1;
2057:                           lru = LRU_FILE;
2058:                           percentage = nr_file * 100 / scan_target;
2059:                   }
2060:   
2061:                   /* Stop scanning the smaller of the two LRUs */
2062:                   nr[lru] = 0;
2063:                   nr[lru + LRU_ACTIVE] = 0;
2064:   
2065:                   /*
2066:                    * Recalculate the other LRU scan count based on its original
2067:                    * scan target and the percentage scanning already complete
2068:                    */
2069:                   lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2070:                   nr_scanned = targets[lru] - nr[lru];
2071:                   nr[lru] = targets[lru] * (100 - percentage) / 100;
2072:                   nr[lru] -= min(nr[lru], nr_scanned);
2073:   
2074:                   lru += LRU_ACTIVE;
2075:                   nr_scanned = targets[lru] - nr[lru];
2076:                   nr[lru] = targets[lru] * (100 - percentage) / 100;
2077:                   nr[lru] -= min(nr[lru], nr_scanned);
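                        /*
                         * Illustrative example with made-up numbers: if the
                         * anon LRUs were targeted at ~200 pages and the file
                         * LRUs at 1000, and anon runs down to 50 pages
                         * remaining while more file pages than that remain,
                         * then percentage = 50 * 100 / 201 = 24 and the file
                         * targets are rescaled to 76% of their original
                         * values, minus what has already been scanned, so both
                         * LRU types end up scanned to roughly the same
                         * fraction (~75%) of their targets.
                         */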
2078:   
2079:                   scan_adjusted = true;
2080:           }
2081:           blk_finish_plug(&plug);
2082:           sc->nr_reclaimed += nr_reclaimed;
2083:   
2084:           /*
2085:            * Even if we did not try to evict anon pages at all, we want to
2086:            * rebalance the anon lru active/inactive ratio.
2087:            */
2088:           if (inactive_anon_is_low(lruvec))
2089:                   shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2090:                                      sc, LRU_ACTIVE_ANON);
2091:   
2092:           throttle_vm_writeout(sc->gfp_mask);
2093:   }
2094:   
2095:   /* Use reclaim/compaction for costly allocs or under memory pressure */
2096:   static bool in_reclaim_compaction(struct scan_control *sc)
2097:   {
2098:           if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2099:                           (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
2100:                            sc->priority < DEF_PRIORITY - 2))
2101:                   return true;
2102:   
2103:           return false;
2104:   }
2105:   
2106:   /*
2107:    * Reclaim/compaction is used for high-order allocation requests. It reclaims
2108:    * order-0 pages before compacting the zone. should_continue_reclaim() returns
2109:    * true if more pages should be reclaimed such that when the page allocator
2110:    * calls try_to_compact_zone() it will have enough free pages to succeed.
2111:    * It will give up earlier than that if there is difficulty reclaiming pages.
2112:    */
2113:   static inline bool should_continue_reclaim(struct zone *zone,
2114:                                           unsigned long nr_reclaimed,
2115:                                           unsigned long nr_scanned,
2116:                                           struct scan_control *sc)
2117:   {
2118:           unsigned long pages_for_compaction;
2119:           unsigned long inactive_lru_pages;
2120:   
2121:           /* If not in reclaim/compaction mode, stop */
2122:           if (!in_reclaim_compaction(sc))
2123:                   return false;
2124:   
2125:           /* Consider stopping depending on scan and reclaim activity */
2126:           if (sc->gfp_mask & __GFP_REPEAT) {
2127:                   /*
2128:                    * For __GFP_REPEAT allocations, stop reclaiming if the
2129:                    * full LRU list has been scanned and we are still failing
2130:                    * to reclaim pages. This full LRU scan is potentially
2131:                    * expensive but a __GFP_REPEAT caller really wants to succeed
2132:                    */
2133:                   if (!nr_reclaimed && !nr_scanned)
2134:                           return false;
2135:           } else {
2136:                   /*
2137:                    * For non-__GFP_REPEAT allocations which can presumably
2138:                    * fail without consequence, stop if we failed to reclaim
2139:                    * any pages from the last SWAP_CLUSTER_MAX number of
2140:                    * pages that were scanned. This will return to the
2141:                    * caller faster at the risk that reclaim/compaction and
2142:                    * the resulting allocation attempt fail.
2143:                    */
2144:                   if (!nr_reclaimed)
2145:                           return false;
2146:           }
2147:   
2148:           /*
2149:            * If we have not reclaimed enough pages for compaction and the
2150:            * inactive lists are large enough, continue reclaiming
2151:            */
2152:           pages_for_compaction = (2UL << sc->order);
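                /*
                 * e.g. for an order-9 request (a 2MB THP with 4K pages) this
                 * is 2 << 9 = 1024 pages, i.e. twice the size of the requested
                 * allocation.
                 */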
2153:           inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
2154:           if (get_nr_swap_pages() > 0)
2155:                   inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
2156:           if (sc->nr_reclaimed < pages_for_compaction &&
2157:                           inactive_lru_pages > pages_for_compaction)
2158:                   return true;
2159:   
2160:           /* If compaction would go ahead or the allocation would succeed, stop */
2161:           switch (compaction_suitable(zone, sc->order)) {
2162:           case COMPACT_PARTIAL:
2163:           case COMPACT_CONTINUE:
2164:                   return false;
2165:           default:
2166:                   return true;
2167:           }
2168:   }
2169:   
2170:   static void shrink_zone(struct zone *zone, struct scan_control *sc)
2171:   {
2172:           unsigned long nr_reclaimed, nr_scanned;
2173:   
2174:           do {
2175:                   struct mem_cgroup *root = sc->target_mem_cgroup;
2176:                   struct mem_cgroup_reclaim_cookie reclaim = {
2177:                           .zone = zone,
2178:                           .priority = sc->priority,
2179:                   };
2180:                   struct mem_cgroup *memcg;
2181:   
2182:                   nr_reclaimed = sc->nr_reclaimed;
2183:                   nr_scanned = sc->nr_scanned;
2184:   
2185:                   memcg = mem_cgroup_iter(root, NULL, &reclaim);
2186:                   do {
2187:                           struct lruvec *lruvec;
2188:   
2189:                           lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2190:   
2191:                           shrink_lruvec(lruvec, sc);
2192:   
2193:                           /*
2194:                            * Direct reclaim and kswapd have to scan all memory
2195:                            * cgroups to fulfill the overall scan target for the
2196:                            * zone.
2197:                            *
2198:                            * Limit reclaim, on the other hand, only cares about
2199:                            * nr_to_reclaim pages to be reclaimed and it will
2200:                            * retry with decreasing priority if one round over the
2201:                            * whole hierarchy is not sufficient.
2202:                            */
2203:                           if (!global_reclaim(sc) &&
2204:                                           sc->nr_reclaimed >= sc->nr_to_reclaim) {
2205:                                   mem_cgroup_iter_break(root, memcg);
2206:                                   break;
2207:                           }
2208:                           memcg = mem_cgroup_iter(root, memcg, &reclaim);
2209:                   } while (memcg);
2210:   
2211:                   vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
2212:                              sc->nr_scanned - nr_scanned,
2213:                              sc->nr_reclaimed - nr_reclaimed);
2214:   
2215:           } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
2216:                                            sc->nr_scanned - nr_scanned, sc));
2217:   }
2218:   
2219:   /* Returns true if compaction should go ahead for a high-order request */
2220:   static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2221:   {
2222:           unsigned long balance_gap, watermark;
2223:           bool watermark_ok;
2224:   
2225:           /* Do not consider compaction for orders reclaim is meant to satisfy */
2226:           if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
2227:                   return false;
2228:   
2229:           /*
2230:            * Compaction takes time to run and there are potentially other
2231:            * callers using the pages just freed. Continue reclaiming until
2232:            * there is a buffer of free pages available to give compaction
2233:            * a reasonable chance of completing and allocating the page
2234:            */
2235:           balance_gap = min(low_wmark_pages(zone),
2236:                   (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2237:                           KSWAPD_ZONE_BALANCE_GAP_RATIO);
2238:           watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
2239:           watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
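                /*
                 * Illustrative numbers, assuming KSWAPD_ZONE_BALANCE_GAP_RATIO
                 * is 100: for a zone with 1,000,000 managed pages and a low
                 * watermark of 2,000 pages, balance_gap = min(2000, 10000) =
                 * 2000, so an order-9 request is tested against a watermark of
                 * roughly high_wmark + 2000 + 1024 free pages.
                 */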
2240:   
2241:           /*
2242:            * If compaction is deferred, reclaim up to a point where
2243:            * compaction will have a chance of success when re-enabled
2244:            */
2245:           if (compaction_deferred(zone, sc->order))
2246:                   return watermark_ok;
2247:   
2248:           /* If compaction is not ready to start, keep reclaiming */
2249:           if (!compaction_suitable(zone, sc->order))
2250:                   return false;
2251:   
2252:           return watermark_ok;
2253:   }
2254:   
2255:   /*
2256:    * This is the direct reclaim path, for page-allocating processes.  We only
2257:    * try to reclaim pages from zones which will satisfy the caller's allocation
2258:    * request.
2259:    *
2260:    * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
2261:    * Because:
2262:    * a) The caller may be trying to free *extra* pages to satisfy a higher-order
2263:    *    allocation or
2264:    * b) The target zone may be at high_wmark_pages(zone) but the lower zones
2265:    *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
2266:    *    zone defense algorithm.
2267:    *
2268:    * If a zone is deemed to be full of pinned pages then just give it a light
2269:    * scan then give up on it.
2270:    *
2271:    * This function returns true if a zone is being reclaimed for a costly
2272:    * high-order allocation and compaction is ready to begin. This indicates to
2273:    * the caller that it should consider retrying the allocation instead of
2274:    * further reclaim.
2275:    */
2276:   static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2277:   {
2278:           struct zoneref *z;
2279:           struct zone *zone;
2280:           unsigned long nr_soft_reclaimed;
2281:           unsigned long nr_soft_scanned;
2282:           bool aborted_reclaim = false;
2283:   
2284:           /*
2285:            * If the number of buffer_heads in the machine exceeds the maximum
2286:            * allowed level, force direct reclaim to scan the highmem zone as
2287:            * highmem pages could be pinning lowmem pages storing buffer_heads
2288:            */
2289:           if (buffer_heads_over_limit)
2290:                   sc->gfp_mask |= __GFP_HIGHMEM;
2291:   
2292:           for_each_zone_zonelist_nodemask(zone, z, zonelist,
2293:                                           gfp_zone(sc->gfp_mask), sc->nodemask) {
2294:                   if (!populated_zone(zone))
2295:                           continue;
2296:                   /*
2297:                    * Take care that memory controller reclaim has only a small
2298:                    * influence on the global LRU.
2299:                    */
2300:                   if (global_reclaim(sc)) {
2301:                           if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2302:                                   continue;
2303:                           if (sc->priority != DEF_PRIORITY &&
2304:                               !zone_reclaimable(zone))
2305:                                   continue;       /* Let kswapd poll it */
2306:                           if (IS_ENABLED(CONFIG_COMPACTION)) {
2307:                                   /*
2308:                                    * If we already have plenty of memory free for
2309:                                    * compaction in this zone, don't free any more.
2310:                                    * Even though compaction is invoked for any
2311:                                    * non-zero order, only frequent costly order
2312:                                    * reclamation is disruptive enough to become a
2313:                                    * noticeable problem, like transparent huge
2314:                                    * page allocations.
2315:                                    */
2316:                                   if (compaction_ready(zone, sc)) {
2317:                                           aborted_reclaim = true;
2318:                                           continue;
2319:                                   }
2320:                           }
2321:                           /*
2322:                            * This steals pages from memory cgroups over softlimit
2323:                            * and returns the number of reclaimed pages and
2324:                            * scanned pages. This works for global memory pressure
2325:                            * and balancing, not for a memcg's limit.
2326:                            */
2327:                           nr_soft_scanned = 0;
2328:                           nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2329:                                                   sc->order, sc->gfp_mask,
2330:                                                   &nr_soft_scanned);
2331:                           sc->nr_reclaimed += nr_soft_reclaimed;
2332:                           sc->nr_scanned += nr_soft_scanned;
2333:                           /* need some check to avoid calling shrink_zone() again */
2334:                   }
2335:   
2336:                   shrink_zone(zone, sc);
2337:           }
2338:   
2339:           return aborted_reclaim;
2340:   }
2341:   
2342:   /* All zones in zonelist are unreclaimable? */
2343:   static bool all_unreclaimable(struct zonelist *zonelist,
2344:                   struct scan_control *sc)
2345:   {
2346:           struct zoneref *z;
2347:           struct zone *zone;
2348:   
2349:           for_each_zone_zonelist_nodemask(zone, z, zonelist,
2350:                           gfp_zone(sc->gfp_mask), sc->nodemask) {
2351:                   if (!populated_zone(zone))
2352:                           continue;
2353:                   if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2354:                           continue;
2355:                   if (zone_reclaimable(zone))
2356:                           return false;
2357:           }
2358:   
2359:           return true;
2360:   }
2361:   
2362:   /*
2363:    * This is the main entry point to direct page reclaim.
2364:    *
2365:    * If a full scan of the inactive list fails to free enough memory then we
2366:    * are "out of memory" and something needs to be killed.
2367:    *
2368:    * If the caller is !__GFP_FS then the probability of a failure is reasonably
2369:    * high - the zone may be full of dirty or under-writeback pages, which this
2370:    * caller can't do much about.  We kick the writeback threads and take explicit
2371:    * naps in the hope that some of these pages can be written.  But if the
2372:    * allocating task holds filesystem locks which prevent writeout this might not
2373:    * work, and the allocation attempt will fail.
2374:    *
2375:    * returns:     0, if no pages reclaimed
2376:    *              else, the number of pages reclaimed
2377:    */
2378:   static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2379:                                           struct scan_control *sc,
2380:                                           struct shrink_control *shrink)
2381:   {
2382:           unsigned long total_scanned = 0;
2383:           struct reclaim_state *reclaim_state = current->reclaim_state;
2384:           struct zoneref *z;
2385:           struct zone *zone;
2386:           unsigned long writeback_threshold;
2387:           bool aborted_reclaim;
2388:   
2389:           delayacct_freepages_start();
2390:   
2391:           if (global_reclaim(sc))
2392:                   count_vm_event(ALLOCSTALL);
2393:   
2394:           do {
2395:                   vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
2396:                                   sc->priority);
2397:                   sc->nr_scanned = 0;
2398:                   aborted_reclaim = shrink_zones(zonelist, sc);
2399:   
2400:                   /*
2401:                    * Don't shrink slabs when reclaiming memory from over limit
2402:                    * cgroups but do shrink slab at least once when aborting
2403:                    * reclaim for compaction to avoid unevenly scanning file/anon
2404:                    * LRU pages over slab pages.
2405:                    */
2406:                   if (global_reclaim(sc)) {
2407:                           unsigned long lru_pages = 0;
2408:   
2409:                           nodes_clear(shrink->nodes_to_scan);
2410:                           for_each_zone_zonelist(zone, z, zonelist,
2411:                                           gfp_zone(sc->gfp_mask)) {
2412:                                   if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2413:                                           continue;
2414:   
2415:                                   lru_pages += zone_reclaimable_pages(zone);
2416:                                   node_set(zone_to_nid(zone),
2417:                                            shrink->nodes_to_scan);
2418:                           }
2419:   
2420:                           shrink_slab(shrink, sc->nr_scanned, lru_pages);
2421:                           if (reclaim_state) {
2422:                                   sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2423:                                   reclaim_state->reclaimed_slab = 0;
2424:                           }
2425:                   }
2426:                   total_scanned += sc->nr_scanned;
2427:                   if (sc->nr_reclaimed >= sc->nr_to_reclaim)
2428:                           goto out;
2429:   
2430:                   /*
2431:                    * If we're having trouble reclaiming, start doing
2432:                    * writepage even in laptop mode.
2433:                    */
2434:                   if (sc->priority < DEF_PRIORITY - 2)
2435:                           sc->may_writepage = 1;
2436:   
2437:                   /*
2438:                    * Try to write back as many pages as we just scanned.  This
2439:                    * tends to cause slow streaming writers to write data to the
2440:                    * disk smoothly, at the dirtying rate, which is nice.   But
2441:                    * that's undesirable in laptop mode, where we *want* lumpy
2442:                    * writeout.  So in laptop mode, write out the whole world.
2443:                    */
2444:                   writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
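                        /* i.e. wake the flushers once we have scanned 150% of the reclaim target */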
2445:                   if (total_scanned > writeback_threshold) {
2446:                           wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
2447:                                                   WB_REASON_TRY_TO_FREE_PAGES);
2448:                           sc->may_writepage = 1;
2449:                   }
2450:           } while (--sc->priority >= 0 && !aborted_reclaim);
2451:   
2452:   out:
2453:           delayacct_freepages_end();
2454:   
2455:           if (sc->nr_reclaimed)
2456:                   return sc->nr_reclaimed;
2457:   
2458:           /*
2459:            * As hibernation is going on, kswapd is freezed so that it can't mark
2460:            * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
2461:            * check.
2462:            */
2463:           if (oom_killer_disabled)
2464:                   return 0;
2465:   
2466:           /* Aborted reclaim to try compaction? don't OOM, then */
2467:           if (aborted_reclaim)
2468:                   return 1;
2469:   
2470:           /* top priority shrink_zones still had more to do? don't OOM, then */
2471:           if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
2472:                   return 1;
2473:   
2474:           return 0;
2475:   }
2476:   
2477:   static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2478:   {
2479:           struct zone *zone;
2480:           unsigned long pfmemalloc_reserve = 0;
2481:           unsigned long free_pages = 0;
2482:           int i;
2483:           bool wmark_ok;
2484:   
2485:           for (i = 0; i <= ZONE_NORMAL; i++) {
2486:                   zone = &pgdat->node_zones[i];
2487:                   if (!populated_zone(zone))
2488:                           continue;
2489:   
2490:                   pfmemalloc_reserve += min_wmark_pages(zone);
2491:                   free_pages += zone_page_state(zone, NR_FREE_PAGES);
2492:           }
2493:   
2494:           /* If there are no reserves (unexpected config) then do not throttle */
2495:           if (!pfmemalloc_reserve)
2496:                   return true;
2497:   
2498:           wmark_ok = free_pages > pfmemalloc_reserve / 2;
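        /*
         * For a sense of scale: with min watermarks of, say, 128 pages in
         * ZONE_DMA and 8192 pages in ZONE_NORMAL, the reserve is 8320 pages
         * and direct reclaimers get throttled once free pages in those zones
         * drop to 4160 or below.
         */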
2499:   
2500:           /* kswapd must be awake if processes are being throttled */
2501:           if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2502:                   pgdat->classzone_idx = min(pgdat->classzone_idx,
2503:                                                   (enum zone_type)ZONE_NORMAL);
2504:                   wake_up_interruptible(&pgdat->kswapd_wait);
2505:           }
2506:   
2507:           return wmark_ok;
2508:   }
2509:   
2510:   /*
2511:    * Throttle direct reclaimers if backing storage is backed by the network
2512:    * and the PFMEMALLOC reserve for the preferred node is getting dangerously
2513:    * depleted. kswapd will continue to make progress and wake the processes
2514:    * when the low watermark is reached.
2515:    *
2516:    * Returns true if a fatal signal was delivered during throttling. If this
2517:    * happens, the page allocator should not consider triggering the OOM killer.
2518:    */
2519:   static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2520:                                           nodemask_t *nodemask)
2521:   {
2522:           struct zoneref *z;
2523:           struct zone *zone;
2524:           pg_data_t *pgdat = NULL;
2525:   
2526:           /*
2527:            * Kernel threads should not be throttled as they may be indirectly
2528:            * responsible for cleaning pages necessary for reclaim to make forward
2529:            * progress. kjournald for example may enter direct reclaim while
         * committing a transaction where throttling it could force other
2531:            * processes to block on log_wait_commit().
2532:            */
2533:           if (current->flags & PF_KTHREAD)
2534:                   goto out;
2535:   
2536:           /*
2537:            * If a fatal signal is pending, this process should not throttle.
         * It should return quickly so it can exit and free its memory.
2539:            */
2540:           if (fatal_signal_pending(current))
2541:                   goto out;
2542:   
2543:           /*
2544:            * Check if the pfmemalloc reserves are ok by finding the first node
2545:            * with a usable ZONE_NORMAL or lower zone. The expectation is that
2546:            * GFP_KERNEL will be required for allocating network buffers when
2547:            * swapping over the network so ZONE_HIGHMEM is unusable.
2548:            *
         * Throttling is based on the first usable node and throttled processes
         * wait on a queue until kswapd makes progress and wakes them. There
         * is then an affinity between processes waking up and the node where
         * reclaim progress has been made, assuming the process wakes on the
         * same node. More importantly, processes running on remote nodes will
         * not compete for remote pfmemalloc reserves, and processes on
         * different nodes should make reasonable progress.
2556:            */
2557:           for_each_zone_zonelist_nodemask(zone, z, zonelist,
2558:                                           gfp_mask, nodemask) {
2559:                   if (zone_idx(zone) > ZONE_NORMAL)
2560:                           continue;
2561:   
2562:                   /* Throttle based on the first usable node */
2563:                   pgdat = zone->zone_pgdat;
2564:                   if (pfmemalloc_watermark_ok(pgdat))
2565:                           goto out;
2566:                   break;
2567:           }
2568:   
2569:           /* If no zone was usable by the allocation flags then do not throttle */
2570:           if (!pgdat)
2571:                   goto out;
2572:   
2573:           /* Account for the throttling */
2574:           count_vm_event(PGSCAN_DIRECT_THROTTLE);
2575:   
2576:           /*
2577:            * If the caller cannot enter the filesystem, it's possible that it
2578:            * is due to the caller holding an FS lock or performing a journal
2579:            * transaction in the case of a filesystem like ext[3|4]. In this case,
2580:            * it is not safe to block on pfmemalloc_wait as kswapd could be
2581:            * blocked waiting on the same lock. Instead, throttle for up to a
2582:            * second before continuing.
2583:            */
2584:           if (!(gfp_mask & __GFP_FS)) {
2585:                   wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2586:                           pfmemalloc_watermark_ok(pgdat), HZ);
2587:   
2588:                   goto check_pending;
2589:           }
2590:   
2591:           /* Throttle until kswapd wakes the process */
2592:           wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2593:                   pfmemalloc_watermark_ok(pgdat));
2594:   
2595:   check_pending:
2596:           if (fatal_signal_pending(current))
2597:                   return true;
2598:   
2599:   out:
2600:           return false;
2601:   }
2602:   
2603:   unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2604:                                   gfp_t gfp_mask, nodemask_t *nodemask)
2605:   {
2606:           unsigned long nr_reclaimed;
2607:           struct scan_control sc = {
2608:                   .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
2609:                   .may_writepage = !laptop_mode,
2610:                   .nr_to_reclaim = SWAP_CLUSTER_MAX,
2611:                   .may_unmap = 1,
2612:                   .may_swap = 1,
2613:                   .order = order,
2614:                   .priority = DEF_PRIORITY,
2615:                   .target_mem_cgroup = NULL,
2616:                   .nodemask = nodemask,
2617:           };
2618:           struct shrink_control shrink = {
2619:                   .gfp_mask = sc.gfp_mask,
2620:           };
2621:   
2622:           /*
2623:            * Do not enter reclaim if fatal signal was delivered while throttled.
2624:            * 1 is returned so that the page allocator does not OOM kill at this
2625:            * point.
2626:            */
2627:           if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
2628:                   return 1;
2629:   
2630:           trace_mm_vmscan_direct_reclaim_begin(order,
2631:                                   sc.may_writepage,
2632:                                   gfp_mask);
2633:   
2634:           nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2635:   
2636:           trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
2637:   
2638:           return nr_reclaimed;
2639:   }
2640:   
2641:   #ifdef CONFIG_MEMCG
2642:   
2643:   unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2644:                                                   gfp_t gfp_mask, bool noswap,
2645:                                                   struct zone *zone,
2646:                                                   unsigned long *nr_scanned)
2647:   {
2648:           struct scan_control sc = {
2649:                   .nr_scanned = 0,
2650:                   .nr_to_reclaim = SWAP_CLUSTER_MAX,
2651:                   .may_writepage = !laptop_mode,
2652:                   .may_unmap = 1,
2653:                   .may_swap = !noswap,
2654:                   .order = 0,
2655:                   .priority = 0,
2656:                   .target_mem_cgroup = memcg,
2657:           };
2658:           struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2659:   
2660:           sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2661:                           (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
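        /*
         * This keeps only the reclaim-behaviour bits of the caller's mask
         * (GFP_RECLAIM_MASK) and takes the remaining placement bits, e.g.
         * highmem/movable, from GFP_HIGHUSER_MOVABLE.
         */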
2662:   
2663:           trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
2664:                                                         sc.may_writepage,
2665:                                                         sc.gfp_mask);
2666:   
2667:           /*
2668:            * NOTE: Although we can get the priority field, using it
2669:            * here is not a good idea, since it limits the pages we can scan.
2670:            * if we don't reclaim here, the shrink_zone from balance_pgdat
2671:            * will pick up pages from other mem cgroup's as well. We hack
2672:            * the priority and make it zero.
2673:            */
2674:           shrink_lruvec(lruvec, &sc);
2675:   
2676:           trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2677:   
2678:           *nr_scanned = sc.nr_scanned;
2679:           return sc.nr_reclaimed;
2680:   }
2681:   
2682:   unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2683:                                              gfp_t gfp_mask,
2684:                                              bool noswap)
2685:   {
2686:           struct zonelist *zonelist;
2687:           unsigned long nr_reclaimed;
2688:           int nid;
2689:           struct scan_control sc = {
2690:                   .may_writepage = !laptop_mode,
2691:                   .may_unmap = 1,
2692:                   .may_swap = !noswap,
2693:                   .nr_to_reclaim = SWAP_CLUSTER_MAX,
2694:                   .order = 0,
2695:                   .priority = DEF_PRIORITY,
2696:                   .target_mem_cgroup = memcg,
                .nodemask = NULL, /* we don't care about placement */
2698:                   .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2699:                                   (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2700:           };
2701:           struct shrink_control shrink = {
2702:                   .gfp_mask = sc.gfp_mask,
2703:           };
2704:   
2705:           /*
2706:            * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
2707:            * take care of from where we get pages. So the node where we start the
2708:            * scan does not need to be the current node.
2709:            */
2710:           nid = mem_cgroup_select_victim_node(memcg);
2711:   
2712:           zonelist = NODE_DATA(nid)->node_zonelists;
2713:   
2714:           trace_mm_vmscan_memcg_reclaim_begin(0,
2715:                                               sc.may_writepage,
2716:                                               sc.gfp_mask);
2717:   
2718:           nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2719:   
2720:           trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2721:   
2722:           return nr_reclaimed;
2723:   }
2724:   #endif
2725:   
2726:   static void age_active_anon(struct zone *zone, struct scan_control *sc)
2727:   {
2728:           struct mem_cgroup *memcg;
2729:   
2730:           if (!total_swap_pages)
2731:                   return;
2732:   
2733:           memcg = mem_cgroup_iter(NULL, NULL, NULL);
2734:           do {
2735:                   struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2736:   
2737:                   if (inactive_anon_is_low(lruvec))
2738:                           shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2739:                                              sc, LRU_ACTIVE_ANON);
2740:   
2741:                   memcg = mem_cgroup_iter(NULL, memcg, NULL);
2742:           } while (memcg);
2743:   }
2744:   
2745:   static bool zone_balanced(struct zone *zone, int order,
2746:                             unsigned long balance_gap, int classzone_idx)
2747:   {
2748:           if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
2749:                                       balance_gap, classzone_idx, 0))
2750:                   return false;
2751:   
2752:           if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2753:               !compaction_suitable(zone, order))
2754:                   return false;
2755:   
2756:           return true;
2757:   }
2758:   
2759:   /*
2760:    * pgdat_balanced() is used when checking if a node is balanced.
2761:    *
2762:    * For order-0, all zones must be balanced!
2763:    *
2764:    * For high-order allocations only zones that meet watermarks and are in a
 * zone allowed by the caller's classzone_idx are added to balanced_pages. The
2766:    * total of balanced pages must be at least 25% of the zones allowed by
2767:    * classzone_idx for the node to be considered balanced. Forcing all zones to
2768:    * be balanced for high orders can cause excessive reclaim when there are
2769:    * imbalanced zones.
2770:    * The choice of 25% is due to
2771:    *   o a 16M DMA zone that is balanced will not balance a zone on any
2772:    *     reasonable sized machine
2773:    *   o On all other machines, the top zone must be at least a reasonable
2774:    *     percentage of the middle zones. For example, on 32-bit x86, highmem
2775:    *     would need to be at least 256M for it to be balance a whole node.
2776:    *     Similarly, on x86-64 the Normal zone would need to be at least 1G
2777:    *     to balance a node on its own. These seemed like reasonable ratios.
2778:    */
2779:   static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2780:   {
2781:           unsigned long managed_pages = 0;
2782:           unsigned long balanced_pages = 0;
2783:           int i;
2784:   
2785:           /* Check the watermark levels */
2786:           for (i = 0; i <= classzone_idx; i++) {
2787:                   struct zone *zone = pgdat->node_zones + i;
2788:   
2789:                   if (!populated_zone(zone))
2790:                           continue;
2791:   
2792:                   managed_pages += zone->managed_pages;
2793:   
2794:                   /*
2795:                    * A special case here:
2796:                    *
2797:                    * balance_pgdat() skips over all_unreclaimable after
2798:                    * DEF_PRIORITY. Effectively, it considers them balanced so
2799:                    * they must be considered balanced here as well!
2800:                    */
2801:                   if (!zone_reclaimable(zone)) {
2802:                           balanced_pages += zone->managed_pages;
2803:                           continue;
2804:                   }
2805:   
2806:                   if (zone_balanced(zone, order, 0, i))
2807:                           balanced_pages += zone->managed_pages;
2808:                   else if (!order)
2809:                           return false;
2810:           }
2811:   
2812:           if (order)
2813:                   return balanced_pages >= (managed_pages >> 2);
2814:           else
2815:                   return true;
2816:   }
2817:   
2818:   /*
2819:    * Prepare kswapd for sleeping. This verifies that there are no processes
2820:    * waiting in throttle_direct_reclaim() and that watermarks have been met.
2821:    *
2822:    * Returns true if kswapd is ready to sleep
2823:    */
2824:   static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2825:                                           int classzone_idx)
2826:   {
2827:           /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2828:           if (remaining)
2829:                   return false;
2830:   
2831:           /*
         * There is a potential race between when kswapd checks its watermarks
         * and a process gets throttled. There is also a potential race if
         * processes get throttled, kswapd wakes, and a large process exits,
         * thereby balancing the zones, which causes kswapd to miss a wakeup.
         * If kswapd is going to sleep, no process should be sleeping on
         * pfmemalloc_wait, so wake them now if necessary. If necessary,
         * processes will wake kswapd and get throttled again.
2839:            */
2840:           if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
2841:                   wake_up(&pgdat->pfmemalloc_wait);
2842:                   return false;
2843:           }
2844:   
2845:           return pgdat_balanced(pgdat, order, classzone_idx);
2846:   }
2847:   
2848:   /*
2849:    * kswapd shrinks the zone by the number of pages required to reach
2850:    * the high watermark.
2851:    *
2852:    * Returns true if kswapd scanned at least the requested number of pages to
2853:    * reclaim or if the lack of progress was due to pages under writeback.
2854:    * This is used to determine if the scanning priority needs to be raised.
2855:    */
2856:   static bool kswapd_shrink_zone(struct zone *zone,
2857:                                  int classzone_idx,
2858:                                  struct scan_control *sc,
2859:                                  unsigned long lru_pages,
2860:                                  unsigned long *nr_attempted)
2861:   {
2862:           int testorder = sc->order;
2863:           unsigned long balance_gap;
2864:           struct reclaim_state *reclaim_state = current->reclaim_state;
2865:           struct shrink_control shrink = {
2866:                   .gfp_mask = sc->gfp_mask,
2867:           };
2868:           bool lowmem_pressure;
2869:   
2870:           /* Reclaim above the high watermark. */
2871:           sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
2872:   
2873:           /*
2874:            * Kswapd reclaims only single pages with compaction enabled. Trying
2875:            * too hard to reclaim until contiguous free pages have become
2876:            * available can hurt performance by evicting too much useful data
2877:            * from memory. Do not reclaim more than needed for compaction.
2878:            */
2879:           if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2880:                           compaction_suitable(zone, sc->order) !=
2881:                                   COMPACT_SKIPPED)
2882:                   testorder = 0;
2883:   
2884:           /*
2885:            * We put equal pressure on every zone, unless one zone has way too
2886:            * many pages free already. The "too many pages" is defined as the
2887:            * high wmark plus a "gap" where the gap is either the low
2888:            * watermark or 1% of the zone, whichever is smaller.
2889:            */
2890:           balance_gap = min(low_wmark_pages(zone),
2891:                   (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2892:                   KSWAPD_ZONE_BALANCE_GAP_RATIO);
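        /*
         * With KSWAPD_ZONE_BALANCE_GAP_RATIO at 100, this gap is roughly 1%
         * of the zone; a zone of about one million managed pages would get a
         * gap of about 10000 pages unless its low watermark is smaller.
         */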
2893:   
2894:           /*
2895:            * If there is no low memory pressure or the zone is balanced then no
2896:            * reclaim is necessary
2897:            */
2898:           lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
2899:           if (!lowmem_pressure && zone_balanced(zone, testorder,
2900:                                                   balance_gap, classzone_idx))
2901:                   return true;
2902:   
2903:           shrink_zone(zone, sc);
2904:           nodes_clear(shrink.nodes_to_scan);
2905:           node_set(zone_to_nid(zone), shrink.nodes_to_scan);
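        /*
         * Restricting nodes_to_scan to this zone's node means NUMA-aware
         * shrinkers only scan the slab caches that can relieve pressure on
         * this node.
         */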
2906:   
2907:           reclaim_state->reclaimed_slab = 0;
2908:           shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2909:           sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2910:   
2911:           /* Account for the number of pages attempted to reclaim */
2912:           *nr_attempted += sc->nr_to_reclaim;
2913:   
2914:           zone_clear_flag(zone, ZONE_WRITEBACK);
2915:   
2916:           /*
2917:            * If a zone reaches its high watermark, consider it to be no longer
2918:            * congested. It's possible there are dirty pages backed by congested
2919:            * BDIs but as pressure is relieved, speculatively avoid congestion
2920:            * waits.
2921:            */
2922:           if (zone_reclaimable(zone) &&
2923:               zone_balanced(zone, testorder, 0, classzone_idx)) {
2924:                   zone_clear_flag(zone, ZONE_CONGESTED);
2925:                   zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
2926:           }
2927:   
2928:           return sc->nr_scanned >= sc->nr_to_reclaim;
2929:   }
2930:   
2931:   /*
2932:    * For kswapd, balance_pgdat() will work across all this node's zones until
2933:    * they are all at high_wmark_pages(zone).
2934:    *
2935:    * Returns the final order kswapd was reclaiming at
2936:    *
2937:    * There is special handling here for zones which are full of pinned pages.
2938:    * This can happen if the pages are all mlocked, or if they are all used by
2939:    * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
2940:    * What we do is to detect the case where all pages in the zone have been
2941:    * scanned twice and there has been zero successful reclaim.  Mark the zone as
2942:    * dead and from now on, only perform a short scan.  Basically we're polling
2943:    * the zone for when the problem goes away.
2944:    *
2945:    * kswapd scans the zones in the highmem->normal->dma direction.  It skips
2946:    * zones which have free_pages > high_wmark_pages(zone), but once a zone is
2947:    * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
2948:    * lower zones regardless of the number of free pages in the lower zones. This
2949:    * interoperates with the page allocator fallback scheme to ensure that aging
2950:    * of pages is balanced across the zones.
2951:    */
2952:   static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2953:                                                           int *classzone_idx)
2954:   {
2955:           int i;
2956:           int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
2957:           unsigned long nr_soft_reclaimed;
2958:           unsigned long nr_soft_scanned;
2959:           struct scan_control sc = {
2960:                   .gfp_mask = GFP_KERNEL,
2961:                   .priority = DEF_PRIORITY,
2962:                   .may_unmap = 1,
2963:                   .may_swap = 1,
2964:                   .may_writepage = !laptop_mode,
2965:                   .order = order,
2966:                   .target_mem_cgroup = NULL,
2967:           };
2968:           count_vm_event(PAGEOUTRUN);
2969:   
2970:           do {
2971:                   unsigned long lru_pages = 0;
2972:                   unsigned long nr_attempted = 0;
2973:                   bool raise_priority = true;
2974:                   bool pgdat_needs_compaction = (order > 0);
2975:   
2976:                   sc.nr_reclaimed = 0;
2977:   
2978:                   /*
2979:                    * Scan in the highmem->dma direction for the highest
2980:                    * zone which needs scanning
2981:                    */
2982:                   for (i = pgdat->nr_zones - 1; i >= 0; i--) {
2983:                           struct zone *zone = pgdat->node_zones + i;
2984:   
2985:                           if (!populated_zone(zone))
2986:                                   continue;
2987:   
2988:                           if (sc.priority != DEF_PRIORITY &&
2989:                               !zone_reclaimable(zone))
2990:                                   continue;
2991:   
2992:                           /*
2993:                            * Do some background aging of the anon list, to give
2994:                            * pages a chance to be referenced before reclaiming.
2995:                            */
2996:                           age_active_anon(zone, &sc);
2997:   
2998:                           /*
2999:                            * If the number of buffer_heads in the machine
3000:                            * exceeds the maximum allowed level and this node
3001:                            * has a highmem zone, force kswapd to reclaim from
3002:                            * it to relieve lowmem pressure.
3003:                            */
3004:                           if (buffer_heads_over_limit && is_highmem_idx(i)) {
3005:                                   end_zone = i;
3006:                                   break;
3007:                           }
3008:   
3009:                           if (!zone_balanced(zone, order, 0, 0)) {
3010:                                   end_zone = i;
3011:                                   break;
3012:                           } else {
3013:                                   /*
3014:                                    * If balanced, clear the dirty and congested
3015:                                    * flags
3016:                                    */
3017:                                   zone_clear_flag(zone, ZONE_CONGESTED);
3018:                                   zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
3019:                           }
3020:                   }
3021:   
3022:                   if (i < 0)
3023:                           goto out;
3024:   
3025:                   for (i = 0; i <= end_zone; i++) {
3026:                           struct zone *zone = pgdat->node_zones + i;
3027:   
3028:                           if (!populated_zone(zone))
3029:                                   continue;
3030:   
3031:                           lru_pages += zone_reclaimable_pages(zone);
3032:   
3033:                           /*
3034:                            * If any zone is currently balanced then kswapd will
3035:                            * not call compaction as it is expected that the
3036:                            * necessary pages are already available.
3037:                            */
3038:                           if (pgdat_needs_compaction &&
3039:                                           zone_watermark_ok(zone, order,
3040:                                                   low_wmark_pages(zone),
3041:                                                   *classzone_idx, 0))
3042:                                   pgdat_needs_compaction = false;
3043:                   }
3044:   
3045:                   /*
3046:                    * If we're getting trouble reclaiming, start doing writepage
3047:                    * even in laptop mode.
3048:                    */
3049:                   if (sc.priority < DEF_PRIORITY - 2)
3050:                           sc.may_writepage = 1;
3051:   
3052:                   /*
3053:                    * Now scan the zone in the dma->highmem direction, stopping
3054:                    * at the last zone which needs scanning.
3055:                    *
3056:                    * We do this because the page allocator works in the opposite
3057:                    * direction.  This prevents the page allocator from allocating
3058:                    * pages behind kswapd's direction of progress, which would
3059:                    * cause too much scanning of the lower zones.
3060:                    */
3061:                   for (i = 0; i <= end_zone; i++) {
3062:                           struct zone *zone = pgdat->node_zones + i;
3063:   
3064:                           if (!populated_zone(zone))
3065:                                   continue;
3066:   
3067:                           if (sc.priority != DEF_PRIORITY &&
3068:                               !zone_reclaimable(zone))
3069:                                   continue;
3070:   
3071:                           sc.nr_scanned = 0;
3072:   
3073:                           nr_soft_scanned = 0;
3074:                           /*
3075:                            * Call soft limit reclaim before calling shrink_zone.
3076:                            */
3077:                           nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
3078:                                                           order, sc.gfp_mask,
3079:                                                           &nr_soft_scanned);
3080:                           sc.nr_reclaimed += nr_soft_reclaimed;
3081:   
3082:                           /*
3083:                            * There should be no need to raise the scanning
3084:                            * priority if enough pages are already being scanned
                         * that the high watermark would be met at 100%
3086:                            * efficiency.
3087:                            */
3088:                           if (kswapd_shrink_zone(zone, end_zone, &sc,
3089:                                           lru_pages, &nr_attempted))
3090:                                   raise_priority = false;
3091:                   }
3092:   
3093:                   /*
3094:                    * If the low watermark is met there is no need for processes
3095:                    * to be throttled on pfmemalloc_wait as they should not be
3096:                    * able to safely make forward progress. Wake them
3097:                    */
3098:                   if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3099:                                   pfmemalloc_watermark_ok(pgdat))
3100:                           wake_up(&pgdat->pfmemalloc_wait);
3101:   
3102:                   /*
3103:                    * Fragmentation may mean that the system cannot be rebalanced
3104:                    * for high-order allocations in all zones. If twice the
3105:                    * allocation size has been reclaimed and the zones are still
3106:                    * not balanced then recheck the watermarks at order-0 to
3107:                    * prevent kswapd reclaiming excessively. Assume that a
3108:                    * process requested a high-order can direct reclaim/compact.
3109:                    */
3110:                   if (order && sc.nr_reclaimed >= 2UL << order)
3111:                           order = sc.order = 0;
3112:   
3113:                   /* Check if kswapd should be suspending */
3114:                   if (try_to_freeze() || kthread_should_stop())
3115:                           break;
3116:   
3117:                   /*
3118:                    * Compact if necessary and kswapd is reclaiming at least the
                 * high watermark number of pages as requested
3120:                    */
3121:                   if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
3122:                           compact_pgdat(pgdat, order);
3123:   
3124:                   /*
3125:                    * Raise priority if scanning rate is too low or there was no
3126:                    * progress in reclaiming pages
3127:                    */
3128:                   if (raise_priority || !sc.nr_reclaimed)
3129:                           sc.priority--;
3130:           } while (sc.priority >= 1 &&
3131:                    !pgdat_balanced(pgdat, order, *classzone_idx));
3132:   
3133:   out:
3134:           /*
3135:            * Return the order we were reclaiming at so prepare_kswapd_sleep()
3136:            * makes a decision on the order we were last reclaiming at. However,
3137:            * if another caller entered the allocator slow path while kswapd
3138:            * was awake, order will remain at the higher level
3139:            */
3140:           *classzone_idx = end_zone;
3141:           return order;
3142:   }
3143:   
3144:   static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3145:   {
3146:           long remaining = 0;
3147:           DEFINE_WAIT(wait);
3148:   
3149:           if (freezing(current) || kthread_should_stop())
3150:                   return;
3151:   
3152:           prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3153:   
3154:           /* Try to sleep for a short interval */
3155:           if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
3156:                   remaining = schedule_timeout(HZ/10);
3157:                   finish_wait(&pgdat->kswapd_wait, &wait);
3158:                   prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3159:           }
3160:   
3161:           /*
3162:            * After a short sleep, check if it was a premature sleep. If not, then
3163:            * go fully to sleep until explicitly woken up.
3164:            */
3165:           if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
3166:                   trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3167:   
3168:                   /*
3169:                    * vmstat counters are not perfectly accurate and the estimated
3170:                    * value for counters such as NR_FREE_PAGES can deviate from the
3171:                    * true value by nr_online_cpus * threshold. To avoid the zone
3172:                    * watermarks being breached while under pressure, we reduce the
3173:                    * per-cpu vmstat threshold while kswapd is awake and restore
3174:                    * them before going back to sleep.
3175:                    */
3176:                   set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3177:   
3178:                   /*
3179:                    * Compaction records what page blocks it recently failed to
3180:                    * isolate pages from and skips them in the future scanning.
3181:                    * When kswapd is going to sleep, it is reasonable to assume
                 * that those pageblocks may now yield pages and compaction
                 * may succeed, so reset the cache.
3183:                    */
3184:                   reset_isolation_suitable(pgdat);
3185:   
3186:                   if (!kthread_should_stop())
3187:                           schedule();
3188:   
3189:                   set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3190:           } else {
3191:                   if (remaining)
3192:                           count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
3193:                   else
3194:                           count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
3195:           }
3196:           finish_wait(&pgdat->kswapd_wait, &wait);
3197:   }
3198:   
3199:   /*
3200:    * The background pageout daemon, started as a kernel thread
3201:    * from the init process.
3202:    *
3203:    * This basically trickles out pages so that we have _some_
3204:    * free memory available even if there is no other activity
3205:    * that frees anything up. This is needed for things like routing
3206:    * etc, where we otherwise might have all activity going on in
3207:    * asynchronous contexts that cannot page things out.
3208:    *
3209:    * If there are applications that are active memory-allocators
3210:    * (most normal use), this basically shouldn't matter.
3211:    */
3212:   static int kswapd(void *p)
3213:   {
3214:           unsigned long order, new_order;
3215:           unsigned balanced_order;
3216:           int classzone_idx, new_classzone_idx;
3217:           int balanced_classzone_idx;
3218:           pg_data_t *pgdat = (pg_data_t*)p;
3219:           struct task_struct *tsk = current;
3220:   
3221:           struct reclaim_state reclaim_state = {
3222:                   .reclaimed_slab = 0,
3223:           };
3224:           const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3225:   
3226:           lockdep_set_current_reclaim_state(GFP_KERNEL);
3227:   
3228:           if (!cpumask_empty(cpumask))
3229:                   set_cpus_allowed_ptr(tsk, cpumask);
3230:           current->reclaim_state = &reclaim_state;
3231:   
3232:           /*
3233:            * Tell the memory management that we're a "memory allocator",
3234:            * and that if we need more memory we should get access to it
3235:            * regardless (see "__alloc_pages()"). "kswapd" should
3236:            * never get caught in the normal page freeing logic.
3237:            *
3238:            * (Kswapd normally doesn't need memory anyway, but sometimes
3239:            * you need a small amount of memory in order to be able to
3240:            * page out something else, and this flag essentially protects
3241:            * us from recursively trying to free more memory as we're
3242:            * trying to free the first piece of memory in the first place).
3243:            */
3244:           tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3245:           set_freezable();
3246:   
3247:           order = new_order = 0;
3248:           balanced_order = 0;
3249:           classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
3250:           balanced_classzone_idx = classzone_idx;
3251:           for ( ; ; ) {
3252:                   bool ret;
3253:   
3254:                   /*
3255:                    * If the last balance_pgdat was unsuccessful it's unlikely a
3256:                    * new request of a similar or harder type will succeed soon
3257:                    * so consider going to sleep on the basis we reclaimed at
3258:                    */
3259:                   if (balanced_classzone_idx >= new_classzone_idx &&
3260:                                           balanced_order == new_order) {
3261:                           new_order = pgdat->kswapd_max_order;
3262:                           new_classzone_idx = pgdat->classzone_idx;
3263:                           pgdat->kswapd_max_order =  0;
3264:                           pgdat->classzone_idx = pgdat->nr_zones - 1;
3265:                   }
3266:   
3267:                   if (order < new_order || classzone_idx > new_classzone_idx) {
3268:                           /*
3269:                            * Don't sleep if someone wants a larger 'order'
                         * allocation or has tighter zone constraints
3271:                            */
3272:                           order = new_order;
3273:                           classzone_idx = new_classzone_idx;
3274:                   } else {
3275:                           kswapd_try_to_sleep(pgdat, balanced_order,
3276:                                                   balanced_classzone_idx);
3277:                           order = pgdat->kswapd_max_order;
3278:                           classzone_idx = pgdat->classzone_idx;
3279:                           new_order = order;
3280:                           new_classzone_idx = classzone_idx;
3281:                           pgdat->kswapd_max_order = 0;
3282:                           pgdat->classzone_idx = pgdat->nr_zones - 1;
3283:                   }
3284:   
3285:                   ret = try_to_freeze();
3286:                   if (kthread_should_stop())
3287:                           break;
3288:   
3289:                   /*
3290:                    * We can speed up thawing tasks if we don't call balance_pgdat
3291:                    * after returning from the refrigerator
3292:                    */
3293:                   if (!ret) {
3294:                           trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
3295:                           balanced_classzone_idx = classzone_idx;
3296:                           balanced_order = balance_pgdat(pgdat, order,
3297:                                                   &balanced_classzone_idx);
3298:                   }
3299:           }
3300:   
3301:           tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
3302:           current->reclaim_state = NULL;
3303:           lockdep_clear_current_reclaim_state();
3304:   
3305:           return 0;
3306:   }
3307:   
3308:   /*
3309:    * A zone is low on free memory, so wake its kswapd task to service it.
3310:    */
3311:   void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3312:   {
3313:           pg_data_t *pgdat;
3314:   
3315:           if (!populated_zone(zone))
3316:                   return;
3317:   
3318:           if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
3319:                   return;
3320:           pgdat = zone->zone_pgdat;
3321:           if (pgdat->kswapd_max_order < order) {
3322:                   pgdat->kswapd_max_order = order;
3323:                   pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
3324:           }
3325:           if (!waitqueue_active(&pgdat->kswapd_wait))
3326:                   return;
3327:           if (zone_balanced(zone, order, 0, 0))
3328:                   return;
3329:   
3330:           trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
3331:           wake_up_interruptible(&pgdat->kswapd_wait);
3332:   }
3333:   
3334:   #ifdef CONFIG_HIBERNATION
3335:   /*
 * Try to free `nr_to_reclaim' pages of memory, system-wide, and return the
 * number of pages actually freed.
3338:    *
3339:    * Rather than trying to age LRUs the aim is to preserve the overall
3340:    * LRU order by reclaiming preferentially
3341:    * inactive > active > active referenced > active mapped
3342:    */
3343:   unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3344:   {
3345:           struct reclaim_state reclaim_state;
3346:           struct scan_control sc = {
3347:                   .gfp_mask = GFP_HIGHUSER_MOVABLE,
3348:                   .may_swap = 1,
3349:                   .may_unmap = 1,
3350:                   .may_writepage = 1,
3351:                   .nr_to_reclaim = nr_to_reclaim,
3352:                   .hibernation_mode = 1,
3353:                   .order = 0,
3354:                   .priority = DEF_PRIORITY,
3355:           };
3356:           struct shrink_control shrink = {
3357:                   .gfp_mask = sc.gfp_mask,
3358:           };
3359:           struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3360:           struct task_struct *p = current;
3361:           unsigned long nr_reclaimed;
3362:   
3363:           p->flags |= PF_MEMALLOC;
3364:           lockdep_set_current_reclaim_state(sc.gfp_mask);
3365:           reclaim_state.reclaimed_slab = 0;
3366:           p->reclaim_state = &reclaim_state;
3367:   
3368:           nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
3369:   
3370:           p->reclaim_state = NULL;
3371:           lockdep_clear_current_reclaim_state();
3372:           p->flags &= ~PF_MEMALLOC;
3373:   
3374:           return nr_reclaimed;
3375:   }
3376:   #endif /* CONFIG_HIBERNATION */
3377:   
3378:   /* It's optimal to keep kswapds on the same CPUs as their memory, but
3379:      not required for correctness.  So if the last cpu in a node goes
3380:      away, we get changed to run anywhere: as the first one comes back,
3381:      restore their cpu bindings. */
3382:   static int cpu_callback(struct notifier_block *nfb, unsigned long action,
3383:                           void *hcpu)
3384:   {
3385:           int nid;
3386:   
3387:           if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
3388:                   for_each_node_state(nid, N_MEMORY) {
3389:                           pg_data_t *pgdat = NODE_DATA(nid);
3390:                           const struct cpumask *mask;
3391:   
3392:                           mask = cpumask_of_node(pgdat->node_id);
3393:   
3394:                           if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
3395:                                   /* One of our CPUs online: restore mask */
3396:                                   set_cpus_allowed_ptr(pgdat->kswapd, mask);
3397:                   }
3398:           }
3399:           return NOTIFY_OK;
3400:   }
3401:   
3402:   /*
3403:    * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to the proper cpus if cpus are hot-added.
3405:    */
3406:   int kswapd_run(int nid)
3407:   {
3408:           pg_data_t *pgdat = NODE_DATA(nid);
3409:           int ret = 0;
3410:   
3411:           if (pgdat->kswapd)
3412:                   return 0;
3413:   
3414:           pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
3415:           if (IS_ERR(pgdat->kswapd)) {
3416:                   /* failure at boot is fatal */
3417:                   BUG_ON(system_state == SYSTEM_BOOTING);
3418:                   pr_err("Failed to start kswapd on node %d\n", nid);
3419:                   ret = PTR_ERR(pgdat->kswapd);
3420:                   pgdat->kswapd = NULL;
3421:           }
3422:           return ret;
3423:   }
3424:   
3425:   /*
3426:    * Called by memory hotplug when all memory in a node is offlined.  Caller must
3427:    * hold lock_memory_hotplug().
3428:    */
3429:   void kswapd_stop(int nid)
3430:   {
3431:           struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3432:   
3433:           if (kswapd) {
3434:                   kthread_stop(kswapd);
3435:                   NODE_DATA(nid)->kswapd = NULL;
3436:           }
3437:   }
3438:   
3439:   static int __init kswapd_init(void)
3440:   {
3441:           int nid;
3442:   
3443:           swap_setup();
3444:           for_each_node_state(nid, N_MEMORY)
3445:                   kswapd_run(nid);
3446:           hotcpu_notifier(cpu_callback, 0);
3447:           return 0;
3448:   }
3449:   
3450:   module_init(kswapd_init)
3451:   
3452:   #ifdef CONFIG_NUMA
3453:   /*
3454:    * Zone reclaim mode
3455:    *
3456:    * If non-zero call zone_reclaim when the number of free pages falls below
3457:    * the watermarks.
3458:    */
3459:   int zone_reclaim_mode __read_mostly;
3460:   
3461:   #define RECLAIM_OFF 0
3462:   #define RECLAIM_ZONE (1<<0)     /* Run shrink_inactive_list on the zone */
3463:   #define RECLAIM_WRITE (1<<1)    /* Writeout pages during reclaim */
3464:   #define RECLAIM_SWAP (1<<2)     /* Swap pages out during reclaim */
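/*
 * For example, writing 3 (RECLAIM_ZONE | RECLAIM_WRITE) to
 * /proc/sys/vm/zone_reclaim_mode enables zone reclaim and lets it write out
 * dirty page cache pages, while leaving RECLAIM_SWAP (unmapping/swapping of
 * mapped pages) disabled.
 */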
3465:   
3466:   /*
3467:    * Priority for ZONE_RECLAIM. This determines the fraction of pages
3468:    * of a node considered for each zone_reclaim. 4 scans 1/16th of
3469:    * a zone.
3470:    */
3471:   #define ZONE_RECLAIM_PRIORITY 4
3472:   
3473:   /*
3474:    * Percentage of pages in a zone that must be unmapped for zone_reclaim to
3475:    * occur.
3476:    */
3477:   int sysctl_min_unmapped_ratio = 1;
3478:   
3479:   /*
3480:    * If the number of slab pages in a zone grows beyond this percentage then
3481:    * slab reclaim needs to occur.
3482:    */
3483:   int sysctl_min_slab_ratio = 5;
3484:   
3485:   static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
3486:   {
3487:           unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
3488:           unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
3489:                   zone_page_state(zone, NR_ACTIVE_FILE);
3490:   
3491:           /*
3492:            * It's possible for there to be more file mapped pages than
3493:            * accounted for by the pages on the file LRU lists because
3494:            * tmpfs pages accounted for as ANON can also be FILE_MAPPED
3495:            */
3496:           return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
3497:   }
3498:   
3499:   /* Work out how many page cache pages we can reclaim in this reclaim_mode */
3500:   static long zone_pagecache_reclaimable(struct zone *zone)
3501:   {
3502:           long nr_pagecache_reclaimable;
3503:           long delta = 0;
3504:   
3505:           /*
3506:            * If RECLAIM_SWAP is set, then all file pages are considered
3507:            * potentially reclaimable. Otherwise, we have to worry about
3508:            * pages like swapcache and zone_unmapped_file_pages() provides
3509:            * a better estimate
3510:            */
3511:           if (zone_reclaim_mode & RECLAIM_SWAP)
3512:                   nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
3513:           else
3514:                   nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
3515:   
3516:           /* If we can't clean pages, remove dirty pages from consideration */
3517:           if (!(zone_reclaim_mode & RECLAIM_WRITE))
3518:                   delta += zone_page_state(zone, NR_FILE_DIRTY);
3519:   
3520:           /* Watch for any possible underflows due to delta */
3521:           if (unlikely(delta > nr_pagecache_reclaimable))
3522:                   delta = nr_pagecache_reclaimable;
3523:   
3524:           return nr_pagecache_reclaimable - delta;
3525:   }
3526:   
3527:   /*
3528:    * Try to free up some pages from this zone through reclaim.
3529:    */
3530:   static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3531:   {
3532:           /* Minimum pages needed in order to stay on node */
3533:           const unsigned long nr_pages = 1 << order;
3534:           struct task_struct *p = current;
3535:           struct reclaim_state reclaim_state;
3536:           struct scan_control sc = {
3537:                   .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3538:                   .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3539:                   .may_swap = 1,
3540:                   .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3541:                   .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
3542:                   .order = order,
3543:                   .priority = ZONE_RECLAIM_PRIORITY,
3544:           };
3545:           struct shrink_control shrink = {
3546:                   .gfp_mask = sc.gfp_mask,
3547:           };
3548:           unsigned long nr_slab_pages0, nr_slab_pages1;
3549:   
3550:           cond_resched();
3551:           /*
3552:            * We need to be able to allocate from the reserves for RECLAIM_SWAP
3553:            * and we also need to be able to write out pages for RECLAIM_WRITE
3554:            * and RECLAIM_SWAP.
3555:            */
3556:           p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
3557:           lockdep_set_current_reclaim_state(gfp_mask);
3558:           reclaim_state.reclaimed_slab = 0;
3559:           p->reclaim_state = &reclaim_state;
3560:   
3561:           if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
3562:                   /*
3563:                    * Free memory by calling shrink zone with increasing
3564:                    * priorities until we have enough memory freed.
3565:                    */
3566:                   do {
3567:                           shrink_zone(zone, &sc);
3568:                   } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
3569:           }
3570:   
3571:           nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3572:           if (nr_slab_pages0 > zone->min_slab_pages) {
3573:                   /*
3574:                    * shrink_slab() does not currently allow us to determine how
3575:                    * many pages were freed in this zone. So we take the current
3576:                    * number of slab pages and shake the slab until it is reduced
3577:                    * by the same nr_pages that we used for reclaiming unmapped
3578:                    * pages.
3579:                    */
3580:                   nodes_clear(shrink.nodes_to_scan);
3581:                   node_set(zone_to_nid(zone), shrink.nodes_to_scan);
3582:                   for (;;) {
3583:                           unsigned long lru_pages = zone_reclaimable_pages(zone);
3584:   
3585:                           /* No reclaimable slab or very low memory pressure */
3586:                           if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
3587:                                   break;
3588:   
3589:                           /* Freed enough memory */
3590:                           nr_slab_pages1 = zone_page_state(zone,
3591:                                                           NR_SLAB_RECLAIMABLE);
3592:                           if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
3593:                                   break;
3594:                   }
3595:   
3596:                   /*
3597:                    * Update nr_reclaimed by the number of slab pages we
3598:                    * reclaimed from this zone.
3599:                    */
3600:                   nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3601:                   if (nr_slab_pages1 < nr_slab_pages0)
3602:                           sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
3603:           }
3604:   
3605:           p->reclaim_state = NULL;
3606:           current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
3607:           lockdep_clear_current_reclaim_state();
3608:           return sc.nr_reclaimed >= nr_pages;
3609:   }
3610:   
3611:   int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3612:   {
3613:           int node_id;
3614:           int ret;
3615:   
3616:           /*
3617:            * Zone reclaim reclaims unmapped file backed pages and
3618:            * slab pages if we are over the defined limits.
3619:            *
3620:            * A small portion of unmapped file backed pages is needed for
3621:            * file I/O otherwise pages read by file I/O will be immediately
3622:            * thrown out if the zone is overallocated. So we do not reclaim
3623:            * if less than a specified percentage of the zone is used by
3624:            * unmapped file backed pages.
3625:            */
3626:           if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
3627:               zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
3628:                   return ZONE_RECLAIM_FULL;
3629:   
3630:           if (!zone_reclaimable(zone))
3631:                   return ZONE_RECLAIM_FULL;
3632:   
3633:           /*
3634:            * Do not scan if the allocation should not be delayed.
3635:            */
3636:           if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
3637:                   return ZONE_RECLAIM_NOSCAN;
3638:   
3639:           /*
3640:            * Only run zone reclaim on the local zone or on zones that do not
3641:            * have associated processors. This will favor the local processor
3642:            * over remote processors and spread off node memory allocations
3643:            * as wide as possible.
3644:            */
3645:           node_id = zone_to_nid(zone);
3646:           if (node_state(node_id, N_CPU) && node_id != numa_node_id())
3647:                   return ZONE_RECLAIM_NOSCAN;
3648:   
3649:           if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
3650:                   return ZONE_RECLAIM_NOSCAN;
3651:   
3652:           ret = __zone_reclaim(zone, gfp_mask, order);
3653:           zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
3654:   
3655:           if (!ret)
3656:                   count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
3657:   
3658:           return ret;
3659:   }
3660:   #endif
3661:   
3662:   /*
3663:    * page_evictable - test whether a page is evictable
3664:    * @page: the page to test
3665:    *
3666:    * Test whether page is evictable--i.e., should be placed on active/inactive
3667:    * lists vs unevictable list.
3668:    *
3669:    * Reasons page might not be evictable:
3670:    * (1) page's mapping marked unevictable
3671:    * (2) page is part of an mlocked VMA
3672:    *
3673:    */
3674:   int page_evictable(struct page *page)
3675:   {
3676:           return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
3677:   }
3678:   
3679:   #ifdef CONFIG_SHMEM
3680:   /**
3681:    * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
3682:    * @pages:      array of pages to check
3683:    * @nr_pages:   number of pages to check
3684:    *
 * Checks pages for evictability and moves them to the appropriate lru list.
 *
 * This function is only used for SysV IPC SHM_UNLOCK (a usage sketch
 * follows below).
 */
3689:   void check_move_unevictable_pages(struct page **pages, int nr_pages)
3690:   {
3691:           struct lruvec *lruvec;
3692:           struct zone *zone = NULL;
3693:           int pgscanned = 0;
3694:           int pgrescued = 0;
3695:           int i;
3696:   
3697:           for (i = 0; i < nr_pages; i++) {
3698:                   struct page *page = pages[i];
3699:                   struct zone *pagezone;
3700:   
3701:                   pgscanned++;
3702:                   pagezone = page_zone(page);
3703:                   if (pagezone != zone) {
3704:                           if (zone)
3705:                                   spin_unlock_irq(&zone->lru_lock);
3706:                           zone = pagezone;
3707:                           spin_lock_irq(&zone->lru_lock);
3708:                   }
3709:                   lruvec = mem_cgroup_page_lruvec(page, zone);
3710:   
3711:                   if (!PageLRU(page) || !PageUnevictable(page))
3712:                           continue;
3713:   
3714:                   if (page_evictable(page)) {
3715:                           enum lru_list lru = page_lru_base_type(page);
3716:   
3717:                           VM_BUG_ON(PageActive(page));
3718:                           ClearPageUnevictable(page);
3719:                           del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
3720:                           add_page_to_lru_list(page, lruvec, lru);
3721:                           pgrescued++;
3722:                   }
3723:           }
3724:   
3725:           if (zone) {
3726:                   __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
3727:                   __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
3728:                   spin_unlock_irq(&zone->lru_lock);
3729:           }
3730:   }
3731:   #endif /* CONFIG_SHMEM */
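
/*
 * Illustrative sketch of the SHM_UNLOCK caller mentioned above: the real
 * user is shmem_unlock_mapping() in mm/shmem.c, which walks the mapping
 * in batches and hands each batch to check_move_unevictable_pages().
 * A simplified caller (ignoring swap entries, index-handling details and
 * cond_resched()) might look like:
 *
 *	struct pagevec pvec;
 *	pgoff_t index = 0;
 *
 *	pagevec_init(&pvec, 0);
 *	while (pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
 *		index = pvec.pages[pvec.nr - 1]->index + 1;
 *		check_move_unevictable_pages(pvec.pages, pvec.nr);
 *		pagevec_release(&pvec);
 *	}
 */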
3732:   
3733:   static void warn_scan_unevictable_pages(void)
3734:   {
3735:           printk_once(KERN_WARNING
3736:                       "%s: The scan_unevictable_pages sysctl/node-interface has been "
3737:                       "disabled for lack of a legitimate use case.  If you have "
3738:                       "one, please send an email to linux-mm@kvack.org.\n",
3739:                       current->comm);
3740:   }
3741:   
3742:   /*
3743:    * scan_unevictable_pages [vm] sysctl handler.  On demand re-scan of
3744:    * all nodes' unevictable lists for evictable pages
3745:    */
3746:   unsigned long scan_unevictable_pages;
3747:   
3748:   int scan_unevictable_handler(struct ctl_table *table, int write,
3749:                              void __user *buffer,
3750:                              size_t *length, loff_t *ppos)
3751:   {
3752:           warn_scan_unevictable_pages();
3753:           proc_doulongvec_minmax(table, write, buffer, length, ppos);
3754:           scan_unevictable_pages = 0;
3755:           return 0;
3756:   }
3757:   
3758:   #ifdef CONFIG_NUMA
3759:   /*
3760:    * per node 'scan_unevictable_pages' attribute.  On demand re-scan of
3761:    * a specified node's per zone unevictable lists for evictable pages.
3762:    */
3763:   
3764:   static ssize_t read_scan_unevictable_node(struct device *dev,
3765:                                             struct device_attribute *attr,
3766:                                             char *buf)
3767:   {
3768:           warn_scan_unevictable_pages();
3769:           return sprintf(buf, "0\n");     /* always zero; should fit... */
3770:   }
3771:   
static ssize_t write_scan_unevictable_node(struct device *dev,
                                           struct device_attribute *attr,
                                           const char *buf, size_t count)
3775:   {
3776:           warn_scan_unevictable_pages();
3777:           return 1;
3778:   }
3779:   
3781:   static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
3782:                           read_scan_unevictable_node,
3783:                           write_scan_unevictable_node);
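
/*
 * For reference, the DEVICE_ATTR() above expands to roughly:
 *
 *	struct device_attribute dev_attr_scan_unevictable_pages =
 *		__ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
 *		       read_scan_unevictable_node,
 *		       write_scan_unevictable_node);
 *
 * so once the register/unregister helpers below attach it to a node
 * device, it appears as
 * /sys/devices/system/node/nodeN/scan_unevictable_pages.
 */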
3784:   
3785:   int scan_unevictable_register_node(struct node *node)
3786:   {
3787:           return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
3788:   }
3789:   
3790:   void scan_unevictable_unregister_node(struct node *node)
3791:   {
3792:           device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
3793:   }
3794:   #endif