/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for try_to_release_page(),
					buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

struct scan_control {
	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	unsigned long hibernation_mode;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	int may_writepage;

	/* Can mapped pages be reclaimed? */
	int may_unmap;

	/* Can pages be swapped as part of reclaim? */
	int may_swap;

	int order;

	/* Scan (total_size >> priority) pages at once */
	int priority;

	/*
	 * The memory cgroup that hit its limit and as a result is the
	 * primary target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t *nodemask;
};
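/*
 * A minimal scan_control setup, as an illustrative sketch: this mirrors
 * how reclaim_clean_pages_from_list() further down in this file builds
 * one; nothing here is an additional API.
 *
 *	struct scan_control sc = {
 *		.gfp_mask = GFP_KERNEL,
 *		.priority = DEF_PRIORITY,
 *		.may_unmap = 1,
 *	};
 */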

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
unsigned long vm_total_pages;	/* The total number of pages which the VM controls */

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
	return !sc->target_mem_cgroup;
}
#else
static bool global_reclaim(struct scan_control *sc)
{
	return true;
}
#endif

static unsigned long zone_reclaimable_pages(struct zone *zone)
{
	int nr;

	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
	     zone_page_state(zone, NR_INACTIVE_FILE);

	if (get_nr_swap_pages() > 0)
		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
		      zone_page_state(zone, NR_INACTIVE_ANON);

	return nr;
}

bool zone_reclaimable(struct zone *zone)
{
	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
}
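/*
 * Worked example of the heuristic above: a zone with 100000 reclaimable
 * pages keeps reporting reclaimable until pages_scanned reaches 600000,
 * i.e. six full passes over its reclaimable set without the counter
 * being reset by a successful free; past that, further scanning is
 * assumed to be fruitless.
 */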

static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
	if (!mem_cgroup_disabled())
		return mem_cgroup_get_lru_size(lruvec, lru);

	return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
}

/*
 * Add a shrinker callback to be called from the vm.
 */
int register_shrinker(struct shrinker *shrinker)
{
	size_t size = sizeof(*shrinker->nr_deferred);

	/*
	 * If we only have one possible node in the system anyway, save
	 * ourselves the trouble and disable NUMA aware behavior. This way we
	 * will save memory and some small loop time later.
	 */
	if (nr_node_ids == 1)
		shrinker->flags &= ~SHRINKER_NUMA_AWARE;

	if (shrinker->flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		return -ENOMEM;

	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	up_write(&shrinker_rwsem);
	return 0;
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove one shrinker.
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
	kfree(shrinker->nr_deferred);
}
EXPORT_SYMBOL(unregister_shrinker);
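/*
 * The shrinker contract, as a sketch with hypothetical names (the
 * my_cache_* helpers are not real kernel objects): ->count_objects()
 * reports how many entries could be freed, ->scan_objects() frees up to
 * sc->nr_to_scan of them and returns the number freed, or SHRINK_STOP
 * if it cannot make progress (e.g. a trylock failed).
 *
 *	static unsigned long my_cache_count(struct shrinker *s,
 *					    struct shrink_control *sc)
 *	{
 *		return my_cache_nr_freeable();		// hypothetical helper
 *	}
 *
 *	static unsigned long my_cache_scan(struct shrinker *s,
 *					   struct shrink_control *sc)
 *	{
 *		if (!my_cache_trylock())		// hypothetical helper
 *			return SHRINK_STOP;
 *		return my_cache_free(sc->nr_to_scan);	// returns nr freed
 *	}
 *
 *	static struct shrinker my_cache_shrinker = {
 *		.count_objects	= my_cache_count,
 *		.scan_objects	= my_cache_scan,
 *		.seeks		= DEFAULT_SEEKS,
 *	};
 *
 *	register_shrinker(&my_cache_shrinker);
 */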

#define SHRINK_BATCH 128

static unsigned long
shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
		 unsigned long nr_pages_scanned, unsigned long lru_pages)
{
	unsigned long freed = 0;
	unsigned long long delta;
	long total_scan;
	long max_pass;
	long nr;
	long new_nr;
	int nid = shrinkctl->nid;
	long batch_size = shrinker->batch ? shrinker->batch
					  : SHRINK_BATCH;

	max_pass = shrinker->count_objects(shrinker, shrinkctl);
	if (max_pass == 0)
		return 0;

	/*
	 * copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);

	total_scan = nr;
	delta = (4 * nr_pages_scanned) / shrinker->seeks;
	delta *= max_pass;
	do_div(delta, lru_pages + 1);
	total_scan += delta;
	if (total_scan < 0) {
		printk(KERN_ERR
		"shrink_slab: %pF negative objects to delete nr=%ld\n",
		       shrinker->scan_objects, total_scan);
		total_scan = max_pass;
	}

	/*
	 * We need to avoid excessive windup on filesystem shrinkers
	 * due to large numbers of GFP_NOFS allocations causing the
	 * shrinkers to return -1 all the time. This results in a large
	 * nr being built up so when a shrink that can do some work
	 * comes along it empties the entire cache due to nr >>>
	 * max_pass.  This is bad for sustaining a working set in
	 * memory.
	 *
	 * Hence only allow the shrinker to scan the entire cache when
	 * a large delta change is calculated directly.
	 */
	if (delta < max_pass / 4)
		total_scan = min(total_scan, max_pass / 2);

	/*
	 * Avoid risking looping forever due to too large nr value:
	 * never try to free more than twice the estimated number of
	 * freeable entries.
	 */
	if (total_scan > max_pass * 2)
		total_scan = max_pass * 2;

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				nr_pages_scanned, lru_pages,
				max_pass, delta, total_scan);

	while (total_scan >= batch_size) {
		unsigned long ret;

		shrinkctl->nr_to_scan = batch_size;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		count_vm_events(SLABS_SCANNED, batch_size);
		total_scan -= batch_size;

		cond_resched();
	}

	/*
	 * move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates. If we exhausted the
	 * scan, there is no need to do an update.
	 */
	if (total_scan > 0)
		new_nr = atomic_long_add_return(total_scan,
						&shrinker->nr_deferred[nid]);
	else
		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);

	trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
	return freed;
}
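/*
 * Worked example of the proportioning above, assuming
 * seeks == DEFAULT_SEEKS (2): with nr_pages_scanned = 1000,
 * lru_pages = 100000 and max_pass = 50000 freeable objects,
 *
 *	delta = (4 * 1000 / 2) * 50000 / 100001 ≈ 1000
 *
 * so scanning 1% of the LRU asks this shrinker to scan roughly 2% of
 * its objects, applied batch_size (SHRINK_BATCH or ->batch) at a time.
 */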

/*
 * Call the shrink functions to age shrinkable caches
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object. With this in mind we age equal
 * percentages of the lru and ageable caches. This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increases the pressure on
 * slab to avoid swapping.
 *
 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
 *
 * `lru_pages' represents the number of on-LRU pages in all the zones which
 * are eligible for the caller's allocation attempt.  It is used for balancing
 * slab reclaim versus page reclaim.
 *
 * Returns the number of slab objects which we shrunk.
 */
unsigned long shrink_slab(struct shrink_control *shrinkctl,
			  unsigned long nr_pages_scanned,
			  unsigned long lru_pages)
{
	struct shrinker *shrinker;
	unsigned long freed = 0;

	if (nr_pages_scanned == 0)
		nr_pages_scanned = SWAP_CLUSTER_MAX;

	if (!down_read_trylock(&shrinker_rwsem)) {
		/*
		 * If we would return 0, our callers would understand that we
		 * have nothing else to shrink and give up trying. By returning
		 * 1 we keep it going and assume we'll be able to shrink next
		 * time.
		 */
		freed = 1;
		goto out;
	}

	list_for_each_entry(shrinker, &shrinker_list, list) {
		for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
			if (!node_online(shrinkctl->nid))
				continue;

			if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
			    (shrinkctl->nid != 0))
				break;

			freed += shrink_slab_node(shrinkctl, shrinker,
				 nr_pages_scanned, lru_pages);

		}
	}
	up_read(&shrinker_rwsem);
out:
	cond_resched();
	return freed;
}

static inline int is_page_cache_freeable(struct page *page)
{
	/*
	 * A freeable page cache page is referenced only by the caller
	 * that isolated the page, the page cache radix tree and
	 * optional buffer heads at page->private.
	 */
	return page_count(page) - page_has_private(page) == 2;
}

static int may_write_to_queue(struct backing_dev_info *bdi,
			      struct scan_control *sc)
{
	if (current->flags & PF_SWAPWRITE)
		return 1;
	if (!bdi_write_congested(bdi))
		return 1;
	if (bdi == current->backing_dev_info)
		return 1;
	return 0;
}

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct page *page, int error)
{
	lock_page(page);
	if (page_mapping(page) == mapping)
		mapping_set_error(mapping, error);
	unlock_page(page);
}

/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping,
			 struct scan_control *sc)
{
	/*
	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking, to prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
	 * stalls if we need to run get_block().  We could test
	 * PagePrivate for that.
	 *
	 * If this process is currently in __generic_file_aio_write() against
	 * this page's queue, we can perform writeback even if that
	 * will block.
	 *
	 * If the page is swapcache, write it back even if that would
	 * block, for some throttling.  This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't reflect the
	 * congestion state of the swapdevs.  Easy to fix, if needed.
	 */
	if (!is_page_cache_freeable(page))
		return PAGE_KEEP;
	if (!mapping) {
		/*
		 * Some data journaling orphaned pages can have
		 * page->mapping == NULL while being dirty with clean buffers.
		 */
		if (page_has_private(page)) {
			if (try_to_free_buffers(page)) {
				ClearPageDirty(page);
				printk("%s: orphaned page\n", __func__);
				return PAGE_CLEAN;
			}
		}
		return PAGE_KEEP;
	}
	if (mapping->a_ops->writepage == NULL)
		return PAGE_ACTIVATE;
	if (!may_write_to_queue(mapping->backing_dev_info, sc))
		return PAGE_KEEP;

	if (clear_page_dirty_for_io(page)) {
		int res;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = SWAP_CLUSTER_MAX,
			.range_start = 0,
			.range_end = LLONG_MAX,
			.for_reclaim = 1,
		};

		SetPageReclaim(page);
		res = mapping->a_ops->writepage(page, &wbc);
		if (res < 0)
			handle_write_error(mapping, page, res);
		if (res == AOP_WRITEPAGE_ACTIVATE) {
			ClearPageReclaim(page);
			return PAGE_ACTIVATE;
		}

		if (!PageWriteback(page)) {
			/* synchronous write or broken a_ops? */
			ClearPageReclaim(page);
		}
		trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
		inc_zone_page_state(page, NR_VMSCAN_WRITE);
		return PAGE_SUCCESS;
	}

	return PAGE_CLEAN;
}
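/*
 * How PAGE_ACTIVATE typically comes about, as a sketch modelled on
 * shmem_writepage() ("toyfs" is hypothetical): a ->writepage()
 * implementation that cannot make progress from reclaim context
 * redirties the page and returns AOP_WRITEPAGE_ACTIVATE with the page
 * still locked, which pageout() above translates into PAGE_ACTIVATE.
 *
 *	static int toyfs_writepage(struct page *page,
 *				   struct writeback_control *wbc)
 *	{
 *		if (wbc->for_reclaim) {
 *			redirty_page_for_writepage(wbc, page);
 *			return AOP_WRITEPAGE_ACTIVATE;	// page left locked
 *		}
 *		...
 *	}
 */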

/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
	BUG_ON(!PageLocked(page));
	BUG_ON(mapping != page_mapping(page));

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * The non-racy check for a busy page.
	 *
	 * Must be careful with the order of the tests. When someone has
	 * a ref to the page, it may be possible that they dirty it then
	 * drop the reference. So if PageDirty is tested before page_count
	 * here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!PageDirty(page)    [good]
	 * SetPageDirty(page);
	 * put_page(page);
	 *				!page_count(page)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
	 * load is not satisfied before that of page->_count.
	 *
	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under tree_lock, then this ordering is not required.
	 */
	if (!page_freeze_refs(page, 2))
		goto cannot_free;
	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
	if (unlikely(PageDirty(page))) {
		page_unfreeze_refs(page, 2);
		goto cannot_free;
	}

	if (PageSwapCache(page)) {
		swp_entry_t swap = { .val = page_private(page) };
		__delete_from_swap_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
		swapcache_free(swap, page);
	} else {
		void (*freepage)(struct page *);

		freepage = mapping->a_ops->freepage;

		__delete_from_page_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
		mem_cgroup_uncharge_cache_page(page);

		if (freepage != NULL)
			freepage(page);
	}

	return 1;

cannot_free:
	spin_unlock_irq(&mapping->tree_lock);
	return 0;
}

/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
	if (__remove_mapping(mapping, page)) {
		/*
		 * Unfreezing the refcount with 1 rather than 2 effectively
		 * drops the pagecache ref for us without requiring another
		 * atomic operation.
		 */
		page_unfreeze_refs(page, 1);
		return 1;
	}
	return 0;
}
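/*
 * The freeze trick, as a sketch (page_freeze_refs() lives in
 * include/linux/pagemap.h; this is an assumption about its shape, not
 * a definition): it is an atomic cmpxchg of page->_count from the
 * expected value down to zero, so it only succeeds while the isolation
 * ref and the page cache ref are the sole references:
 *
 *	page_freeze_refs(page, 2)
 *		=> atomic_cmpxchg(&page->_count, 2, 0) == 2
 *
 * Any concurrent gup or speculative lookup elevates _count and makes
 * the cmpxchg fail, which is why __remove_mapping() bails out via
 * cannot_free in that case.
 */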

/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
void putback_lru_page(struct page *page)
{
	bool is_unevictable;
	int was_unevictable = PageUnevictable(page);

	VM_BUG_ON(PageLRU(page));

redo:
	ClearPageUnevictable(page);

	if (page_evictable(page)) {
		/*
		 * For evictable pages, we can use the cache.
		 * In the event of a race, worst case is we end up with an
		 * unevictable page on [in]active list.
		 * We know how to handle that.
		 */
		is_unevictable = false;
		lru_cache_add(page);
	} else {
		/*
		 * Put unevictable pages directly on zone's unevictable
		 * list.
		 */
		is_unevictable = true;
		add_page_to_unevictable_list(page);
		/*
		 * When racing with an mlock or AS_UNEVICTABLE clearing
		 * (page is unlocked) make sure that if the other thread
		 * does not observe our setting of PG_lru and fails
		 * isolation/check_move_unevictable_pages,
		 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
		 * the page back to the evictable list.
		 *
		 * The other side is TestClearPageMlocked() or shmem_lock().
		 */
		smp_mb();
	}

	/*
	 * page's status can change while we move it among lru. If an evictable
	 * page is on the unevictable list, it will never be freed. To avoid
	 * that, check again after we have added it to the list.
	 */
	if (is_unevictable && page_evictable(page)) {
		if (!isolate_lru_page(page)) {
			put_page(page);
			goto redo;
		}
		/* This means someone else dropped this page from LRU
		 * So, it will be freed or putback to LRU again. There is
		 * nothing to do here.
		 */
	}

	if (was_unevictable && !is_unevictable)
		count_vm_event(UNEVICTABLE_PGRESCUED);
	else if (!was_unevictable && is_unevictable)
		count_vm_event(UNEVICTABLE_PGCULLED);

	put_page(page);		/* drop ref from isolate */
}

enum page_references {
	PAGEREF_RECLAIM,
	PAGEREF_RECLAIM_CLEAN,
	PAGEREF_KEEP,
	PAGEREF_ACTIVATE,
};

static enum page_references page_check_references(struct page *page,
						  struct scan_control *sc)
{
	int referenced_ptes, referenced_page;
	unsigned long vm_flags;

	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
					  &vm_flags);
	referenced_page = TestClearPageReferenced(page);

	/*
	 * Mlock lost the isolation race with us.  Let try_to_unmap()
	 * move the page to the unevictable list.
	 */
	if (vm_flags & VM_LOCKED)
		return PAGEREF_RECLAIM;

	if (referenced_ptes) {
		if (PageSwapBacked(page))
			return PAGEREF_ACTIVATE;
		/*
		 * All mapped pages start out with page table
		 * references from the instantiating fault, so we need
		 * to look twice if a mapped file page is used more
		 * than once.
		 *
		 * Mark it and spare it for another trip around the
		 * inactive list.  Another page table reference will
		 * lead to its activation.
		 *
		 * Note: the mark is set for activated pages as well
		 * so that recently deactivated but used pages are
		 * quickly recovered.
		 */
		SetPageReferenced(page);

		if (referenced_page || referenced_ptes > 1)
			return PAGEREF_ACTIVATE;

		/*
		 * Activate file-backed executable pages after first usage.
		 */
		if (vm_flags & VM_EXEC)
			return PAGEREF_ACTIVATE;

		return PAGEREF_KEEP;
	}

	/* Reclaim if clean, defer dirty pages to writeback */
	if (referenced_page && !PageSwapBacked(page))
		return PAGEREF_RECLAIM_CLEAN;

	return PAGEREF_RECLAIM;
}
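/*
 * Concretely, for a mapped file page on the inactive list: the first
 * page_referenced() hit after the instantiating fault only marks the
 * page (SetPageReferenced) and keeps it inactive (PAGEREF_KEEP); if it
 * is then found referenced again (referenced_page set, more than one
 * referencing pte, or VM_EXEC) it is promoted with PAGEREF_ACTIVATE.
 * Anon (PageSwapBacked) pages activate on the first reference.
 */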

/* Check if a page is dirty or under writeback */
static void page_check_dirty_writeback(struct page *page,
				       bool *dirty, bool *writeback)
{
	struct address_space *mapping;

	/*
	 * Anonymous pages are not handled by flushers and must be written
	 * from reclaim context. Do not stall reclaim based on them.
	 */
	if (!page_is_file_cache(page)) {
		*dirty = false;
		*writeback = false;
		return;
	}

	/* By default assume that the page flags are accurate */
	*dirty = PageDirty(page);
	*writeback = PageWriteback(page);

	/* Verify dirty/writeback state if the filesystem supports it */
	if (!page_has_private(page))
		return;

	mapping = page_mapping(page);
	if (mapping && mapping->a_ops->is_dirty_writeback)
		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}

/*
 * shrink_page_list() returns the number of reclaimed pages
 */
static unsigned long shrink_page_list(struct list_head *page_list,
				      struct zone *zone,
				      struct scan_control *sc,
				      enum ttu_flags ttu_flags,
				      unsigned long *ret_nr_dirty,
				      unsigned long *ret_nr_unqueued_dirty,
				      unsigned long *ret_nr_congested,
				      unsigned long *ret_nr_writeback,
				      unsigned long *ret_nr_immediate,
				      bool force_reclaim)
{
	LIST_HEAD(ret_pages);
	LIST_HEAD(free_pages);
	int pgactivate = 0;
	unsigned long nr_unqueued_dirty = 0;
	unsigned long nr_dirty = 0;
	unsigned long nr_congested = 0;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_writeback = 0;
	unsigned long nr_immediate = 0;

	cond_resched();

	mem_cgroup_uncharge_start();
	while (!list_empty(page_list)) {
		struct address_space *mapping;
		struct page *page;
		int may_enter_fs;
		enum page_references references = PAGEREF_RECLAIM_CLEAN;
		bool dirty, writeback;

		cond_resched();

		page = lru_to_page(page_list);
		list_del(&page->lru);

		if (!trylock_page(page))
			goto keep;

		VM_BUG_ON(PageActive(page));
		VM_BUG_ON(page_zone(page) != zone);

		sc->nr_scanned++;

		if (unlikely(!page_evictable(page)))
			goto cull_mlocked;

		if (!sc->may_unmap && page_mapped(page))
			goto keep_locked;

		/* Double the slab pressure for mapped and swapcache pages */
		if (page_mapped(page) || PageSwapCache(page))
			sc->nr_scanned++;

		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

		/*
		 * The number of dirty pages determines if a zone is marked
		 * reclaim_congested which affects wait_iff_congested. kswapd
		 * will stall and start writing pages if the tail of the LRU
		 * is all dirty unqueued pages.
		 */
		page_check_dirty_writeback(page, &dirty, &writeback);
		if (dirty || writeback)
			nr_dirty++;

		if (dirty && !writeback)
			nr_unqueued_dirty++;

		/*
		 * Treat this page as congested if the underlying BDI is or if
		 * pages are cycling through the LRU so quickly that the
		 * pages marked for immediate reclaim are making it to the
		 * end of the LRU a second time.
		 */
		mapping = page_mapping(page);
		if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
		    (writeback && PageReclaim(page)))
			nr_congested++;

		/*
		 * If a page at the tail of the LRU is under writeback, there
		 * are three cases to consider.
		 *
		 * 1) If reclaim is encountering an excessive number of pages
		 *    under writeback and this page is both under writeback and
		 *    PageReclaim then it indicates that pages are being queued
		 *    for IO but are being recycled through the LRU before the
		 *    IO can complete. Waiting on the page itself risks an
		 *    indefinite stall if it is impossible to writeback the
		 *    page due to IO error or disconnected storage so instead
		 *    note that the LRU is being scanned too quickly and the
		 *    caller can stall after the page list has been processed.
		 *
		 * 2) Global reclaim encounters a page, or memcg encounters a
		 *    page that is not marked for immediate reclaim, or the
		 *    caller does not have __GFP_IO. In this case mark
		 *    the page for immediate reclaim and continue scanning.
		 *
		 *    __GFP_IO is checked because a loop driver thread might
		 *    enter reclaim, and deadlock if it waits on a page for
		 *    which it is needed to do the write (loop masks off
		 *    __GFP_IO|__GFP_FS for this reason); but more thought
		 *    would probably show more reasons.
		 *
		 *    Don't require __GFP_FS, since we're not going into the
		 *    FS, just waiting on its writeback completion. Worryingly,
		 *    ext4 gfs2 and xfs allocate pages with
		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
		 *    may_enter_fs here is liable to OOM on them.
		 *
		 * 3) memcg encounters a page that is not already marked
		 *    PageReclaim. memcg does not have any dirty pages
		 *    throttling so we could easily OOM just because too many
		 *    pages are in writeback and there is nothing else to
		 *    reclaim. Wait for the writeback to complete.
		 */
		if (PageWriteback(page)) {
			/* Case 1 above */
			if (current_is_kswapd() &&
			    PageReclaim(page) &&
			    zone_is_reclaim_writeback(zone)) {
				nr_immediate++;
				goto keep_locked;

			/* Case 2 above */
			} else if (global_reclaim(sc) ||
			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
				/*
				 * This is slightly racy - end_page_writeback()
				 * might have just cleared PageReclaim, then
				 * setting PageReclaim here ends up interpreted
				 * as PageReadahead - but that does not matter
				 * enough to care.  What we do want is for this
				 * page to have PageReclaim set next time memcg
				 * reclaim reaches the tests above, so it will
				 * then wait_on_page_writeback() to avoid OOM;
				 * and it's also appropriate in global reclaim.
				 */
				SetPageReclaim(page);
				nr_writeback++;

				goto keep_locked;

			/* Case 3 above */
			} else {
				wait_on_page_writeback(page);
			}
		}

		if (!force_reclaim)
			references = page_check_references(page, sc);

		switch (references) {
		case PAGEREF_ACTIVATE:
			goto activate_locked;
		case PAGEREF_KEEP:
			goto keep_locked;
		case PAGEREF_RECLAIM:
		case PAGEREF_RECLAIM_CLEAN:
			; /* try to reclaim the page below */
		}

		/*
		 * Anonymous process memory has backing store?
		 * Try to allocate it some swap space here.
		 */
		if (PageAnon(page) && !PageSwapCache(page)) {
			if (!(sc->gfp_mask & __GFP_IO))
				goto keep_locked;
			if (!add_to_swap(page, page_list))
				goto activate_locked;
			may_enter_fs = 1;

			/* Adding to swap updated mapping */
			mapping = page_mapping(page);
		}

		/*
		 * The page is mapped into the page tables of one or more
		 * processes. Try to unmap it here.
		 */
		if (page_mapped(page) && mapping) {
			switch (try_to_unmap(page, ttu_flags)) {
			case SWAP_FAIL:
				goto activate_locked;
			case SWAP_AGAIN:
				goto keep_locked;
			case SWAP_MLOCK:
				goto cull_mlocked;
			case SWAP_SUCCESS:
				; /* try to free the page below */
			}
		}

		if (PageDirty(page)) {
			/*
			 * Only kswapd can writeback filesystem pages to
			 * avoid risk of stack overflow but only writeback
			 * if many dirty pages have been encountered.
			 */
			if (page_is_file_cache(page) &&
					(!current_is_kswapd() ||
					 !zone_is_reclaim_dirty(zone))) {
				/*
				 * Immediately reclaim when written back.
				 * Similar in principle to deactivate_page()
				 * except we already have the page isolated
				 * and know it's dirty
				 */
				inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
				SetPageReclaim(page);

				goto keep_locked;
			}

			if (references == PAGEREF_RECLAIM_CLEAN)
				goto keep_locked;
			if (!may_enter_fs)
				goto keep_locked;
			if (!sc->may_writepage)
				goto keep_locked;

			/* Page is dirty, try to write it out here */
			switch (pageout(page, mapping, sc)) {
			case PAGE_KEEP:
				goto keep_locked;
			case PAGE_ACTIVATE:
				goto activate_locked;
			case PAGE_SUCCESS:
				if (PageWriteback(page))
					goto keep;
				if (PageDirty(page))
					goto keep;

				/*
				 * A synchronous write - probably a ramdisk.  Go
				 * ahead and try to reclaim the page.
				 */
				if (!trylock_page(page))
					goto keep;
				if (PageDirty(page) || PageWriteback(page))
					goto keep_locked;
				mapping = page_mapping(page);
			case PAGE_CLEAN:
				; /* try to free the page below */
			}
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 *
		 * We do this even if the page is PageDirty().
		 * try_to_release_page() does not perform I/O, but it is
		 * possible for a page to have PageDirty set, but it is actually
		 * clean (all its buffers are clean).  This happens if the
		 * buffers were written out directly, with submit_bh(). ext3
		 * will do this, as well as the blockdev mapping.
		 * try_to_release_page() will discover that cleanness and will
		 * drop the buffers and mark the page clean - it can be freed.
		 *
		 * Rarely, pages can have buffers and no ->mapping.  These are
		 * the pages which were not successfully invalidated in
		 * truncate_complete_page().  We try to drop those buffers here
		 * and if that worked, and the page is no longer mapped into
		 * process address space (page_count == 1) it can be freed.
		 * Otherwise, leave the page on the LRU so it is swappable.
		 */
		if (page_has_private(page)) {
			if (!try_to_release_page(page, sc->gfp_mask))
				goto activate_locked;
			if (!mapping && page_count(page) == 1) {
				unlock_page(page);
				if (put_page_testzero(page))
					goto free_it;
				else {
					/*
					 * rare race with speculative reference.
					 * the speculative reference will free
					 * this page shortly, so we may
					 * increment nr_reclaimed here (and
					 * leave it off the LRU).
					 */
					nr_reclaimed++;
					continue;
				}
			}
		}

		if (!mapping || !__remove_mapping(mapping, page))
			goto keep_locked;

		/*
		 * At this point, we have no other references and there is
		 * no way to pick any more up (removed from LRU, removed
		 * from pagecache). Can use non-atomic bitops now (and
		 * we obviously don't have to worry about waking up a process
		 * waiting on the page lock, because there are no references.
		 */
		__clear_page_locked(page);
free_it:
		nr_reclaimed++;

		/*
		 * Is there need to periodically free_page_list? It would
		 * appear not as the counts should be low
		 */
		list_add(&page->lru, &free_pages);
		continue;

cull_mlocked:
		if (PageSwapCache(page))
			try_to_free_swap(page);
		unlock_page(page);
		putback_lru_page(page);
		continue;

activate_locked:
		/* Not a candidate for swapping, so reclaim swap space. */
		if (PageSwapCache(page) && vm_swap_full())
			try_to_free_swap(page);
		VM_BUG_ON(PageActive(page));
		SetPageActive(page);
		pgactivate++;
keep_locked:
		unlock_page(page);
keep:
		list_add(&page->lru, &ret_pages);
		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
	}

	free_hot_cold_page_list(&free_pages, 1);

	list_splice(&ret_pages, page_list);
	count_vm_events(PGACTIVATE, pgactivate);
	mem_cgroup_uncharge_end();
	*ret_nr_dirty += nr_dirty;
	*ret_nr_congested += nr_congested;
	*ret_nr_unqueued_dirty += nr_unqueued_dirty;
	*ret_nr_writeback += nr_writeback;
	*ret_nr_immediate += nr_immediate;
	return nr_reclaimed;
}

unsigned long reclaim_clean_pages_from_list(struct zone *zone,
					    struct list_head *page_list)
{
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.priority = DEF_PRIORITY,
		.may_unmap = 1,
	};
	unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
	struct page *page, *next;
	LIST_HEAD(clean_pages);

	list_for_each_entry_safe(page, next, page_list, lru) {
		if (page_is_file_cache(page) && !PageDirty(page) &&
		    !isolated_balloon_page(page)) {
			ClearPageActive(page);
			list_move(&page->lru, &clean_pages);
		}
	}

	ret = shrink_page_list(&clean_pages, zone, &sc,
			TTU_UNMAP|TTU_IGNORE_ACCESS,
			&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
	list_splice(&clean_pages, page_list);
	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
	return ret;
}

/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes defined above
 *
 * returns 0 on success, -ve errno on failure.
 */
int __isolate_lru_page(struct page *page, isolate_mode_t mode)
{
	int ret = -EINVAL;

	/* Only take pages on the LRU. */
	if (!PageLRU(page))
		return ret;

	/* Compaction should not handle unevictable pages but CMA can do so */
	if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
		return ret;

	ret = -EBUSY;

	/*
	 * To minimise LRU disruption, the caller can indicate that it only
	 * wants to isolate pages it will be able to operate on without
	 * blocking - clean pages for the most part.
	 *
	 * ISOLATE_CLEAN means that only clean pages should be isolated. This
	 * is used by reclaim when it cannot write to backing storage.
	 *
	 * ISOLATE_ASYNC_MIGRATE is used to indicate that the caller only
	 * wants pages that can be migrated without blocking.
	 */
	if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
		/* All the caller can do on PageWriteback is block */
		if (PageWriteback(page))
			return ret;

		if (PageDirty(page)) {
			struct address_space *mapping;

			/* ISOLATE_CLEAN means only clean pages */
			if (mode & ISOLATE_CLEAN)
				return ret;

			/*
			 * Only pages without mappings or that have a
			 * ->migratepage callback are possible to migrate
			 * without blocking
			 */
			mapping = page_mapping(page);
			if (mapping && !mapping->a_ops->migratepage)
				return ret;
		}
	}

	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
		return ret;

	if (likely(get_page_unless_zero(page))) {
		/*
		 * Be careful not to clear PageLRU until after we're
		 * sure the page is not being freed elsewhere -- the
		 * page release code relies on it.
		 */
		ClearPageLRU(page);
		ret = 0;
	}

	return ret;
}
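/*
 * Callers derive the isolation mode from their scan_control, exactly as
 * shrink_inactive_list() and shrink_active_list() below do:
 *
 *	isolate_mode_t isolate_mode = 0;
 *
 *	if (!sc->may_unmap)
 *		isolate_mode |= ISOLATE_UNMAPPED;
 *	if (!sc->may_writepage)
 *		isolate_mode |= ISOLATE_CLEAN;
 */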

/*
 * zone->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan:	The number of pages to look through on the list.
 * @lruvec:	The LRU vector to pull pages from.
 * @dst:	The temp list to put pages on to.
 * @nr_scanned:	The number of pages that were scanned.
 * @sc:		The scan_control struct for this reclaim session
 * @mode:	One of the LRU isolation modes
 * @lru:	LRU list id for isolating
 *
 * returns how many pages were moved onto *@dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct lruvec *lruvec, struct list_head *dst,
		unsigned long *nr_scanned, struct scan_control *sc,
		isolate_mode_t mode, enum lru_list lru)
{
	struct list_head *src = &lruvec->lists[lru];
	unsigned long nr_taken = 0;
	unsigned long scan;

	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
		struct page *page;
		int nr_pages;

		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);

		VM_BUG_ON(!PageLRU(page));

		switch (__isolate_lru_page(page, mode)) {
		case 0:
			nr_pages = hpage_nr_pages(page);
			mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
			list_move(&page->lru, dst);
			nr_taken += nr_pages;
			break;

		case -EBUSY:
			/* else it is being freed elsewhere */
			list_move(&page->lru, src);
			continue;

		default:
			BUG();
		}
	}

	*nr_scanned = scan;
	trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
				    nr_taken, mode, is_file_lru(lru));
	return nr_taken;
}

/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set. That flag
 * may need to be cleared by the caller before letting the page go.
 *
 * The vmstat statistic corresponding to the list on which the page was
 * found will be decremented.
 *
 * Restrictions:
 * (1) Must be called with an elevated refcount on the page. This is a
 *     fundamental difference from isolate_lru_pages (which is called
 *     without a stable reference).
 * (2) the lru_lock must not be held.
 * (3) interrupts must be enabled.
 */
int isolate_lru_page(struct page *page)
{
	int ret = -EBUSY;

	VM_BUG_ON(!page_count(page));

	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);
		struct lruvec *lruvec;

		spin_lock_irq(&zone->lru_lock);
		lruvec = mem_cgroup_page_lruvec(page, zone);
		if (PageLRU(page)) {
			int lru = page_lru(page);
			get_page(page);
			ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, lru);
			ret = 0;
		}
		spin_unlock_irq(&zone->lru_lock);
	}
	return ret;
}
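/*
 * Typical usage, as a sketch honouring the restrictions above (the
 * caller already holds a reference, e.g. from a pagevec or gup):
 *
 *	if (isolate_lru_page(page) == 0) {
 *		// page is off its LRU; we own the extra LRU reference
 *		...operate on the page...
 *		putback_lru_page(page);	// re-adds it and drops that ref
 *	}
 */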

/*
 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
 * then get rescheduled. When there is a massive number of tasks doing page
 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
 * the LRU list will go small and be scanned faster than necessary, leading to
 * unnecessary swapping, thrashing and OOM.
 */
static int too_many_isolated(struct zone *zone, int file,
			     struct scan_control *sc)
{
	unsigned long inactive, isolated;

	if (current_is_kswapd())
		return 0;

	if (!global_reclaim(sc))
		return 0;

	if (file) {
		inactive = zone_page_state(zone, NR_INACTIVE_FILE);
		isolated = zone_page_state(zone, NR_ISOLATED_FILE);
	} else {
		inactive = zone_page_state(zone, NR_INACTIVE_ANON);
		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
	}

	/*
	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
	 * won't get blocked by normal direct-reclaimers, forming a circular
	 * deadlock.
	 */
	if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
		inactive >>= 3;

	return isolated > inactive;
}
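/*
 * Example of the threshold above: with 8192 inactive file pages, a
 * GFP_KERNEL direct reclaimer (__GFP_IO and __GFP_FS both set) is
 * throttled once more than 8192 >> 3 = 1024 file pages are isolated,
 * while a GFP_NOFS caller is compared against the full 8192, so it
 * cannot deadlock behind already-throttled GFP_KERNEL reclaimers.
 */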

static noinline_for_stack void
putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
	struct zone *zone = lruvec_zone(lruvec);
	LIST_HEAD(pages_to_free);

	/*
	 * Put back any unfreeable pages.
	 */
	while (!list_empty(page_list)) {
		struct page *page = lru_to_page(page_list);
		int lru;

		VM_BUG_ON(PageLRU(page));
		list_del(&page->lru);
		if (unlikely(!page_evictable(page))) {
			spin_unlock_irq(&zone->lru_lock);
			putback_lru_page(page);
			spin_lock_irq(&zone->lru_lock);
			continue;
		}

		lruvec = mem_cgroup_page_lruvec(page, zone);

		SetPageLRU(page);
		lru = page_lru(page);
		add_page_to_lru_list(page, lruvec, lru);

		if (is_active_lru(lru)) {
			int file = is_file_lru(lru);
			int numpages = hpage_nr_pages(page);
			reclaim_stat->recent_rotated[file] += numpages;
		}
		if (put_page_testzero(page)) {
			__ClearPageLRU(page);
			__ClearPageActive(page);
			del_page_from_lru_list(page, lruvec, lru);

			if (unlikely(PageCompound(page))) {
				spin_unlock_irq(&zone->lru_lock);
				(*get_compound_page_dtor(page))(page);
				spin_lock_irq(&zone->lru_lock);
			} else
				list_add(&page->lru, &pages_to_free);
		}
	}

	/*
	 * To save our caller's stack, now use input list for pages to free.
	 */
	list_splice(&pages_to_free, page_list);
}

/*
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
 * of reclaimed pages
 */
static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
		     struct scan_control *sc, enum lru_list lru)
{
	LIST_HEAD(page_list);
	unsigned long nr_scanned;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_taken;
	unsigned long nr_dirty = 0;
	unsigned long nr_congested = 0;
	unsigned long nr_unqueued_dirty = 0;
	unsigned long nr_writeback = 0;
	unsigned long nr_immediate = 0;
	isolate_mode_t isolate_mode = 0;
	int file = is_file_lru(lru);
	struct zone *zone = lruvec_zone(lruvec);
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;

	while (unlikely(too_many_isolated(zone, file, sc))) {
		congestion_wait(BLK_RW_ASYNC, HZ/10);

		/* We are about to die and free our memory. Return now. */
		if (fatal_signal_pending(current))
			return SWAP_CLUSTER_MAX;
	}

	lru_add_drain();

	if (!sc->may_unmap)
		isolate_mode |= ISOLATE_UNMAPPED;
	if (!sc->may_writepage)
		isolate_mode |= ISOLATE_CLEAN;

	spin_lock_irq(&zone->lru_lock);

	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
				     &nr_scanned, sc, isolate_mode, lru);

	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);

	if (global_reclaim(sc)) {
		zone->pages_scanned += nr_scanned;
		if (current_is_kswapd())
			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
		else
			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
	}
	spin_unlock_irq(&zone->lru_lock);

	if (nr_taken == 0)
		return 0;

	nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
				&nr_dirty, &nr_unqueued_dirty, &nr_congested,
				&nr_writeback, &nr_immediate,
				false);

	spin_lock_irq(&zone->lru_lock);

	reclaim_stat->recent_scanned[file] += nr_taken;

	if (global_reclaim(sc)) {
		if (current_is_kswapd())
			__count_zone_vm_events(PGSTEAL_KSWAPD, zone,
					       nr_reclaimed);
		else
			__count_zone_vm_events(PGSTEAL_DIRECT, zone,
					       nr_reclaimed);
	}

	putback_inactive_pages(lruvec, &page_list);

	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);

	spin_unlock_irq(&zone->lru_lock);

	free_hot_cold_page_list(&page_list, 1);

	/*
	 * If reclaim is isolating dirty pages under writeback, it implies
	 * that the long-lived page allocation rate is exceeding the page
	 * laundering rate. Either the global limits are not being effective
	 * at throttling processes due to the page distribution throughout
	 * zones or there is heavy usage of a slow backing device. The
	 * only option is to throttle from reclaim context which is not ideal
	 * as there is no guarantee the dirtying process is throttled in the
	 * same way balance_dirty_pages() manages.
	 *
	 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
	 * of pages under writeback that are flagged for immediate reclaim and
	 * stall if any are encountered in the nr_immediate check below.
	 */
	if (nr_writeback && nr_writeback == nr_taken)
		zone_set_flag(zone, ZONE_WRITEBACK);

	/*
	 * memcg will stall in page writeback so only consider forcibly
	 * stalling for global reclaim
	 */
	if (global_reclaim(sc)) {
		/*
		 * Tag a zone as congested if all the dirty pages scanned were
		 * backed by a congested BDI and wait_iff_congested will stall.
		 */
		if (nr_dirty && nr_dirty == nr_congested)
			zone_set_flag(zone, ZONE_CONGESTED);

		/*
		 * If dirty pages are scanned that are not queued for IO, it
		 * implies that flushers are not keeping up. In this case, flag
		 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
		 * pages from reclaim context. It will forcibly stall in the
		 * next check.
		 */
		if (nr_unqueued_dirty == nr_taken)
			zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);

		/*
		 * In addition, if kswapd scans pages marked for immediate
		 * reclaim and under writeback (nr_immediate), it implies
		 * that pages are cycling through the LRU faster than
		 * they are written so also forcibly stall.
		 */
		if (nr_unqueued_dirty == nr_taken || nr_immediate)
			congestion_wait(BLK_RW_ASYNC, HZ/10);
	}

	/*
	 * Stall direct reclaim for IO completions if underlying BDIs or zone
	 * is congested. Allow kswapd to continue until it starts encountering
	 * unqueued dirty pages or cycling through the LRU too quickly.
	 */
	if (!sc->hibernation_mode && !current_is_kswapd())
		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);

	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
		zone_idx(zone),
		nr_scanned, nr_reclaimed,
		sc->priority,
		trace_shrink_flags(file));
	return nr_reclaimed;
}

/*
 * This moves pages from the active list to the inactive list.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold zone->lru_lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()) so we
 * should drop zone->lru_lock around each page.  It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_count against each page.
 * But we had to alter page->flags anyway.
 */

static void move_active_pages_to_lru(struct lruvec *lruvec,
				     struct list_head *list,
				     struct list_head *pages_to_free,
				     enum lru_list lru)
{
	struct zone *zone = lruvec_zone(lruvec);
	unsigned long pgmoved = 0;
	struct page *page;
	int nr_pages;

	while (!list_empty(list)) {
		page = lru_to_page(list);
		lruvec = mem_cgroup_page_lruvec(page, zone);

		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);

		nr_pages = hpage_nr_pages(page);
		mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
		list_move(&page->lru, &lruvec->lists[lru]);
		pgmoved += nr_pages;

		if (put_page_testzero(page)) {
			__ClearPageLRU(page);
			__ClearPageActive(page);
			del_page_from_lru_list(page, lruvec, lru);

			if (unlikely(PageCompound(page))) {
				spin_unlock_irq(&zone->lru_lock);
				(*get_compound_page_dtor(page))(page);
				spin_lock_irq(&zone->lru_lock);
			} else
				list_add(&page->lru, pages_to_free);
		}
	}
	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
	if (!is_active_lru(lru))
		__count_vm_events(PGDEACTIVATE, pgmoved);
}

static void shrink_active_list(unsigned long nr_to_scan,
			       struct lruvec *lruvec,
			       struct scan_control *sc,
			       enum lru_list lru)
{
	unsigned long nr_taken;
	unsigned long nr_scanned;
	unsigned long vm_flags;
	LIST_HEAD(l_hold);	/* The pages which were snipped off */
	LIST_HEAD(l_active);
	LIST_HEAD(l_inactive);
	struct page *page;
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
	unsigned long nr_rotated = 0;
	isolate_mode_t isolate_mode = 0;
	int file = is_file_lru(lru);
	struct zone *zone = lruvec_zone(lruvec);

	lru_add_drain();

	if (!sc->may_unmap)
		isolate_mode |= ISOLATE_UNMAPPED;
	if (!sc->may_writepage)
		isolate_mode |= ISOLATE_CLEAN;

	spin_lock_irq(&zone->lru_lock);

	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
				     &nr_scanned, sc, isolate_mode, lru);
	if (global_reclaim(sc))
		zone->pages_scanned += nr_scanned;

	reclaim_stat->recent_scanned[file] += nr_taken;

	__count_zone_vm_events(PGREFILL, zone, nr_scanned);
	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
	spin_unlock_irq(&zone->lru_lock);

	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);

		if (unlikely(!page_evictable(page))) {
			putback_lru_page(page);
			continue;
		}

		if (unlikely(buffer_heads_over_limit)) {
			if (page_has_private(page) && trylock_page(page)) {
				if (page_has_private(page))
					try_to_release_page(page, 0);
				unlock_page(page);
			}
		}

		if (page_referenced(page, 0, sc->target_mem_cgroup,
				    &vm_flags)) {
			nr_rotated += hpage_nr_pages(page);
			/*
			 * Identify referenced, file-backed active pages and
			 * give them one more trip around the active list, so
			 * that executable code gets a better chance to stay in
			 * memory under moderate memory pressure.  Anon pages
			 * are not likely to be evicted by use-once streaming
			 * IO, plus JVM can create lots of anon VM_EXEC pages,
			 * so we ignore them here.
			 */
			if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
				list_add(&page->lru, &l_active);
				continue;
			}
		}

		ClearPageActive(page);	/* we are de-activating */
		list_add(&page->lru, &l_inactive);
	}

	/*
	 * Move pages back to the lru list.
	 */
	spin_lock_irq(&zone->lru_lock);
	/*
	 * Count referenced pages from currently used mappings as rotated,
	 * even though only some of them are actually re-activated.  This
	 * helps balance scan pressure between file and anonymous pages in
	 * get_scan_ratio.
	 */
	reclaim_stat->recent_rotated[file] += nr_rotated;

	move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
	move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
	spin_unlock_irq(&zone->lru_lock);

	free_hot_cold_page_list(&l_hold, 1);
}

#ifdef CONFIG_SWAP
static int inactive_anon_is_low_global(struct zone *zone)
{
	unsigned long active, inactive;

	active = zone_page_state(zone, NR_ACTIVE_ANON);
	inactive = zone_page_state(zone, NR_INACTIVE_ANON);

	if (inactive * zone->inactive_ratio < active)
		return 1;

	return 0;
}

/**
 * inactive_anon_is_low - check if anonymous pages need to be deactivated
 * @lruvec: LRU vector to check
 *
 * Returns true if the zone does not have enough inactive anon pages,
 * meaning some active anon pages need to be deactivated.
 */
static int inactive_anon_is_low(struct lruvec *lruvec)
{
	/*
	 * If we don't have swap space, anonymous page deactivation
	 * is pointless.
	 */
	if (!total_swap_pages)
		return 0;

	if (!mem_cgroup_disabled())
		return mem_cgroup_inactive_anon_is_low(lruvec);

	return inactive_anon_is_low_global(lruvec_zone(lruvec));
}
#else
static inline int inactive_anon_is_low(struct lruvec *lruvec)
{
	return 0;
}
#endif
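/*
 * Worked example (zone->inactive_ratio is sized at boot, roughly
 * sqrt(10 * zone size in GB) per mm/page_alloc.c - an assumption about
 * code outside this file): a 1GB zone gets inactive_ratio 3, so with
 * 300000 active and 90000 inactive anon pages, 90000 * 3 < 300000 and
 * inactive_anon_is_low_global() asks for some active anon pages to be
 * deactivated.
 */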
 */
static int inactive_file_is_low(struct lruvec *lruvec)
{
	unsigned long inactive;
	unsigned long active;

	inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
	active = get_lru_size(lruvec, LRU_ACTIVE_FILE);

	return active > inactive;
}

static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
{
	if (is_file_lru(lru))
		return inactive_file_is_low(lruvec);
	else
		return inactive_anon_is_low(lruvec);
}

static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
				 struct lruvec *lruvec, struct scan_control *sc)
{
	if (is_active_lru(lru)) {
		if (inactive_list_is_low(lruvec, lru))
			shrink_active_list(nr_to_scan, lruvec, sc, lru);
		return 0;
	}

	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}

static int vmscan_swappiness(struct scan_control *sc)
{
	if (global_reclaim(sc))
		return vm_swappiness;
	return mem_cgroup_swappiness(sc->target_mem_cgroup);
}

enum scan_balance {
	SCAN_EQUAL,
	SCAN_FRACT,
	SCAN_ANON,
	SCAN_FILE,
};

/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.  The relative value of each set of LRU lists is determined
 * by looking at the fraction of the scanned pages that were rotated
 * back onto the active list instead of being evicted.
 *
 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
 */
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
			   unsigned long *nr)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
	u64 fraction[2];
	u64 denominator = 0;	/* gcc */
	struct zone *zone = lruvec_zone(lruvec);
	unsigned long anon_prio, file_prio;
	enum scan_balance scan_balance;
	unsigned long anon, file, free;
	bool force_scan = false;
	unsigned long ap, fp;
	enum lru_list lru;

	/*
	 * If the zone or memcg is small, nr[l] can be 0.  This
	 * results in no scanning on this priority and a potential
	 * priority drop.  Global direct reclaim can go to the next
	 * zone and tends to have no problems.  Global kswapd is for
	 * zone balancing and it needs to scan a minimum amount.  When
	 * reclaiming for a memcg, a priority drop can cause high
	 * latencies, so it's better to scan a minimum amount there as
	 * well.
	 */
	if (current_is_kswapd() && !zone_reclaimable(zone))
		force_scan = true;
	if (!global_reclaim(sc))
		force_scan = true;

	/* If we have no swap space, do not bother scanning anon pages. */
	if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	/*
	 * Global reclaim will swap to prevent OOM even with no
	 * swappiness, but memcg users want to use this knob to
	 * disable swapping for individual groups completely when
	 * using the memory controller's swap limit feature would be
	 * too expensive.
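	 * E.g. a memcg with swappiness 0 gets SCAN_FILE here, while
	 * global reclaim at swappiness 0 may still fall through to the
	 * SCAN_ANON case below when file cache is nearly depleted.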
	 */
	if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	/*
	 * Do not apply any pressure balancing cleverness when the
	 * system is close to OOM, scan both anon and file equally
	 * (unless the swappiness setting disagrees with swapping).
	 */
	if (!sc->priority && vmscan_swappiness(sc)) {
		scan_balance = SCAN_EQUAL;
		goto out;
	}

	anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
		get_lru_size(lruvec, LRU_INACTIVE_ANON);
	file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
		get_lru_size(lruvec, LRU_INACTIVE_FILE);

	/*
	 * If it's foreseeable that reclaiming the file cache won't be
	 * enough to get the zone back into a desirable shape, we have
	 * to swap.  Better start now and leave the - probably heavily
	 * thrashing - remaining file pages alone.
	 */
	if (global_reclaim(sc)) {
		free = zone_page_state(zone, NR_FREE_PAGES);
		if (unlikely(file + free <= high_wmark_pages(zone))) {
			scan_balance = SCAN_ANON;
			goto out;
		}
	}

	/*
	 * There is enough inactive page cache, do not reclaim
	 * anything from the anonymous working set right now.
	 */
	if (!inactive_file_is_low(lruvec)) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	scan_balance = SCAN_FRACT;

	/*
	 * With swappiness at 100, anonymous and file have the same priority.
	 * This scanning priority is essentially the inverse of IO cost.
	 */
	anon_prio = vmscan_swappiness(sc);
	file_prio = 200 - anon_prio;

	/*
	 * OK, so we have swap space and a fair amount of page cache
	 * pages.  We use the recently rotated / recently scanned
	 * ratios to determine how valuable each cache is.
	 *
	 * Because workloads change over time (and to avoid overflow)
	 * we keep these statistics as a floating average, which ends
	 * up weighing recent references more than old ones.
	 *
	 * anon in [0], file in [1]
	 */
	spin_lock_irq(&zone->lru_lock);
	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
		reclaim_stat->recent_scanned[0] /= 2;
		reclaim_stat->recent_rotated[0] /= 2;
	}

	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
		reclaim_stat->recent_scanned[1] /= 2;
		reclaim_stat->recent_rotated[1] /= 2;
	}

	/*
	 * The amount of pressure on anon vs file pages is inversely
	 * proportional to the fraction of recently scanned pages on
	 * each list that were recently referenced and in active use.
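	 *
	 * Illustrative numbers with vm_swappiness = 60 (anon_prio = 60,
	 * file_prio = 140): if anon rotated 50 of its last 100 scanned
	 * pages while file rotated only 10 of 100, then ap = 60 * 101 /
	 * 51 = ~118 and fp = 140 * 101 / 11 = ~1285, so roughly 8% of
	 * the pressure goes to anon and 92% to file.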
	 */
	ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
	ap /= reclaim_stat->recent_rotated[0] + 1;

	fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
	fp /= reclaim_stat->recent_rotated[1] + 1;
	spin_unlock_irq(&zone->lru_lock);

	fraction[0] = ap;
	fraction[1] = fp;
	denominator = ap + fp + 1;
out:
	for_each_evictable_lru(lru) {
		int file = is_file_lru(lru);
		unsigned long size;
		unsigned long scan;

		size = get_lru_size(lruvec, lru);
		scan = size >> sc->priority;

		if (!scan && force_scan)
			scan = min(size, SWAP_CLUSTER_MAX);

		switch (scan_balance) {
		case SCAN_EQUAL:
			/* Scan lists relative to size */
			break;
		case SCAN_FRACT:
			/*
			 * Scan types proportional to swappiness and
			 * their relative recent reclaim efficiency.
			 */
			scan = div64_u64(scan * fraction[file], denominator);
			break;
		case SCAN_FILE:
		case SCAN_ANON:
			/* Scan one type exclusively */
			if ((scan_balance == SCAN_FILE) != file)
				scan = 0;
			break;
		default:
			/* Look ma, no brain */
			BUG();
		}
		nr[lru] = scan;
	}
}

/*
 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
	unsigned long nr[NR_LRU_LISTS];
	unsigned long targets[NR_LRU_LISTS];
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
	struct blk_plug plug;
	bool scan_adjusted = false;

	get_scan_count(lruvec, sc, nr);

	/* Record the original scan target for proportional adjustments later */
	memcpy(targets, nr, sizeof(nr));

	blk_start_plug(&plug);
	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
					nr[LRU_INACTIVE_FILE]) {
		unsigned long nr_anon, nr_file, percentage;
		unsigned long nr_scanned;

		for_each_evictable_lru(lru) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;

				nr_reclaimed += shrink_list(lru, nr_to_scan,
							    lruvec, sc);
			}
		}

		if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
			continue;

		/*
		 * For global direct reclaim, reclaim only the number of pages
		 * requested.  Less care is taken to scan proportionally as it
		 * is more important to minimise direct reclaim stall latency
		 * than it is to properly age the LRU lists.
		 */
		if (global_reclaim(sc) && !current_is_kswapd())
			break;

		/*
		 * For kswapd and memcg, reclaim at least the number of pages
		 * requested.  Ensure that the anon and file LRUs shrink
		 * proportionally to what was requested by get_scan_count().
		 * We stop reclaiming one LRU and reduce the amount of
		 * scanning proportional to the original scan target.
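		 *
		 * For example, if the anon LRUs were exhausted after 75%
		 * of their original target had been scanned, the file
		 * targets are rescaled below so that file scanning also
		 * stops at roughly 75% of its original target.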
		 */
		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];

		if (nr_file > nr_anon) {
			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
						targets[LRU_ACTIVE_ANON] + 1;
			lru = LRU_BASE;
			percentage = nr_anon * 100 / scan_target;
		} else {
			unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
						targets[LRU_ACTIVE_FILE] + 1;
			lru = LRU_FILE;
			percentage = nr_file * 100 / scan_target;
		}

		/* Stop scanning the smaller of the LRUs */
		nr[lru] = 0;
		nr[lru + LRU_ACTIVE] = 0;

		/*
		 * Recalculate the other LRU scan count based on its original
		 * scan target and the percentage scanning already complete
		 */
		lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
		nr_scanned = targets[lru] - nr[lru];
		nr[lru] = targets[lru] * (100 - percentage) / 100;
		nr[lru] -= min(nr[lru], nr_scanned);

		lru += LRU_ACTIVE;
		nr_scanned = targets[lru] - nr[lru];
		nr[lru] = targets[lru] * (100 - percentage) / 100;
		nr[lru] -= min(nr[lru], nr_scanned);

		scan_adjusted = true;
	}
	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;

	/*
	 * Even if we did not try to evict anon pages at all, we want to
	 * rebalance the anon lru active/inactive ratio.
	 */
	if (inactive_anon_is_low(lruvec))
		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
				   sc, LRU_ACTIVE_ANON);

	throttle_vm_writeout(sc->gfp_mask);
}

/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(struct scan_control *sc)
{
	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
			 sc->priority < DEF_PRIORITY - 2))
		return true;

	return false;
}

/*
 * Reclaim/compaction is used for high-order allocation requests.  It reclaims
 * order-0 pages before compacting the zone.  should_continue_reclaim() returns
 * true if more pages should be reclaimed such that when the page allocator
 * calls try_to_compact_zone() it will have enough free pages to succeed.
 * It will give up earlier than that if there is difficulty reclaiming pages.
 */
static inline bool should_continue_reclaim(struct zone *zone,
					   unsigned long nr_reclaimed,
					   unsigned long nr_scanned,
					   struct scan_control *sc)
{
	unsigned long pages_for_compaction;
	unsigned long inactive_lru_pages;

	/* If not in reclaim/compaction mode, stop */
	if (!in_reclaim_compaction(sc))
		return false;

	/* Consider stopping depending on scan and reclaim activity */
	if (sc->gfp_mask & __GFP_REPEAT) {
		/*
		 * For __GFP_REPEAT allocations, stop reclaiming if the
		 * full LRU list has been scanned and we are still failing
		 * to reclaim pages.  This full LRU scan is potentially
		 * expensive, but a __GFP_REPEAT caller really wants to
		 * succeed.
		 */
		if (!nr_reclaimed && !nr_scanned)
			return false;
	} else {
		/*
		 * For non-__GFP_REPEAT allocations which can presumably
		 * fail without consequence, stop if we failed to reclaim
		 * any pages from the last SWAP_CLUSTER_MAX number of
		 * pages that were scanned.
		 * This will return to the caller faster at the risk that
		 * reclaim/compaction and the resulting allocation attempt
		 * both fail.
		 */
		if (!nr_reclaimed)
			return false;
	}

	/*
	 * If we have not reclaimed enough pages for compaction and the
	 * inactive lists are large enough, continue reclaiming
	 */
	pages_for_compaction = (2UL << sc->order);
	inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
	if (get_nr_swap_pages() > 0)
		inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
	if (sc->nr_reclaimed < pages_for_compaction &&
			inactive_lru_pages > pages_for_compaction)
		return true;

	/* If compaction would go ahead or the allocation would succeed, stop */
	switch (compaction_suitable(zone, sc->order)) {
	case COMPACT_PARTIAL:
	case COMPACT_CONTINUE:
		return false;
	default:
		return true;
	}
}

static void shrink_zone(struct zone *zone, struct scan_control *sc)
{
	unsigned long nr_reclaimed, nr_scanned;

	do {
		struct mem_cgroup *root = sc->target_mem_cgroup;
		struct mem_cgroup_reclaim_cookie reclaim = {
			.zone = zone,
			.priority = sc->priority,
		};
		struct mem_cgroup *memcg;

		nr_reclaimed = sc->nr_reclaimed;
		nr_scanned = sc->nr_scanned;

		memcg = mem_cgroup_iter(root, NULL, &reclaim);
		do {
			struct lruvec *lruvec;

			lruvec = mem_cgroup_zone_lruvec(zone, memcg);

			shrink_lruvec(lruvec, sc);

			/*
			 * Direct reclaim and kswapd have to scan all memory
			 * cgroups to fulfill the overall scan target for the
			 * zone.
			 *
			 * Limit reclaim, on the other hand, only cares about
			 * nr_to_reclaim pages to be reclaimed and it will
			 * retry with decreasing priority if one round over the
			 * whole hierarchy is not sufficient.
			 */
			if (!global_reclaim(sc) &&
					sc->nr_reclaimed >= sc->nr_to_reclaim) {
				mem_cgroup_iter_break(root, memcg);
				break;
			}
			memcg = mem_cgroup_iter(root, memcg, &reclaim);
		} while (memcg);

		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
			   sc->nr_scanned - nr_scanned,
			   sc->nr_reclaimed - nr_reclaimed);

	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
					 sc->nr_scanned - nr_scanned, sc));
}

/* Returns true if compaction should go ahead for a high-order request */
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
	unsigned long balance_gap, watermark;
	bool watermark_ok;

	/* Do not consider compaction for orders reclaim is meant to satisfy */
	if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
		return false;

	/*
	 * Compaction takes time to run and there are potentially other
	 * callers using the pages just freed.
	 * Continue reclaiming until there is a buffer of free pages
	 * available to give compaction a reasonable chance of completing
	 * and allocating the page.
	 */
	balance_gap = min(low_wmark_pages(zone),
		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
			KSWAPD_ZONE_BALANCE_GAP_RATIO);
	watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
	watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);

	/*
	 * If compaction is deferred, reclaim up to a point where
	 * compaction will have a chance of success when re-enabled
	 */
	if (compaction_deferred(zone, sc->order))
		return watermark_ok;

	/* If compaction is not ready to start, keep reclaiming */
	if (!compaction_suitable(zone, sc->order))
		return false;

	return watermark_ok;
}

/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
 * Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
 *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
 *    zone defense algorithm.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 *
 * This function returns true if a zone is being reclaimed for a costly
 * high-order allocation and compaction is ready to begin.  This indicates to
 * the caller that it should consider retrying the allocation instead of
 * further reclaim.
 */
static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
	struct zoneref *z;
	struct zone *zone;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	bool aborted_reclaim = false;

	/*
	 * If the number of buffer_heads in the machine exceeds the maximum
	 * allowed level, force direct reclaim to scan the highmem zone as
	 * highmem pages could be pinning lowmem pages storing buffer_heads
	 */
	if (buffer_heads_over_limit)
		sc->gfp_mask |= __GFP_HIGHMEM;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_zone(sc->gfp_mask), sc->nodemask) {
		if (!populated_zone(zone))
			continue;
		/*
		 * Note that memory controller reclaim has only a small
		 * influence on the global LRU.
		 */
		if (global_reclaim(sc)) {
			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
				continue;
			if (sc->priority != DEF_PRIORITY &&
			    !zone_reclaimable(zone))
				continue;	/* Let kswapd poll it */
			if (IS_ENABLED(CONFIG_COMPACTION)) {
				/*
				 * If we already have plenty of memory free for
				 * compaction in this zone, don't free any more.
				 * Even though compaction is invoked for any
				 * non-zero order, only frequent costly order
				 * reclamation is disruptive enough to become a
				 * noticeable problem, like transparent huge
				 * page allocations.
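				 * "Costly" here means order >
				 * PAGE_ALLOC_COSTLY_ORDER (3), e.g. the
				 * order-9 allocations backing transparent
				 * huge pages.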
				 */
				if (compaction_ready(zone, sc)) {
					aborted_reclaim = true;
					continue;
				}
			}
			/*
			 * This steals pages from memory cgroups over softlimit
			 * and returns the number of reclaimed pages and
			 * scanned pages.  This works for global memory
			 * pressure and balancing, not for a memcg's limit.
			 */
			nr_soft_scanned = 0;
			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
						sc->order, sc->gfp_mask,
						&nr_soft_scanned);
			sc->nr_reclaimed += nr_soft_reclaimed;
			sc->nr_scanned += nr_soft_scanned;
			/* need some check to avoid doing more shrink_zone() */
		}

		shrink_zone(zone, sc);
	}

	return aborted_reclaim;
}

/* All zones in zonelist are unreclaimable? */
static bool all_unreclaimable(struct zonelist *zonelist,
			      struct scan_control *sc)
{
	struct zoneref *z;
	struct zone *zone;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_zone(sc->gfp_mask), sc->nodemask) {
		if (!populated_zone(zone))
			continue;
		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
			continue;
		if (zone_reclaimable(zone))
			return false;
	}

	return true;
}

/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take explicit
 * naps in the hope that some of these pages can be written.  But if the
 * allocating task holds filesystem locks which prevent writeout this might not
 * work, and the allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 *		else, the number of pages reclaimed
 */
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					  struct scan_control *sc,
					  struct shrink_control *shrink)
{
	unsigned long total_scanned = 0;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	struct zoneref *z;
	struct zone *zone;
	unsigned long writeback_threshold;
	bool aborted_reclaim;

	delayacct_freepages_start();

	if (global_reclaim(sc))
		count_vm_event(ALLOCSTALL);

	do {
		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
				sc->priority);
		sc->nr_scanned = 0;
		aborted_reclaim = shrink_zones(zonelist, sc);

		/*
		 * Don't shrink slabs when reclaiming memory from over limit
		 * cgroups but do shrink slab at least once when aborting
		 * reclaim for compaction to avoid unevenly scanning file/anon
		 * LRU pages over slab pages.
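		 *
		 * shrink_slab() below asks the registered shrinkers to
		 * scan a number of objects roughly proportional to
		 * sc->nr_scanned / lru_pages, scaled by each shrinker's
		 * own cost model.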
		 */
		if (global_reclaim(sc)) {
			unsigned long lru_pages = 0;

			nodes_clear(shrink->nodes_to_scan);
			for_each_zone_zonelist(zone, z, zonelist,
					gfp_zone(sc->gfp_mask)) {
				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
					continue;

				lru_pages += zone_reclaimable_pages(zone);
				node_set(zone_to_nid(zone),
					 shrink->nodes_to_scan);
			}

			shrink_slab(shrink, sc->nr_scanned, lru_pages);
			if (reclaim_state) {
				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
				reclaim_state->reclaimed_slab = 0;
			}
		}
		total_scanned += sc->nr_scanned;
		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
			goto out;

		/*
		 * If we're having trouble reclaiming, start doing
		 * writepage even in laptop mode.
		 */
		if (sc->priority < DEF_PRIORITY - 2)
			sc->may_writepage = 1;

		/*
		 * Try to write back as many pages as we just scanned.  This
		 * tends to cause slow streaming writers to write data to the
		 * disk smoothly, at the dirtying rate, which is nice.  But
		 * that's undesirable in laptop mode, where we *want* lumpy
		 * writeout.  So in laptop mode, write out the whole world.
		 */
		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
		if (total_scanned > writeback_threshold) {
			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
						WB_REASON_TRY_TO_FREE_PAGES);
			sc->may_writepage = 1;
		}
	} while (--sc->priority >= 0 && !aborted_reclaim);

out:
	delayacct_freepages_end();

	if (sc->nr_reclaimed)
		return sc->nr_reclaimed;

	/*
	 * While hibernation is going on, kswapd is frozen so that it can't
	 * mark the zone as all_unreclaimable.  We therefore bypass the
	 * all_unreclaimable check.
	 */
	if (oom_killer_disabled)
		return 0;

	/* Aborted reclaim to try compaction?  Don't OOM, then. */
	if (aborted_reclaim)
		return 1;

	/* Top priority shrink_zones still had more to do?  Don't OOM, then. */
	if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
		return 1;

	return 0;
}

static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
{
	struct zone *zone;
	unsigned long pfmemalloc_reserve = 0;
	unsigned long free_pages = 0;
	int i;
	bool wmark_ok;

	for (i = 0; i <= ZONE_NORMAL; i++) {
		zone = &pgdat->node_zones[i];
		if (!populated_zone(zone))
			continue;

		pfmemalloc_reserve += min_wmark_pages(zone);
		free_pages += zone_page_state(zone, NR_FREE_PAGES);
	}

	/* If there are no reserves (unexpected config) then do not throttle */
	if (!pfmemalloc_reserve)
		return true;

	wmark_ok = free_pages > pfmemalloc_reserve / 2;

	/* kswapd must be awake if processes are being throttled */
	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
		pgdat->classzone_idx = min(pgdat->classzone_idx,
					   (enum zone_type)ZONE_NORMAL);
		wake_up_interruptible(&pgdat->kswapd_wait);
	}

	return wmark_ok;
}

/*
 * Throttle direct reclaimers if backing storage is backed by the network
 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 * depleted.  kswapd will continue to make progress and wake the processes
 * when the low watermark is reached.
 *
 * Returns true if a fatal signal was delivered during throttling.  If this
 * happens, the page allocator should not consider triggering the OOM killer.
 */
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
					nodemask_t *nodemask)
{
	struct zoneref *z;
	struct zone *zone;
	pg_data_t *pgdat = NULL;

	/*
	 * Kernel threads should not be throttled as they may be indirectly
	 * responsible for cleaning pages necessary for reclaim to make forward
	 * progress.  kjournald for example may enter direct reclaim while
	 * committing a transaction, where throttling it could force other
	 * processes to block on log_wait_commit().
	 */
	if (current->flags & PF_KTHREAD)
		goto out;

	/*
	 * If a fatal signal is pending, this process should not throttle.
	 * It should return quickly so it can exit and free its memory.
	 */
	if (fatal_signal_pending(current))
		goto out;

	/*
	 * Check if the pfmemalloc reserves are ok by finding the first node
	 * with a usable ZONE_NORMAL or lower zone.  The expectation is that
	 * GFP_KERNEL will be required for allocating network buffers when
	 * swapping over the network so ZONE_HIGHMEM is unusable.
	 *
	 * Throttling is based on the first usable node and throttled processes
	 * wait on a queue until kswapd makes progress and wakes them.  There
	 * is an affinity then between processes waking up and where reclaim
	 * progress has been made assuming the process wakes on the same node.
	 * More importantly, processes running on remote nodes will not compete
	 * for remote pfmemalloc reserves and processes on different nodes
	 * should make reasonable progress.
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_mask, nodemask) {
		if (zone_idx(zone) > ZONE_NORMAL)
			continue;

		/* Throttle based on the first usable node */
		pgdat = zone->zone_pgdat;
		if (pfmemalloc_watermark_ok(pgdat))
			goto out;
		break;
	}

	/* If no zone was usable by the allocation flags then do not throttle */
	if (!pgdat)
		goto out;

	/* Account for the throttling */
	count_vm_event(PGSCAN_DIRECT_THROTTLE);

	/*
	 * If the caller cannot enter the filesystem, it's possible that it
	 * is due to the caller holding an FS lock or performing a journal
	 * transaction in the case of a filesystem like ext[3|4].  In this
	 * case, it is not safe to block on pfmemalloc_wait as kswapd could be
	 * blocked waiting on the same lock.  Instead, throttle for up to a
	 * second before continuing.
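	 * For example, a GFP_NOFS allocation issued while a journal
	 * transaction is open sleeps for at most HZ jiffies (one second)
	 * below instead of waiting indefinitely for kswapd.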
	 */
	if (!(gfp_mask & __GFP_FS)) {
		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
			pfmemalloc_watermark_ok(pgdat), HZ);

		goto check_pending;
	}

	/* Throttle until kswapd wakes the process */
	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
		pfmemalloc_watermark_ok(pgdat));

check_pending:
	if (fatal_signal_pending(current))
		return true;

out:
	return false;
}

unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
				gfp_t gfp_mask, nodemask_t *nodemask)
{
	unsigned long nr_reclaimed;
	struct scan_control sc = {
		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
		.may_writepage = !laptop_mode,
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.may_unmap = 1,
		.may_swap = 1,
		.order = order,
		.priority = DEF_PRIORITY,
		.target_mem_cgroup = NULL,
		.nodemask = nodemask,
	};
	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};

	/*
	 * Do not enter reclaim if a fatal signal was delivered while
	 * throttled.  1 is returned so that the page allocator does not
	 * OOM kill at this point.
	 */
	if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
		return 1;

	trace_mm_vmscan_direct_reclaim_begin(order,
				sc.may_writepage,
				gfp_mask);

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);

	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);

	return nr_reclaimed;
}

#ifdef CONFIG_MEMCG

unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
						gfp_t gfp_mask, bool noswap,
						struct zone *zone,
						unsigned long *nr_scanned)
{
	struct scan_control sc = {
		.nr_scanned = 0,
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = !noswap,
		.order = 0,
		.priority = 0,
		.target_mem_cgroup = memcg,
	};
	struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);

	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);

	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
						      sc.may_writepage,
						      sc.gfp_mask);

	/*
	 * NOTE: Although we can get the priority field, using it
	 * here is not a good idea, since it limits the pages we can scan.
	 * If we don't reclaim here, the shrink_zone from balance_pgdat
	 * will pick up pages from other mem cgroups as well.  We hack
	 * the priority and make it zero.
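	 * With priority 0, get_scan_count() makes the whole lruvec
	 * (size >> 0) eligible within this single shrink_lruvec() call.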
	 */
	shrink_lruvec(lruvec, &sc);

	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);

	*nr_scanned = sc.nr_scanned;
	return sc.nr_reclaimed;
}

unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
					   gfp_t gfp_mask,
					   bool noswap)
{
	struct zonelist *zonelist;
	unsigned long nr_reclaimed;
	int nid;
	struct scan_control sc = {
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = !noswap,
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.order = 0,
		.priority = DEF_PRIORITY,
		.target_mem_cgroup = memcg,
		.nodemask = NULL, /* we don't care about placement */
		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
	};
	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};

	/*
	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
	 * need to care about from which node pages are reclaimed.  So the
	 * node where we start the scan does not need to be the current node.
	 */
	nid = mem_cgroup_select_victim_node(memcg);

	zonelist = NODE_DATA(nid)->node_zonelists;

	trace_mm_vmscan_memcg_reclaim_begin(0,
					    sc.may_writepage,
					    sc.gfp_mask);

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);

	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);

	return nr_reclaimed;
}
#endif

static void age_active_anon(struct zone *zone, struct scan_control *sc)
{
	struct mem_cgroup *memcg;

	if (!total_swap_pages)
		return;

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);

		if (inactive_anon_is_low(lruvec))
			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
					   sc, LRU_ACTIVE_ANON);

		memcg = mem_cgroup_iter(NULL, memcg, NULL);
	} while (memcg);
}

static bool zone_balanced(struct zone *zone, int order,
			  unsigned long balance_gap, int classzone_idx)
{
	if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
				    balance_gap, classzone_idx, 0))
		return false;

	if (IS_ENABLED(CONFIG_COMPACTION) && order &&
	    !compaction_suitable(zone, order))
		return false;

	return true;
}

/*
 * pgdat_balanced() is used when checking if a node is balanced.
 *
 * For order-0, all zones must be balanced!
 *
 * For high-order allocations only zones that meet watermarks and are in a
 * zone allowed by the callers classzone_idx are added to balanced_pages.  The
 * total of balanced pages must be at least 25% of the zones allowed by
 * classzone_idx for the node to be considered balanced.  Forcing all zones to
 * be balanced for high orders can cause excessive reclaim when there are
 * imbalanced zones.
 * The choice of 25% is due to
 *   o a 16M DMA zone that is balanced will not balance a zone on any
 *     reasonable sized machine
 *   o On all other machines, the top zone must be at least a reasonable
 *     percentage of the middle zones.  For example, on 32-bit x86, highmem
 *     would need to be at least 256M for it to balance a whole node.
 *     Similarly, on x86-64 the Normal zone would need to be at least 1G
 *     to balance a node on its own.  These seemed like reasonable ratios.
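 *
 * Illustration: on a node with 4G of Normal and 12G of highmem, a
 * high-order request is considered balanced once balanced zones cover
 * at least 4G (25% of the 16G managed), even if highmem itself is
 * still below its watermark.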
 */
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
	unsigned long managed_pages = 0;
	unsigned long balanced_pages = 0;
	int i;

	/* Check the watermark levels */
	for (i = 0; i <= classzone_idx; i++) {
		struct zone *zone = pgdat->node_zones + i;

		if (!populated_zone(zone))
			continue;

		managed_pages += zone->managed_pages;

		/*
		 * A special case here:
		 *
		 * balance_pgdat() skips over all_unreclaimable after
		 * DEF_PRIORITY.  Effectively, it considers them balanced so
		 * they must be considered balanced here as well!
		 */
		if (!zone_reclaimable(zone)) {
			balanced_pages += zone->managed_pages;
			continue;
		}

		if (zone_balanced(zone, order, 0, i))
			balanced_pages += zone->managed_pages;
		else if (!order)
			return false;
	}

	if (order)
		return balanced_pages >= (managed_pages >> 2);
	else
		return true;
}

/*
 * Prepare kswapd for sleeping.  This verifies that there are no processes
 * waiting in throttle_direct_reclaim() and that watermarks have been met.
 *
 * Returns true if kswapd is ready to sleep
 */
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
					int classzone_idx)
{
	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
	if (remaining)
		return false;

	/*
	 * There is a potential race between when kswapd checks its watermarks
	 * and a process gets throttled.  There is also a potential race if
	 * processes get throttled, kswapd wakes, and a large process exits,
	 * thereby balancing the zones, which causes kswapd to miss a wakeup.
	 * If kswapd is going to sleep, no process should be sleeping on
	 * pfmemalloc_wait, so wake them now if necessary.  If necessary,
	 * processes will wake kswapd and get throttled again.
	 */
	if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
		wake_up(&pgdat->pfmemalloc_wait);
		return false;
	}

	return pgdat_balanced(pgdat, order, classzone_idx);
}

/*
 * kswapd shrinks the zone by the number of pages required to reach
 * the high watermark.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim or if the lack of progress was due to pages under writeback.
 * This is used to determine if the scanning priority needs to be raised.
 */
static bool kswapd_shrink_zone(struct zone *zone,
			       int classzone_idx,
			       struct scan_control *sc,
			       unsigned long lru_pages,
			       unsigned long *nr_attempted)
{
	int testorder = sc->order;
	unsigned long balance_gap;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	struct shrink_control shrink = {
		.gfp_mask = sc->gfp_mask,
	};
	bool lowmem_pressure;

	/* Reclaim above the high watermark. */
	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));

	/*
	 * Kswapd reclaims only single pages with compaction enabled.  Trying
	 * too hard to reclaim until contiguous free pages have become
	 * available can hurt performance by evicting too much useful data
	 * from memory.  Do not reclaim more than needed for compaction.
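	 * For example, for an order-9 request against a zone where
	 * compaction could run, the zone_balanced() checks below are
	 * done with testorder == 0, leaving the contiguity work to
	 * compaction.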
	 */
	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
			compaction_suitable(zone, sc->order) !=
						COMPACT_SKIPPED)
		testorder = 0;

	/*
	 * We put equal pressure on every zone, unless one zone has way too
	 * many pages free already.  The "too many pages" is defined as the
	 * high wmark plus a "gap" where the gap is either the low
	 * watermark or 1% of the zone, whichever is smaller.
	 */
	balance_gap = min(low_wmark_pages(zone),
		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
		KSWAPD_ZONE_BALANCE_GAP_RATIO);

	/*
	 * If there is no low memory pressure or the zone is balanced then no
	 * reclaim is necessary
	 */
	lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
	if (!lowmem_pressure && zone_balanced(zone, testorder,
						balance_gap, classzone_idx))
		return true;

	shrink_zone(zone, sc);
	nodes_clear(shrink.nodes_to_scan);
	node_set(zone_to_nid(zone), shrink.nodes_to_scan);

	reclaim_state->reclaimed_slab = 0;
	shrink_slab(&shrink, sc->nr_scanned, lru_pages);
	sc->nr_reclaimed += reclaim_state->reclaimed_slab;

	/* Account for the number of pages attempted to reclaim */
	*nr_attempted += sc->nr_to_reclaim;

	zone_clear_flag(zone, ZONE_WRITEBACK);

	/*
	 * If a zone reaches its high watermark, consider it to be no longer
	 * congested.  It's possible there are dirty pages backed by congested
	 * BDIs but as pressure is relieved, speculatively avoid congestion
	 * waits.
	 */
	if (zone_reclaimable(zone) &&
	    zone_balanced(zone, testorder, 0, classzone_idx)) {
		zone_clear_flag(zone, ZONE_CONGESTED);
		zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
	}

	return sc->nr_scanned >= sc->nr_to_reclaim;
}

/*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at high_wmark_pages(zone).
 *
 * Returns the final order kswapd was reclaiming at
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
 * What we do is to detect the case where all pages in the zone have been
 * scanned twice and there has been zero successful reclaim.  Mark the zone as
 * dead and from now on, only perform a short scan.  Basically we're polling
 * the zone for when the problem goes away.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and
 * the lower zones regardless of the number of free pages in the lower zones.
 * This interoperates with the page allocator fallback scheme to ensure that
 * aging of pages is balanced across the zones.
 */
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
							int *classzone_idx)
{
	int i;
	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.priority = DEF_PRIORITY,
		.may_unmap = 1,
		.may_swap = 1,
		.may_writepage = !laptop_mode,
		.order = order,
		.target_mem_cgroup = NULL,
	};
	count_vm_event(PAGEOUTRUN);

	do {
		unsigned long lru_pages = 0;
		unsigned long nr_attempted = 0;
		bool raise_priority = true;
		bool pgdat_needs_compaction = (order > 0);

		sc.nr_reclaimed = 0;

		/*
		 * Scan in the highmem->dma direction for the highest
		 * zone which needs scanning
		 */
		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			if (sc.priority != DEF_PRIORITY &&
			    !zone_reclaimable(zone))
				continue;

			/*
			 * Do some background aging of the anon list, to give
			 * pages a chance to be referenced before reclaiming.
			 */
			age_active_anon(zone, &sc);

			/*
			 * If the number of buffer_heads in the machine
			 * exceeds the maximum allowed level and this node
			 * has a highmem zone, force kswapd to reclaim from
			 * it to relieve lowmem pressure.
			 */
			if (buffer_heads_over_limit && is_highmem_idx(i)) {
				end_zone = i;
				break;
			}

			if (!zone_balanced(zone, order, 0, 0)) {
				end_zone = i;
				break;
			} else {
				/*
				 * If balanced, clear the dirty and congested
				 * flags
				 */
				zone_clear_flag(zone, ZONE_CONGESTED);
				zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
			}
		}

		if (i < 0)
			goto out;

		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			lru_pages += zone_reclaimable_pages(zone);

			/*
			 * If any zone is currently balanced then kswapd will
			 * not call compaction as it is expected that the
			 * necessary pages are already available.
			 */
			if (pgdat_needs_compaction &&
					zone_watermark_ok(zone, order,
						low_wmark_pages(zone),
						*classzone_idx, 0))
				pgdat_needs_compaction = false;
		}

		/*
		 * If we're having trouble reclaiming, start doing writepage
		 * even in laptop mode.
		 */
		if (sc.priority < DEF_PRIORITY - 2)
			sc.may_writepage = 1;

		/*
		 * Now scan the zone in the dma->highmem direction, stopping
		 * at the last zone which needs scanning.
		 *
		 * We do this because the page allocator works in the opposite
		 * direction.  This prevents the page allocator from allocating
		 * pages behind kswapd's direction of progress, which would
		 * cause too much scanning of the lower zones.
		 */
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			if (sc.priority != DEF_PRIORITY &&
			    !zone_reclaimable(zone))
				continue;

			sc.nr_scanned = 0;

			nr_soft_scanned = 0;
			/*
			 * Call soft limit reclaim before calling shrink_zone.
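			 * Soft limit reclaim preferentially takes pages
			 * from memcgs running over their soft limit, so
			 * groups within their limit see pressure later.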
			 */
			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
							order, sc.gfp_mask,
							&nr_soft_scanned);
			sc.nr_reclaimed += nr_soft_reclaimed;

			/*
			 * There should be no need to raise the scanning
			 * priority if enough pages are already being scanned
			 * that the high watermark would be met at 100%
			 * efficiency.
			 */
			if (kswapd_shrink_zone(zone, end_zone, &sc,
					lru_pages, &nr_attempted))
				raise_priority = false;
		}

		/*
		 * If the low watermark is met there is no need for processes
		 * to be throttled on pfmemalloc_wait as they should now be
		 * able to safely make forward progress.  Wake them.
		 */
		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
				pfmemalloc_watermark_ok(pgdat))
			wake_up(&pgdat->pfmemalloc_wait);

		/*
		 * Fragmentation may mean that the system cannot be rebalanced
		 * for high-order allocations in all zones.  If twice the
		 * allocation size has been reclaimed and the zones are still
		 * not balanced then recheck the watermarks at order-0 to
		 * prevent kswapd reclaiming excessively.  Assume that a
		 * process requesting a high-order allocation can direct
		 * reclaim/compact.
		 */
		if (order && sc.nr_reclaimed >= 2UL << order)
			order = sc.order = 0;

		/* Check if kswapd should be suspending */
		if (try_to_freeze() || kthread_should_stop())
			break;

		/*
		 * Compact if necessary and kswapd is reclaiming at least the
		 * high watermark number of pages as requested
		 */
		if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
			compact_pgdat(pgdat, order);

		/*
		 * Raise priority if scanning rate is too low or there was no
		 * progress in reclaiming pages
		 */
		if (raise_priority || !sc.nr_reclaimed)
			sc.priority--;
	} while (sc.priority >= 1 &&
		 !pgdat_balanced(pgdat, order, *classzone_idx));

out:
	/*
	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
	 * makes a decision on the order we were last reclaiming at.  However,
	 * if another caller entered the allocator slow path while kswapd
	 * was awake, order will remain at the higher level.
	 */
	*classzone_idx = end_zone;
	return order;
}

static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
	long remaining = 0;
	DEFINE_WAIT(wait);

	if (freezing(current) || kthread_should_stop())
		return;

	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);

	/* Try to sleep for a short interval */
	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
		remaining = schedule_timeout(HZ/10);
		finish_wait(&pgdat->kswapd_wait, &wait);
		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
	}

	/*
	 * After a short sleep, check if it was a premature sleep.  If not,
	 * then go fully to sleep until explicitly woken up.
	 */
	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);

		/*
		 * vmstat counters are not perfectly accurate and the estimated
		 * value for counters such as NR_FREE_PAGES can deviate from
		 * the true value by nr_online_cpus * threshold.
		 * To avoid the zone watermarks being breached while under
		 * pressure, we reduce the per-cpu vmstat threshold while
		 * kswapd is awake and restore them before going back to sleep.
		 */
		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);

		/*
		 * Compaction records which pageblocks it recently failed to
		 * isolate pages from and skips them in future scans.  When
		 * kswapd is going to sleep, it is reasonable to assume that
		 * enough may have changed for compaction to succeed, so
		 * reset the cache.
		 */
		reset_isolation_suitable(pgdat);

		if (!kthread_should_stop())
			schedule();

		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
	} else {
		if (remaining)
			count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
		else
			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
	}
	finish_wait(&pgdat->kswapd_wait, &wait);
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up.  This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
static int kswapd(void *p)
{
	unsigned long order, new_order;
	unsigned balanced_order;
	int classzone_idx, new_classzone_idx;
	int balanced_classzone_idx;
	pg_data_t *pgdat = (pg_data_t *)p;
	struct task_struct *tsk = current;

	struct reclaim_state reclaim_state = {
		.reclaimed_slab = 0,
	};
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

	lockdep_set_current_reclaim_state(GFP_KERNEL);

	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(tsk, cpumask);
	current->reclaim_state = &reclaim_state;

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()").  "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
	set_freezable();

	order = new_order = 0;
	balanced_order = 0;
	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
	balanced_classzone_idx = classzone_idx;
	for ( ; ; ) {
		bool ret;

		/*
		 * If the last balance_pgdat was unsuccessful it's unlikely a
		 * new request of a similar or harder type will succeed soon,
		 * so consider going to sleep on the basis of the order we
		 * reclaimed at.
		 */
		if (balanced_classzone_idx >= new_classzone_idx &&
					balanced_order == new_order) {
			new_order = pgdat->kswapd_max_order;
			new_classzone_idx = pgdat->classzone_idx;
			pgdat->kswapd_max_order = 0;
			pgdat->classzone_idx = pgdat->nr_zones - 1;
		}

		if (order < new_order || classzone_idx > new_classzone_idx) {
			/*
			 * Don't sleep if someone wants a larger 'order'
			 * allocation or has tighter zone constraints
			 */
			order = new_order;
			classzone_idx = new_classzone_idx;
		} else {
			kswapd_try_to_sleep(pgdat, balanced_order,
						balanced_classzone_idx);
			order = pgdat->kswapd_max_order;
			classzone_idx = pgdat->classzone_idx;
			new_order = order;
			new_classzone_idx = classzone_idx;
			pgdat->kswapd_max_order = 0;
			pgdat->classzone_idx = pgdat->nr_zones - 1;
		}

		ret = try_to_freeze();
		if (kthread_should_stop())
			break;

		/*
		 * We can speed up thawing tasks if we don't call balance_pgdat
		 * after returning from the refrigerator
		 */
		if (!ret) {
			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
			balanced_classzone_idx = classzone_idx;
			balanced_order = balance_pgdat(pgdat, order,
						&balanced_classzone_idx);
		}
	}

	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
	current->reclaim_state = NULL;
	lockdep_clear_current_reclaim_state();

	return 0;
}

/*
 * A zone is low on free memory, so wake its kswapd task to service it.
 */
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
	pg_data_t *pgdat;

	if (!populated_zone(zone))
		return;

	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
		return;
	pgdat = zone->zone_pgdat;
	if (pgdat->kswapd_max_order < order) {
		pgdat->kswapd_max_order = order;
		pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
	}
	if (!waitqueue_active(&pgdat->kswapd_wait))
		return;
	if (zone_balanced(zone, order, 0, 0))
		return;

	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
	wake_up_interruptible(&pgdat->kswapd_wait);
}

#ifdef CONFIG_HIBERNATION
/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
 * freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
	struct reclaim_state reclaim_state;
	struct scan_control sc = {
		.gfp_mask = GFP_HIGHUSER_MOVABLE,
		.may_swap = 1,
		.may_unmap = 1,
		.may_writepage = 1,
		.nr_to_reclaim = nr_to_reclaim,
		.hibernation_mode = 1,
		.order = 0,
		.priority = DEF_PRIORITY,
	};
	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};
	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
	struct task_struct *p = current;
	unsigned long nr_reclaimed;

	p->flags |= PF_MEMALLOC;
	lockdep_set_current_reclaim_state(sc.gfp_mask);
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);

	p->reclaim_state = NULL;
	lockdep_clear_current_reclaim_state();
	p->flags &= ~PF_MEMALLOC;

	return nr_reclaimed;
}
#endif /* CONFIG_HIBERNATION */

/*
 * It's optimal to keep kswapds on the same CPUs as their memory, but
 * not required for correctness.  So if the last cpu in a node goes
 * away, we get changed to run anywhere: as the first one comes back,
 * restore their cpu bindings.
 */
static int cpu_callback(struct notifier_block *nfb, unsigned long action,
			void *hcpu)
{
	int nid;

	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
		for_each_node_state(nid, N_MEMORY) {
			pg_data_t *pgdat = NODE_DATA(nid);
			const struct cpumask *mask;

			mask = cpumask_of_node(pgdat->node_id);

			if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
				/* One of our CPUs online: restore mask */
				set_cpus_allowed_ptr(pgdat->kswapd, mask);
		}
	}
	return NOTIFY_OK;
}

/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to the proper cpus if cpus are
 * hot-added.
 */
int kswapd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int ret = 0;

	if (pgdat->kswapd)
		return 0;

	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
	if (IS_ERR(pgdat->kswapd)) {
		/* failure at boot is fatal */
		BUG_ON(system_state == SYSTEM_BOOTING);
		pr_err("Failed to start kswapd on node %d\n", nid);
		ret = PTR_ERR(pgdat->kswapd);
		pgdat->kswapd = NULL;
	}
	return ret;
}

/*
 * Called by memory hotplug when all memory in a node is offlined.  Caller must
 * hold lock_memory_hotplug().
 */
void kswapd_stop(int nid)
{
	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;

	if (kswapd) {
		kthread_stop(kswapd);
		NODE_DATA(nid)->kswapd = NULL;
	}
}

static int __init kswapd_init(void)
{
	int nid;

	swap_setup();
	for_each_node_state(nid, N_MEMORY)
		kswapd_run(nid);
	hotcpu_notifier(cpu_callback, 0);
	return 0;
}

module_init(kswapd_init)

#ifdef CONFIG_NUMA
/*
 * Zone reclaim mode
 *
 * If non-zero, call zone_reclaim() when the number of free pages falls below
 * the watermarks.
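 *
 * The mode is a bitmask set via the vm.zone_reclaim_mode sysctl and
 * built from the RECLAIM_* flags below, e.g. 1 allows plain zone
 * reclaim while 1|2|4 = 7 additionally permits writeback and swap.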

#ifdef CONFIG_NUMA
/*
 * Zone reclaim mode
 *
 * If non-zero, call zone_reclaim() when the number of free pages falls
 * below the watermarks.
 */
int zone_reclaim_mode __read_mostly;

#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0)	/* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
#define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */

/*
 * Priority for ZONE_RECLAIM. This determines the fraction of pages
 * of a node considered for each zone_reclaim. A priority of 4 scans
 * 1/16th of a zone per pass.
 */
#define ZONE_RECLAIM_PRIORITY 4

/*
 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
 * occur.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * If the number of slab pages in a zone grows beyond this percentage then
 * slab reclaim needs to occur.
 */
int sysctl_min_slab_ratio = 5;

static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
{
	unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
	unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
		zone_page_state(zone, NR_ACTIVE_FILE);

	/*
	 * It's possible for there to be more file mapped pages than
	 * accounted for by the pages on the file LRU lists because
	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED.
	 */
	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
}

/* Work out how many page cache pages we can reclaim in this reclaim_mode */
static long zone_pagecache_reclaimable(struct zone *zone)
{
	long nr_pagecache_reclaimable;
	long delta = 0;

	/*
	 * If RECLAIM_SWAP is set, then all file pages are considered
	 * potentially reclaimable. Otherwise, we have to worry about
	 * pages like swapcache and zone_unmapped_file_pages() provides
	 * a better estimate.
	 */
	if (zone_reclaim_mode & RECLAIM_SWAP)
		nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
	else
		nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);

	/* If we can't clean pages, remove dirty pages from consideration */
	if (!(zone_reclaim_mode & RECLAIM_WRITE))
		delta += zone_page_state(zone, NR_FILE_DIRTY);

	/* Watch for any possible underflows due to delta */
	if (unlikely(delta > nr_pagecache_reclaimable))
		delta = nr_pagecache_reclaimable;

	return nr_pagecache_reclaimable - delta;
}
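
/*
 * Illustrative use of the knobs above (example values only; see
 * Documentation/sysctl/vm.txt for the authoritative description):
 *
 *	# reclaim clean unmapped page cache on the local node only
 *	echo 1 > /proc/sys/vm/zone_reclaim_mode		(RECLAIM_ZONE)
 *
 *	# also allow writing out dirty pages and swapping during reclaim
 *	echo 7 > /proc/sys/vm/zone_reclaim_mode		(ZONE|WRITE|SWAP)
 *
 * With ZONE_RECLAIM_PRIORITY == 4, each reclaim pass starts by scanning
 * zone_size >> 4, i.e. 1/16th of the zone's pages.
 */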

/*
 * Try to free up some pages from this zone through reclaim.
 */
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
	/* Minimum pages needed in order to stay on node */
	const unsigned long nr_pages = 1 << order;
	struct task_struct *p = current;
	struct reclaim_state reclaim_state;
	struct scan_control sc = {
		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
		.may_swap = 1,
		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
		.order = order,
		.priority = ZONE_RECLAIM_PRIORITY,
	};
	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};
	unsigned long nr_slab_pages0, nr_slab_pages1;

	cond_resched();
	/*
	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
	 * and we also need to be able to write out pages for RECLAIM_WRITE
	 * and RECLAIM_SWAP.
	 */
	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
	lockdep_set_current_reclaim_state(gfp_mask);
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
		/*
		 * Free memory by calling shrink_zone() with increasing
		 * priorities until we have enough memory freed.
		 */
		do {
			shrink_zone(zone, &sc);
		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
	}

	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
	if (nr_slab_pages0 > zone->min_slab_pages) {
		/*
		 * shrink_slab() does not currently allow us to determine how
		 * many pages were freed in this zone. So we take the current
		 * number of slab pages and shake the slab until it is reduced
		 * by the same nr_pages that we used for reclaiming unmapped
		 * pages.
		 */
		nodes_clear(shrink.nodes_to_scan);
		node_set(zone_to_nid(zone), shrink.nodes_to_scan);
		for (;;) {
			unsigned long lru_pages = zone_reclaimable_pages(zone);

			/* No reclaimable slab or very low memory pressure */
			if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
				break;

			/* Freed enough memory */
			nr_slab_pages1 = zone_page_state(zone,
							NR_SLAB_RECLAIMABLE);
			if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
				break;
		}

		/*
		 * Update nr_reclaimed by the number of slab pages we
		 * reclaimed from this zone.
		 */
		nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
		if (nr_slab_pages1 < nr_slab_pages0)
			sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
	}

	p->reclaim_state = NULL;
	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
	lockdep_clear_current_reclaim_state();
	return sc.nr_reclaimed >= nr_pages;
}
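
/*
 * Worked example for __zone_reclaim() above (illustrative numbers):
 * for an order-2 allocation, nr_pages = 1 << 2 = 4, so the LRU pass
 * aims at sc.nr_to_reclaim = max(4, SWAP_CLUSTER_MAX) pages, while the
 * slab loop stops as soon as NR_SLAB_RECLAIMABLE has dropped by the
 * smaller nr_pages target (or shrink_slab() reports no progress).
 * The function then reports success iff sc.nr_reclaimed >= 4.
 */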

int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
	int node_id;
	int ret;

	/*
	 * Zone reclaim reclaims unmapped file backed pages and
	 * slab pages if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O otherwise pages read by file I/O will be immediately
	 * thrown out if the zone is overallocated. So we do not reclaim
	 * if less than a specified percentage of the zone is used by
	 * unmapped file backed pages.
	 */
	if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
		return ZONE_RECLAIM_FULL;

	if (!zone_reclaimable(zone))
		return ZONE_RECLAIM_FULL;

	/*
	 * Do not scan if the allocation should not be delayed.
	 */
	if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
		return ZONE_RECLAIM_NOSCAN;

	/*
	 * Only run zone reclaim on the local zone or on zones that do not
	 * have associated processors. This will favor the local processor
	 * over remote processors and spread off node memory allocations
	 * as wide as possible.
	 */
	node_id = zone_to_nid(zone);
	if (node_state(node_id, N_CPU) && node_id != numa_node_id())
		return ZONE_RECLAIM_NOSCAN;

	if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
		return ZONE_RECLAIM_NOSCAN;

	ret = __zone_reclaim(zone, gfp_mask, order);
	zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);

	if (!ret)
		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);

	return ret;
}
#endif

/*
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 *
 * Test whether page is evictable--i.e., should be placed on active/inactive
 * lists vs unevictable list.
 *
 * Reasons page might not be evictable:
 * (1) page's mapping marked unevictable
 * (2) page is part of an mlocked VMA
 *
 */
int page_evictable(struct page *page)
{
	return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
}

#ifdef CONFIG_SHMEM
/**
 * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
 * @pages:	array of pages to check
 * @nr_pages:	number of pages to check
 *
 * Checks pages for evictability and moves them to the appropriate lru list.
 *
 * This function is only used for SysV IPC SHM_UNLOCK.
 */
void check_move_unevictable_pages(struct page **pages, int nr_pages)
{
	struct lruvec *lruvec;
	struct zone *zone = NULL;
	int pgscanned = 0;
	int pgrescued = 0;
	int i;

	for (i = 0; i < nr_pages; i++) {
		struct page *page = pages[i];
		struct zone *pagezone;

		pgscanned++;
		pagezone = page_zone(page);
		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		lruvec = mem_cgroup_page_lruvec(page, zone);

		if (!PageLRU(page) || !PageUnevictable(page))
			continue;

		if (page_evictable(page)) {
			enum lru_list lru = page_lru_base_type(page);

			VM_BUG_ON(PageActive(page));
			ClearPageUnevictable(page);
			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
			add_page_to_lru_list(page, lruvec, lru);
			pgrescued++;
		}
	}

	if (zone) {
		__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
		__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
		spin_unlock_irq(&zone->lru_lock);
	}
}
#endif /* CONFIG_SHMEM */

static void warn_scan_unevictable_pages(void)
{
	printk_once(KERN_WARNING
		    "%s: The scan_unevictable_pages sysctl/node-interface has been "
		    "disabled for lack of a legitimate use case. If you have "
		    "one, please send an email to linux-mm@kvack.org.\n",
		    current->comm);
}
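
/*
 * Illustrative caller of check_move_unevictable_pages() above (a sketch;
 * the real user is the SHM_UNLOCK path via shmem_unlock_mapping() in
 * mm/shmem.c, which walks the mapping one pagevec at a time):
 *
 *	struct pagevec pvec;
 *	...
 *	// after clearing the mapping's unevictable flag, for each batch:
 *	check_move_unevictable_pages(pvec.pages, pagevec_count(&pvec));
 */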

/*
 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
 * all nodes' unevictable lists for evictable pages
 */
unsigned long scan_unevictable_pages;

int scan_unevictable_handler(struct ctl_table *table, int write,
			     void __user *buffer,
			     size_t *length, loff_t *ppos)
{
	warn_scan_unevictable_pages();
	proc_doulongvec_minmax(table, write, buffer, length, ppos);
	scan_unevictable_pages = 0;
	return 0;
}

#ifdef CONFIG_NUMA
/*
 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
 * a specified node's per zone unevictable lists for evictable pages.
 */

static ssize_t read_scan_unevictable_node(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	warn_scan_unevictable_pages();
	return sprintf(buf, "0\n");	/* always zero; should fit... */
}

static ssize_t write_scan_unevictable_node(struct device *dev,
					   struct device_attribute *attr,
					   const char *buf, size_t count)
{
	warn_scan_unevictable_pages();
	return 1;
}

static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
		   read_scan_unevictable_node,
		   write_scan_unevictable_node);

int scan_unevictable_register_node(struct node *node)
{
	return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
}

void scan_unevictable_unregister_node(struct node *node)
{
	device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
}
#endif
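
/*
 * How the deprecated knobs above appear to userspace (illustrative; both
 * interfaces are effectively no-ops that only log the warning once):
 *
 *	echo 1 > /proc/sys/vm/scan_unevictable_pages
 *	echo 1 > /sys/devices/system/node/node0/scan_unevictable_pages
 *
 * Reads of either file always return 0, and writes have no effect beyond
 * triggering warn_scan_unevictable_pages().
 */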