net/core/page_pool.c

   1 /* SPDX-License-Identifier: GPL-2.0
   2  *
   3  * page_pool.c
   4  *      Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
   5  *      Copyright (C) 2016 Red Hat, Inc.
   6  */
   7
   8 #include <linux/types.h>
   9 #include <linux/kernel.h>
  10 #include <linux/slab.h>
  11 #include <linux/device.h>
  12
  13 #include <net/page_pool.h>
  14 #include <net/xdp.h>
  15
  16 #include <linux/dma-direction.h>
  17 #include <linux/dma-mapping.h>
  18 #include <linux/page-flags.h>
  19 #include <linux/mm.h> /* for __put_page() */
  20 #include <linux/poison.h>
  21
  22 #include <trace/events/page_pool.h>
  23
  24 #define DEFER_TIME (msecs_to_jiffies(1000))
  25 #define DEFER_WARN_INTERVAL (60 * HZ)
  26
  27 #define BIAS_MAX        LONG_MAX
  28
  29 static bool page_pool_producer_lock(struct page_pool *pool)
  30         __acquires(&pool->ring.producer_lock)
  31 {
  32         bool in_softirq = in_softirq();
  33
  34         if (in_softirq)
  35                 spin_lock(&pool->ring.producer_lock);
  36         else
  37                 spin_lock_bh(&pool->ring.producer_lock);
  38
  39         return in_softirq;
  40 }
  41
  42 static void page_pool_producer_unlock(struct page_pool *pool,
  43                                       bool in_softirq)
  44         __releases(&pool->ring.producer_lock)
  45 {
  46         if (in_softirq)
  47                 spin_unlock(&pool->ring.producer_lock);
  48         else
  49                 spin_unlock_bh(&pool->ring.producer_lock);
  50 }
  51
  52 static int page_pool_init(struct page_pool *pool,
  53                           const struct page_pool_params *params)
  54 {
  55         unsigned int ring_qsize = 1024; /* Default */
  56
  57         memcpy(&pool->p, params, sizeof(pool->p));
  58
  59         /* Validate only known flags were used */
  60         if (pool->p.flags & ~(PP_FLAG_ALL))
  61                 return -EINVAL;
  62
  63         if (pool->p.pool_size)
  64                 ring_qsize = pool->p.pool_size;
  65
  66         /* Sanity limit mem that can be pinned down */
  67         if (ring_qsize > 32768)
  68                 return -E2BIG;
  69
  70         /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
  71          * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
  72          * which is the XDP_TX use-case.
  73          */
  74         if (pool->p.flags & PP_FLAG_DMA_MAP) {
  75                 if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
  76                     (pool->p.dma_dir != DMA_BIDIRECTIONAL))
  77                         return -EINVAL;
  78         }
  79
  80         if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
  81                 /* In order to request DMA-sync-for-device the page
  82                  * needs to be mapped
  83                  */
  84                 if (!(pool->p.flags & PP_FLAG_DMA_MAP))
  85                         return -EINVAL;
  86
  87                 if (!pool->p.max_len)
  88                         return -EINVAL;
  89
  90                 /* pool->p.offset has to be set according to the address
  91                  * offset used by the DMA engine to start copying rx data
  92                  */
  93         }
  94
  95         if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
  96             pool->p.flags & PP_FLAG_PAGE_FRAG)
  97                 return -EINVAL;
  98
  99         if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 100                 return -ENOMEM;
 101
 102         atomic_set(&pool->pages_state_release_cnt, 0);
 103
 104         /* Driver calling page_pool_create() also call page_pool_destroy() */
 105         refcount_set(&pool->user_cnt, 1);
 106
 107         if (pool->p.flags & PP_FLAG_DMA_MAP)
 108                 get_device(pool->p.dev);
 109
 110         return 0;
 111 }
 112
 113 struct page_pool *page_pool_create(const struct page_pool_params *params)
 114 {
 115         struct page_pool *pool;
 116         int err;
 117
 118         pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
 119         if (!pool)
 120                 return ERR_PTR(-ENOMEM);
 121
 122         err = page_pool_init(pool, params);
 123         if (err < 0) {
 124                 pr_warn("%s() gave up with errno %d\n", __func__, err);
 125                 kfree(pool);
 126                 return ERR_PTR(err);
 127         }
 128
 129         return pool;
 130 }
 131 EXPORT_SYMBOL(page_pool_create);
 132
 133 static void page_pool_return_page(struct page_pool *pool, struct page *page);
 134
 135 noinline
 136 static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
 137 {
 138         struct ptr_ring *r = &pool->ring;
 139         struct page *page;
 140         int pref_nid; /* preferred NUMA node */
 141
 142         /* Quicker fallback, avoid locks when ring is empty */
 143         if (__ptr_ring_empty(r))
 144                 return NULL;
 145
 146         /* Softirq guarantee CPU and thus NUMA node is stable. This,
 147          * assumes CPU refilling driver RX-ring will also run RX-NAPI.
 148          */
 149 #ifdef CONFIG_NUMA
 150         pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
 151 #else
 152         /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
 153         pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
 154 #endif
 155
 156         /* Slower-path: Get pages from locked ring queue */
 157         spin_lock(&r->consumer_lock);
 158
 159         /* Refill alloc array, but only if NUMA match */
 160         do {
 161                 page = __ptr_ring_consume(r);
 162                 if (unlikely(!page))
 163                         break;
 164
 165                 if (likely(page_to_nid(page) == pref_nid)) {
 166                         pool->alloc.cache[pool->alloc.count++] = page;
 167                 } else {
 168                         /* NUMA mismatch;
 169                          * (1) release 1 page to page-allocator and
 170                          * (2) break out to fallthrough to alloc_pages_node.
 171                          * This limit stress on page buddy alloactor.
 172                          */
 173                         page_pool_return_page(pool, page);
 174                         page = NULL;
 175                         break;
 176                 }
 177         } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
 178
 179         /* Return last page */
 180         if (likely(pool->alloc.count > 0))
 181                 page = pool->alloc.cache[--pool->alloc.count];
 182
 183         spin_unlock(&r->consumer_lock);
 184         return page;
 185 }
 186
 187 /* fast path */
 188 static struct page *__page_pool_get_cached(struct page_pool *pool)
 189 {
 190         struct page *page;
 191
 192         /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
 193         if (likely(pool->alloc.count)) {
 194                 /* Fast-path */
 195                 page = pool->alloc.cache[--pool->alloc.count];
 196         } else {
 197                 page = page_pool_refill_alloc_cache(pool);
 198         }
 199
 200         return page;
 201 }
 202
 203 static void page_pool_dma_sync_for_device(struct page_pool *pool,
 204                                           struct page *page,
 205                                           unsigned int dma_sync_size)
 206 {
 207         dma_addr_t dma_addr = page_pool_get_dma_addr(page);
 208
 209         dma_sync_size = min(dma_sync_size, pool->p.max_len);
 210         dma_sync_single_range_for_device(pool->p.dev, dma_addr,
 211                                          pool->p.offset, dma_sync_size,
 212                                          pool->p.dma_dir);
 213 }
 214
 215 static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
 216 {
 217         dma_addr_t dma;
 218
 219         /* Setup DMA mapping: use 'struct page' area for storing DMA-addr
 220          * since dma_addr_t can be either 32 or 64 bits and does not always fit
 221          * into page private data (i.e 32bit cpu with 64bit DMA caps)
 222          * This mapping is kept for lifetime of page, until leaving pool.
 223          */
 224         dma = dma_map_page_attrs(pool->p.dev, page, 0,
 225                                  (PAGE_SIZE << pool->p.order),
 226                                  pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
 227         if (dma_mapping_error(pool->p.dev, dma))
 228                 return false;
 229
 230         page_pool_set_dma_addr(page, dma);
 231
 232         if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
 233                 page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
 234
 235         return true;
 236 }
 237
 238 static void page_pool_set_pp_info(struct page_pool *pool,
 239                                   struct page *page)
 240 {
 241         page->pp = pool;
 242         page->pp_magic |= PP_SIGNATURE;
 243 }
 244
 245 static void page_pool_clear_pp_info(struct page *page)
 246 {
 247         page->pp_magic = 0;
 248         page->pp = NULL;
 249 }
 250
 251 static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 252                                                  gfp_t gfp)
 253 {
 254         struct page *page;
 255
 256         gfp |= __GFP_COMP;
 257         page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
 258         if (unlikely(!page))
 259                 return NULL;
 260
 261         if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
 262             unlikely(!page_pool_dma_map(pool, page))) {
 263                 put_page(page);
 264                 return NULL;
 265         }
 266
 267         page_pool_set_pp_info(pool, page);
 268
 269         /* Track how many pages are held 'in-flight' */
 270         pool->pages_state_hold_cnt++;
 271         trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
 272         return page;
 273 }
 274
 275 /* slow path */
 276 noinline
 277 static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 278                                                  gfp_t gfp)
 279 {
 280         const int bulk = PP_ALLOC_CACHE_REFILL;
 281         unsigned int pp_flags = pool->p.flags;
 282         unsigned int pp_order = pool->p.order;
 283         struct page *page;
 284         int i, nr_pages;
 285
 286         /* Don't support bulk alloc for high-order pages */
 287         if (unlikely(pp_order))
 288                 return __page_pool_alloc_page_order(pool, gfp);
 289
 290         /* Unnecessary as alloc cache is empty, but guarantees zero count */
 291         if (unlikely(pool->alloc.count > 0))
 292                 return pool->alloc.cache[--pool->alloc.count];
 293
 294         /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
 295         memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
 296
 297         nr_pages = alloc_pages_bulk_array(gfp, bulk, pool->alloc.cache);
 298         if (unlikely(!nr_pages))
 299                 return NULL;
 300
 301         /* Pages have been filled into alloc.cache array, but count is zero and
 302          * page element have not been (possibly) DMA mapped.
 303          */
 304         for (i = 0; i < nr_pages; i++) {
 305                 page = pool->alloc.cache[i];
 306                 if ((pp_flags & PP_FLAG_DMA_MAP) &&
 307                     unlikely(!page_pool_dma_map(pool, page))) {
 308                         put_page(page);
 309                         continue;
 310                 }
 311
 312                 page_pool_set_pp_info(pool, page);
 313                 pool->alloc.cache[pool->alloc.count++] = page;
 314                 /* Track how many pages are held 'in-flight' */
 315                 pool->pages_state_hold_cnt++;
 316                 trace_page_pool_state_hold(pool, page,
 317                                            pool->pages_state_hold_cnt);
 318         }
 319
 320         /* Return last page */
 321         if (likely(pool->alloc.count > 0))
 322                 page = pool->alloc.cache[--pool->alloc.count];
 323         else
 324                 page = NULL;
 325
 326         /* When page just alloc'ed is should/must have refcnt 1. */
 327         return page;
 328 }
 329
 330 /* For using page_pool replace: alloc_pages() API calls, but provide
 331  * synchronization guarantee for allocation side.
 332  */
 333 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
 334 {
 335         struct page *page;
 336
 337         /* Fast-path: Get a page from cache */
 338         page = __page_pool_get_cached(pool);
 339         if (page)
 340                 return page;
 341
 342         /* Slow-path: cache empty, do real allocation */
 343         page = __page_pool_alloc_pages_slow(pool, gfp);
 344         return page;
 345 }
 346 EXPORT_SYMBOL(page_pool_alloc_pages);
 347
 348 /* Calculate distance between two u32 values, valid if distance is below 2^(31)
 349  *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 350  */
 351 #define _distance(a, b) (s32)((a) - (b))
 352
 353 static s32 page_pool_inflight(struct page_pool *pool)
 354 {
 355         u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
 356         u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
 357         s32 inflight;
 358
 359         inflight = _distance(hold_cnt, release_cnt);
 360
 361         trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
 362         WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
 363
 364         return inflight;
 365 }
 366
 367 /* Disconnects a page (from a page_pool).  API users can have a need
 368  * to disconnect a page (from a page_pool), to allow it to be used as
 369  * a regular page (that will eventually be returned to the normal
 370  * page-allocator via put_page).
 371  */
 372 void page_pool_release_page(struct page_pool *pool, struct page *page)
 373 {
 374         dma_addr_t dma;
 375         int count;
 376
 377         if (!(pool->p.flags & PP_FLAG_DMA_MAP))
 378                 /* Always account for inflight pages, even if we didn't
 379                  * map them
 380                  */
 381                 goto skip_dma_unmap;
 382
 383         dma = page_pool_get_dma_addr(page);
 384
 385         /* When page is unmapped, it cannot be returned to our pool */
 386         dma_unmap_page_attrs(pool->p.dev, dma,
 387                              PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 388                              DMA_ATTR_SKIP_CPU_SYNC);
 389         page_pool_set_dma_addr(page, 0);
 390 skip_dma_unmap:
 391         page_pool_clear_pp_info(page);
 392
 393         /* This may be the last page returned, releasing the pool, so
 394          * it is not safe to reference pool afterwards.
 395          */
 396         count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
 397         trace_page_pool_state_release(pool, page, count);
 398 }
 399 EXPORT_SYMBOL(page_pool_release_page);
 400
 401 /* Return a page to the page allocator, cleaning up our state */
 402 static void page_pool_return_page(struct page_pool *pool, struct page *page)
 403 {
 404         page_pool_release_page(pool, page);
 405
 406         put_page(page);
 407         /* An optimization would be to call __free_pages(page, pool->p.order)
 408          * knowing page is not part of page-cache (thus avoiding a
 409          * __page_cache_release() call).
 410          */
 411 }
 412
 413 static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
 414 {
 415         int ret;
 416         /* BH protection not needed if current is softirq */
 417         if (in_softirq())
 418                 ret = ptr_ring_produce(&pool->ring, page);
 419         else
 420                 ret = ptr_ring_produce_bh(&pool->ring, page);
 421
 422         return (ret == 0) ? true : false;
 423 }
 424
 425 /* Only allow direct recycling in special circumstances, into the
 426  * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
 427  *
 428  * Caller must provide appropriate safe context.
 429  */
 430 static bool page_pool_recycle_in_cache(struct page *page,
 431                                        struct page_pool *pool)
 432 {
 433         if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
 434                 return false;
 435
 436         /* Caller MUST have verified/know (page_ref_count(page) == 1) */
 437         pool->alloc.cache[pool->alloc.count++] = page;
 438         return true;
 439 }
 440
 441 /* If the page refcnt == 1, this will try to recycle the page.
 442  * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
 443  * the configured size min(dma_sync_size, pool->max_len).
 444  * If the page refcnt != 1, then the page will be returned to memory
 445  * subsystem.
 446  */
 447 static __always_inline struct page *
 448 __page_pool_put_page(struct page_pool *pool, struct page *page,
 449                      unsigned int dma_sync_size, bool allow_direct)
 450 {
 451         /* It is not the last user for the page frag case */
 452         if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
 453             page_pool_atomic_sub_frag_count_return(page, 1))
 454                 return NULL;
 455
 456         /* This allocator is optimized for the XDP mode that uses
 457          * one-frame-per-page, but have fallbacks that act like the
 458          * regular page allocator APIs.
 459          *
 460          * refcnt == 1 means page_pool owns page, and can recycle it.
 461          *
 462          * page is NOT reusable when allocated when system is under
 463          * some pressure. (page_is_pfmemalloc)
 464          */
 465         if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
 466                 /* Read barrier done in page_ref_count / READ_ONCE */
 467
 468                 if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
 469                         page_pool_dma_sync_for_device(pool, page,
 470                                                       dma_sync_size);
 471
 472                 if (allow_direct && in_softirq() &&
 473                     page_pool_recycle_in_cache(page, pool))
 474                         return NULL;
 475
 476                 /* Page found as candidate for recycling */
 477                 return page;
 478         }
 479         /* Fallback/non-XDP mode: API user have elevated refcnt.
 480          *
 481          * Many drivers split up the page into fragments, and some
 482          * want to keep doing this to save memory and do refcnt based
 483          * recycling. Support this use case too, to ease drivers
 484          * switching between XDP/non-XDP.
 485          *
 486          * In-case page_pool maintains the DMA mapping, API user must
 487          * call page_pool_put_page once.  In this elevated refcnt
 488          * case, the DMA is unmapped/released, as driver is likely
 489          * doing refcnt based recycle tricks, meaning another process
 490          * will be invoking put_page.
 491          */
 492         /* Do not replace this with page_pool_return_page() */
 493         page_pool_release_page(pool, page);
 494         put_page(page);
 495
 496         return NULL;
 497 }
 498
 499 void page_pool_put_page(struct page_pool *pool, struct page *page,
 500                         unsigned int dma_sync_size, bool allow_direct)
 501 {
 502         page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
 503         if (page && !page_pool_recycle_in_ring(pool, page)) {
 504                 /* Cache full, fallback to free pages */
 505                 page_pool_return_page(pool, page);
 506         }
 507 }
 508 EXPORT_SYMBOL(page_pool_put_page);
 509
 510 /* Caller must not use data area after call, as this function overwrites it */
 511 void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 512                              int count)
 513 {
 514         int i, bulk_len = 0;
 515         bool in_softirq;
 516
 517         for (i = 0; i < count; i++) {
 518                 struct page *page = virt_to_head_page(data[i]);
 519
 520                 page = __page_pool_put_page(pool, page, -1, false);
 521                 /* Approved for bulk recycling in ptr_ring cache */
 522                 if (page)
 523                         data[bulk_len++] = page;
 524         }
 525
 526         if (unlikely(!bulk_len))
 527                 return;
 528
 529         /* Bulk producer into ptr_ring page_pool cache */
 530         in_softirq = page_pool_producer_lock(pool);
 531         for (i = 0; i < bulk_len; i++) {
 532                 if (__ptr_ring_produce(&pool->ring, data[i]))
 533                         break; /* ring full */
 534         }
 535         page_pool_producer_unlock(pool, in_softirq);
 536
 537         /* Hopefully all pages was return into ptr_ring */
 538         if (likely(i == bulk_len))
 539                 return;
 540
 541         /* ptr_ring cache full, free remaining pages outside producer lock
 542          * since put_page() with refcnt == 1 can be an expensive operation
 543          */
 544         for (; i < bulk_len; i++)
 545                 page_pool_return_page(pool, data[i]);
 546 }
 547 EXPORT_SYMBOL(page_pool_put_page_bulk);
 548
 549 static struct page *page_pool_drain_frag(struct page_pool *pool,
 550                                          struct page *page)
 551 {
 552         long drain_count = BIAS_MAX - pool->frag_users;
 553
 554         /* Some user is still using the page frag */
 555         if (likely(page_pool_atomic_sub_frag_count_return(page,
 556                                                           drain_count)))
 557                 return NULL;
 558
 559         if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
 560                 if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
 561                         page_pool_dma_sync_for_device(pool, page, -1);
 562
 563                 return page;
 564         }
 565
 566         page_pool_return_page(pool, page);
 567         return NULL;
 568 }
 569
 570 static void page_pool_free_frag(struct page_pool *pool)
 571 {
 572         long drain_count = BIAS_MAX - pool->frag_users;
 573         struct page *page = pool->frag_page;
 574
 575         pool->frag_page = NULL;
 576
 577         if (!page ||
 578             page_pool_atomic_sub_frag_count_return(page, drain_count))
 579                 return;
 580
 581         page_pool_return_page(pool, page);
 582 }
 583
 584 struct page *page_pool_alloc_frag(struct page_pool *pool,
 585                                   unsigned int *offset,
 586                                   unsigned int size, gfp_t gfp)
 587 {
 588         unsigned int max_size = PAGE_SIZE << pool->p.order;
 589         struct page *page = pool->frag_page;
 590
 591         if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
 592                     size > max_size))
 593                 return NULL;
 594
 595         size = ALIGN(size, dma_get_cache_alignment());
 596         *offset = pool->frag_offset;
 597
 598         if (page && *offset + size > max_size) {
 599                 page = page_pool_drain_frag(pool, page);
 600                 if (page)
 601                         goto frag_reset;
 602         }
 603
 604         if (!page) {
 605                 page = page_pool_alloc_pages(pool, gfp);
 606                 if (unlikely(!page)) {
 607                         pool->frag_page = NULL;
 608                         return NULL;
 609                 }
 610
 611                 pool->frag_page = page;
 612
 613 frag_reset:
 614                 pool->frag_users = 1;
 615                 *offset = 0;
 616                 pool->frag_offset = size;
 617                 page_pool_set_frag_count(page, BIAS_MAX);
 618                 return page;
 619         }
 620
 621         pool->frag_users++;
 622         pool->frag_offset = *offset + size;
 623         return page;
 624 }
 625 EXPORT_SYMBOL(page_pool_alloc_frag);
 626
 627 static void page_pool_empty_ring(struct page_pool *pool)
 628 {
 629         struct page *page;
 630
 631         /* Empty recycle ring */
 632         while ((page = ptr_ring_consume_bh(&pool->ring))) {
 633                 /* Verify the refcnt invariant of cached pages */
 634                 if (!(page_ref_count(page) == 1))
 635                         pr_crit("%s() page_pool refcnt %d violation\n",
 636                                 __func__, page_ref_count(page));
 637
 638                 page_pool_return_page(pool, page);
 639         }
 640 }
 641
 642 static void page_pool_free(struct page_pool *pool)
 643 {
 644         if (pool->disconnect)
 645                 pool->disconnect(pool);
 646
 647         ptr_ring_cleanup(&pool->ring, NULL);
 648
 649         if (pool->p.flags & PP_FLAG_DMA_MAP)
 650                 put_device(pool->p.dev);
 651
 652         kfree(pool);
 653 }
 654
 655 static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
 656 {
 657         struct page *page;
 658
 659         if (pool->destroy_cnt)
 660                 return;
 661
 662         /* Empty alloc cache, assume caller made sure this is
 663          * no-longer in use, and page_pool_alloc_pages() cannot be
 664          * call concurrently.
 665          */
 666         while (pool->alloc.count) {
 667                 page = pool->alloc.cache[--pool->alloc.count];
 668                 page_pool_return_page(pool, page);
 669         }
 670 }
 671
 672 static void page_pool_scrub(struct page_pool *pool)
 673 {
 674         page_pool_empty_alloc_cache_once(pool);
 675         pool->destroy_cnt++;
 676
 677         /* No more consumers should exist, but producers could still
 678          * be in-flight.
 679          */
 680         page_pool_empty_ring(pool);
 681 }
 682
 683 static int page_pool_release(struct page_pool *pool)
 684 {
 685         int inflight;
 686
 687         page_pool_scrub(pool);
 688         inflight = page_pool_inflight(pool);
 689         if (!inflight)
 690                 page_pool_free(pool);
 691
 692         return inflight;
 693 }
 694
 695 static void page_pool_release_retry(struct work_struct *wq)
 696 {
 697         struct delayed_work *dwq = to_delayed_work(wq);
 698         struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
 699         int inflight;
 700
 701         inflight = page_pool_release(pool);
 702         if (!inflight)
 703                 return;
 704
 705         /* Periodic warning */
 706         if (time_after_eq(jiffies, pool->defer_warn)) {
 707                 int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
 708
 709                 pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
 710                         __func__, inflight, sec);
 711                 pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
 712         }
 713
 714         /* Still not ready to be disconnected, retry later */
 715         schedule_delayed_work(&pool->release_dw, DEFER_TIME);
 716 }
 717
 718 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
 719 {
 720         refcount_inc(&pool->user_cnt);
 721         pool->disconnect = disconnect;
 722 }
 723
 724 void page_pool_destroy(struct page_pool *pool)
 725 {
 726         if (!pool)
 727                 return;
 728
 729         if (!page_pool_put(pool))
 730                 return;
 731
 732         page_pool_free_frag(pool);
 733
 734         if (!page_pool_release(pool))
 735                 return;
 736
 737         pool->defer_start = jiffies;
 738         pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
 739
 740         INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
 741         schedule_delayed_work(&pool->release_dw, DEFER_TIME);
 742 }
 743 EXPORT_SYMBOL(page_pool_destroy);
 744
 745 /* Caller must provide appropriate safe context, e.g. NAPI. */
 746 void page_pool_update_nid(struct page_pool *pool, int new_nid)
 747 {
 748         struct page *page;
 749
 750         trace_page_pool_update_nid(pool, new_nid);
 751         pool->p.nid = new_nid;
 752
 753         /* Flush pool alloc cache, as refill will check NUMA node */
 754         while (pool->alloc.count) {
 755                 page = pool->alloc.cache[--pool->alloc.count];
 756                 page_pool_return_page(pool, page);
 757         }
 758 }
 759 EXPORT_SYMBOL(page_pool_update_nid);
 760
 761 bool page_pool_return_skb_page(struct page *page)
 762 {
 763         struct page_pool *pp;
 764
 765         page = compound_head(page);
 766
 767         /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
 768          * in order to preserve any existing bits, such as bit 0 for the
 769          * head page of compound page and bit 1 for pfmemalloc page, so
 770          * mask those bits for freeing side when doing below checking,
 771          * and page_is_pfmemalloc() is checked in __page_pool_put_page()
 772          * to avoid recycling the pfmemalloc page.
 773          */
 774         if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
 775                 return false;
 776
 777         pp = page->pp;
 778
 779         /* Driver set this to memory recycling info. Reset it on recycle.
 780          * This will *not* work for NIC using a split-page memory model.
 781          * The page will be returned to the pool here regardless of the
 782          * 'flipped' fragment being in use or not.
 783          */
 784         page_pool_put_full_page(pp, page, false);
 785
 786         return true;
 787 }
 788 EXPORT_SYMBOL(page_pool_return_skb_page);