/* GNU Linux-libre 4.14.332-gnu1: fs/nfs/pnfs.c */
/*
 *  pNFS functions to call and manage layout drivers.
 *
 *  Copyright (c) 2002 [year of first publication]
 *  The Regents of the University of Michigan
 *  All Rights Reserved
 *
 *  Dean Hildebrand <dhildebz@umich.edu>
 *
 *  Permission is granted to use, copy, create derivative works, and
 *  redistribute this software and such derivative works for any purpose,
 *  so long as the name of the University of Michigan is not used in
 *  any advertising or publicity pertaining to the use or distribution
 *  of this software without specific, written prior authorization. If
 *  the above copyright notice or any other identification of the
 *  University of Michigan is included in any copy of any portion of
 *  this software, then the disclaimer below must also be included.
 *
 *  This software is provided as is, without representation or warranty
 *  of any kind either express or implied, including without limitation
 *  the implied warranties of merchantability, fitness for a particular
 *  purpose, or noninfringement.  The Regents of the University of
 *  Michigan shall not be liable for any damages, including special,
 *  indirect, incidental, or consequential damages, with respect to any
 *  claim arising out of or in connection with the use of the software,
 *  even if it has been or is hereafter advised of the possibility of
 *  such damages.
 */

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
#include <linux/sort.h>
#include "internal.h"
#include "pnfs.h"
#include "iostat.h"
#include "nfs4trace.h"
#include "delegation.h"
#include "nfs42.h"

#define NFSDBG_FACILITY         NFSDBG_PNFS
#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)

/* Locking:
 *
 * pnfs_spinlock:
 *      protects pnfs_modules_tbl.
 */
static DEFINE_SPINLOCK(pnfs_spinlock);

/*
 * pnfs_modules_tbl holds all pnfs modules
 */
static LIST_HEAD(pnfs_modules_tbl);

static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);
static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
                struct list_head *free_me,
                const struct pnfs_layout_range *range,
                u32 seq);
static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
                                struct list_head *tmp_list);

/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
find_pnfs_driver_locked(u32 id)
{
        struct pnfs_layoutdriver_type *local;

        list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
                if (local->id == id)
                        goto out;
        local = NULL;
out:
        dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
        return local;
}

static struct pnfs_layoutdriver_type *
find_pnfs_driver(u32 id)
{
        struct pnfs_layoutdriver_type *local;

        spin_lock(&pnfs_spinlock);
        local = find_pnfs_driver_locked(id);
        if (local != NULL && !try_module_get(local->owner)) {
                dprintk("%s: Could not grab reference on module\n", __func__);
                local = NULL;
        }
        spin_unlock(&pnfs_spinlock);
        return local;
}

const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id)
{
        return find_pnfs_driver(id);
}

void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld)
{
        if (ld)
                module_put(ld->owner);
}

void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
        if (nfss->pnfs_curr_ld) {
                if (nfss->pnfs_curr_ld->clear_layoutdriver)
                        nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
                /* Decrement the MDS count. Purge the deviceid cache if zero */
                if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
                        nfs4_deviceid_purge_client(nfss->nfs_client);
                module_put(nfss->pnfs_curr_ld->owner);
        }
        nfss->pnfs_curr_ld = NULL;
}

/*
 * When the server sends a list of layout types, we choose one in the order
 * given in the list below.
 *
 * FIXME: should this list be configurable in some fashion? module param?
 *        mount option? something else?
 */
static const u32 ld_prefs[] = {
        LAYOUT_SCSI,
        LAYOUT_BLOCK_VOLUME,
        LAYOUT_OSD2_OBJECTS,
        LAYOUT_FLEX_FILES,
        LAYOUT_NFSV4_1_FILES,
        0
};

static int
ld_cmp(const void *e1, const void *e2)
{
        u32 ld1 = *((u32 *)e1);
        u32 ld2 = *((u32 *)e2);
        int i;

        for (i = 0; ld_prefs[i] != 0; i++) {
                if (ld1 == ld_prefs[i])
                        return -1;

                if (ld2 == ld_prefs[i])
                        return 1;
        }
        return 0;
}
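
/*
 * Illustrative sketch (editor's note, not called anywhere in this file):
 * given a server-provided list such as
 *
 *      u32 types[] = { LAYOUT_NFSV4_1_FILES, LAYOUT_FLEX_FILES, LAYOUT_SCSI };
 *
 *      sort(types, ARRAY_SIZE(types), sizeof(types[0]), ld_cmp, NULL);
 *
 * the result is { LAYOUT_SCSI, LAYOUT_FLEX_FILES, LAYOUT_NFSV4_1_FILES },
 * matching the ld_prefs[] preference order above.  Types that do not
 * appear in ld_prefs[] compare equal (ld_cmp() returns 0).
 */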

/*
 * Try to set the server's pnfs module to the pnfs layout type specified by id.
 * Currently only one pNFS layout driver per filesystem is supported.
 *
 * @fsinfo: carries the array of layout types supported by the MDS.
 */
void
set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
                      struct nfs_fsinfo *fsinfo)
{
        struct pnfs_layoutdriver_type *ld_type = NULL;
        u32 id;
        int i;

        if (fsinfo->nlayouttypes == 0)
                goto out_no_driver;
        if (!(server->nfs_client->cl_exchange_flags &
                 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
                printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n",
                        __func__, server->nfs_client->cl_exchange_flags);
                goto out_no_driver;
        }

        sort(fsinfo->layouttype, fsinfo->nlayouttypes,
                sizeof(*fsinfo->layouttype), ld_cmp, NULL);

        for (i = 0; i < fsinfo->nlayouttypes; i++) {
                id = fsinfo->layouttype[i];
                ld_type = find_pnfs_driver(id);
                if (!ld_type) {
                        request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX,
                                        id);
                        ld_type = find_pnfs_driver(id);
                }
                if (ld_type)
                        break;
        }

        if (!ld_type) {
                dprintk("%s: No pNFS module found!\n", __func__);
                goto out_no_driver;
        }

        server->pnfs_curr_ld = ld_type;
        if (ld_type->set_layoutdriver
            && ld_type->set_layoutdriver(server, mntfh)) {
                printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
                        "driver %u.\n", __func__, id);
                module_put(ld_type->owner);
                goto out_no_driver;
        }
        /* Bump the MDS count */
        atomic_inc(&server->nfs_client->cl_mds_count);

        dprintk("%s: pNFS module for %u set\n", __func__, id);
        return;

out_no_driver:
        dprintk("%s: Using NFSv4 I/O\n", __func__);
        server->pnfs_curr_ld = NULL;
}
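
/*
 * Example (editor's note, informative only): LAYOUT_NFSV4_1_MODULE_PREFIX
 * is "nfs-layouttype4", so the request_module() call above asks for a
 * module alias of the form "nfs-layouttype4-<id>".  A server advertising
 * LAYOUT_FLEX_FILES (id 4), for instance, triggers a request for
 * "nfs-layouttype4-4", which a layout driver satisfies by declaring
 * MODULE_ALIAS("nfs-layouttype4-4").
 */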

int
pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
        int status = -EINVAL;
        struct pnfs_layoutdriver_type *tmp;

        if (ld_type->id == 0) {
                printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
                return status;
        }
        if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
                printk(KERN_ERR "NFS: %s Layout driver must provide "
                       "alloc_lseg and free_lseg.\n", __func__);
                return status;
        }

        spin_lock(&pnfs_spinlock);
        tmp = find_pnfs_driver_locked(ld_type->id);
        if (!tmp) {
                list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
                status = 0;
                dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
                        ld_type->name);
        } else {
                printk(KERN_ERR "NFS: %s Module with id %u already loaded!\n",
                        __func__, ld_type->id);
        }
        spin_unlock(&pnfs_spinlock);

        return status;
}
EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);

void
pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
        dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
        spin_lock(&pnfs_spinlock);
        list_del(&ld_type->pnfs_tblid);
        spin_unlock(&pnfs_spinlock);
}
EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);

/*
 * pNFS client layout cache
 */

/* Need to hold i_lock if caller does not already hold reference */
void
pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
{
        atomic_inc(&lo->plh_refcount);
}

static struct pnfs_layout_hdr *
pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
{
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
        return ld->alloc_layout_hdr(ino, gfp_flags);
}

static void
pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct nfs_server *server = NFS_SERVER(lo->plh_inode);
        struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;

        if (!list_empty(&lo->plh_layouts)) {
                struct nfs_client *clp = server->nfs_client;

                spin_lock(&clp->cl_lock);
                list_del_init(&lo->plh_layouts);
                spin_unlock(&clp->cl_lock);
        }
        put_rpccred(lo->plh_lc_cred);
        return ld->free_layout_hdr(lo);
}

static void
pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
        dprintk("%s: freeing layout cache %p\n", __func__, lo);
        nfsi->layout = NULL;
        /* Reset MDS Threshold I/O counters */
        nfsi->write_io = 0;
        nfsi->read_io = 0;
}

void
pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct inode *inode;

        if (!lo)
                return;
        inode = lo->plh_inode;
        pnfs_layoutreturn_before_put_layout_hdr(lo);

        if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
                if (!list_empty(&lo->plh_segs))
                        WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
                pnfs_detach_layout_hdr(lo);
                spin_unlock(&inode->i_lock);
                pnfs_free_layout_hdr(lo);
        }
}

static void
pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
                         u32 seq)
{
        if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
                iomode = IOMODE_ANY;
        lo->plh_return_iomode = iomode;
        set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
        if (seq != 0) {
                WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
                lo->plh_return_seq = seq;
        }
}
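
/*
 * Worked example (editor's note): if a READ segment is marked for return
 * first, plh_return_iomode becomes IOMODE_READ.  If an RW segment is then
 * marked before the layoutreturn goes out, the iomodes differ and the
 * pending return is widened:
 *
 *      pnfs_set_plh_return_info(lo, IOMODE_READ, 0);   // -> IOMODE_READ
 *      pnfs_set_plh_return_info(lo, IOMODE_RW, 0);     // -> IOMODE_ANY
 */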

static void
pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
{
        struct pnfs_layout_segment *lseg;
        lo->plh_return_iomode = 0;
        lo->plh_return_seq = 0;
        clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
                if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
                        continue;
                pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
        }
}

static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
{
        clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
        clear_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags);
        smp_mb__after_atomic();
        wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
        rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
}

static void
pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg,
                struct list_head *free_me)
{
        clear_bit(NFS_LSEG_ROC, &lseg->pls_flags);
        clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
        if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags))
                pnfs_lseg_dec_and_remove_zero(lseg, free_me);
        if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
                pnfs_lseg_dec_and_remove_zero(lseg, free_me);
}

/*
 * Mark a pnfs_layout_hdr and all associated layout segments as invalid
 *
 * In order to continue using the pnfs_layout_hdr, a full recovery
 * is required.
 * Note that caller must hold inode->i_lock.
 */
int
pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
                struct list_head *lseg_list)
{
        struct pnfs_layout_range range = {
                .iomode = IOMODE_ANY,
                .offset = 0,
                .length = NFS4_MAX_UINT64,
        };
        struct pnfs_layout_segment *lseg, *next;

        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                pnfs_clear_lseg_state(lseg, lseg_list);
        pnfs_clear_layoutreturn_info(lo);
        pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
        if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
            !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
                pnfs_clear_layoutreturn_waitbit(lo);
        return !list_empty(&lo->plh_segs);
}

static int
pnfs_iomode_to_fail_bit(u32 iomode)
{
        return iomode == IOMODE_RW ?
                NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
}

static void
pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
        lo->plh_retry_timestamp = jiffies;
        if (!test_and_set_bit(fail_bit, &lo->plh_flags))
                atomic_inc(&lo->plh_refcount);
}

static void
pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
        if (test_and_clear_bit(fail_bit, &lo->plh_flags))
                atomic_dec(&lo->plh_refcount);
}

static void
pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
        struct inode *inode = lo->plh_inode;
        struct pnfs_layout_range range = {
                .iomode = iomode,
                .offset = 0,
                .length = NFS4_MAX_UINT64,
        };
        LIST_HEAD(head);

        spin_lock(&inode->i_lock);
        pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
        pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0);
        spin_unlock(&inode->i_lock);
        pnfs_free_lseg_list(&head);
        dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
                        iomode == IOMODE_RW ? "RW" : "READ");
}

static bool
pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
        unsigned long start, end;
        int fail_bit = pnfs_iomode_to_fail_bit(iomode);

        if (test_bit(fail_bit, &lo->plh_flags) == 0)
                return false;
        end = jiffies;
        start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
        if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
                /* It is time to retry the failed layoutgets */
                pnfs_layout_clear_fail_bit(lo, fail_bit);
                return false;
        }
        return true;
}
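
/*
 * Illustration (editor's note): PNFS_LAYOUTGET_RETRY_TIMEOUT is 120*HZ,
 * i.e. two minutes worth of jiffies.  A fail bit stamped at time T keeps
 * blocking layoutgets for that iomode as long as T lies inside the
 * trailing window [jiffies - 120*HZ, jiffies]; once jiffies has advanced
 * past T + 120*HZ, the test above clears the bit and layoutget is
 * retried.
 */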

static void
pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
                const struct pnfs_layout_range *range,
                const nfs4_stateid *stateid)
{
        INIT_LIST_HEAD(&lseg->pls_list);
        INIT_LIST_HEAD(&lseg->pls_lc_list);
        atomic_set(&lseg->pls_refcount, 1);
        set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
        lseg->pls_layout = lo;
        lseg->pls_range = *range;
        lseg->pls_seq = be32_to_cpu(stateid->seqid);
}

static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
{
        if (lseg != NULL) {
                struct inode *inode = lseg->pls_layout->plh_inode;
                NFS_SERVER(inode)->pnfs_curr_ld->free_lseg(lseg);
        }
}

static void
pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
                struct pnfs_layout_segment *lseg)
{
        WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
        list_del_init(&lseg->pls_list);
        /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
        atomic_dec(&lo->plh_refcount);
        if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
                return;
        if (list_empty(&lo->plh_segs) &&
            !test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
            !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
                if (atomic_read(&lo->plh_outstanding) == 0)
                        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
                clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
        }
}

static bool
pnfs_cache_lseg_for_layoutreturn(struct pnfs_layout_hdr *lo,
                struct pnfs_layout_segment *lseg)
{
        if (test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
            pnfs_layout_is_valid(lo)) {
                pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
                list_move_tail(&lseg->pls_list, &lo->plh_return_segs);
                return true;
        }
        return false;
}

void
pnfs_put_lseg(struct pnfs_layout_segment *lseg)
{
        struct pnfs_layout_hdr *lo;
        struct inode *inode;

        if (!lseg)
                return;

        dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
                atomic_read(&lseg->pls_refcount),
                test_bit(NFS_LSEG_VALID, &lseg->pls_flags));

        lo = lseg->pls_layout;
        inode = lo->plh_inode;

        if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
                        spin_unlock(&inode->i_lock);
                        return;
                }
                pnfs_get_layout_hdr(lo);
                pnfs_layout_remove_lseg(lo, lseg);
                if (pnfs_cache_lseg_for_layoutreturn(lo, lseg))
                        lseg = NULL;
                spin_unlock(&inode->i_lock);
                pnfs_free_lseg(lseg);
                pnfs_put_layout_hdr(lo);
        }
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg);

/*
 * is l2 fully contained in l1?
 *   start1                             end1
 *   [----------------------------------)
 *           start2           end2
 *           [----------------)
 */
static bool
pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
                 const struct pnfs_layout_range *l2)
{
        u64 start1 = l1->offset;
        u64 end1 = pnfs_end_offset(start1, l1->length);
        u64 start2 = l2->offset;
        u64 end2 = pnfs_end_offset(start2, l2->length);

        return (start1 <= start2) && (end1 >= end2);
}
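
/*
 * Worked example (editor's note): ranges are half-open intervals,
 * [offset, offset + length).  With l1 = { 0, NFS4_MAX_UINT64 } and
 * l2 = { 4096, 8192 }, pnfs_end_offset() gives end1 = NFS4_MAX_UINT64
 * and end2 = 12288; start1 <= start2 and end1 >= end2 both hold, so l2
 * is contained in l1.
 */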

static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
                struct list_head *tmp_list)
{
        if (!atomic_dec_and_test(&lseg->pls_refcount))
                return false;
        pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
        list_add(&lseg->pls_list, tmp_list);
        return true;
}

/* Returns 1 if lseg is removed from list, 0 otherwise */
static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
                             struct list_head *tmp_list)
{
        int rv = 0;

        if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
                /* Remove the reference keeping the lseg in the
                 * list.  It will now be removed when all
                 * outstanding io is finished.
                 */
                dprintk("%s: lseg %p ref %d\n", __func__, lseg,
                        atomic_read(&lseg->pls_refcount));
                if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list))
                        rv = 1;
        }
        return rv;
}

/*
 * Compare 2 layout stateid sequence ids, to see which is newer,
 * taking into account wraparound issues.
 */
static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
{
        return (s32)(s1 - s2) > 0;
}
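
/*
 * Worked example (editor's note): the signed view of the unsigned
 * subtraction handles 32-bit wraparound.  With s1 = 0x00000002 and
 * s2 = 0xfffffffe, s1 - s2 wraps to 4 and (s32)4 > 0, so s1 is
 * correctly treated as newer even though it is numerically smaller.
 * Conversely, pnfs_seqid_is_newer(1, 3) evaluates (s32)-2 > 0, which
 * is false.
 */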

static bool
pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
                 const struct pnfs_layout_range *recall_range)
{
        return (recall_range->iomode == IOMODE_ANY ||
                lseg_range->iomode == recall_range->iomode) &&
               pnfs_lseg_range_intersecting(lseg_range, recall_range);
}

static bool
pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg,
                const struct pnfs_layout_range *recall_range,
                u32 seq)
{
        if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq))
                return false;
        if (recall_range == NULL)
                return true;
        return pnfs_should_free_range(&lseg->pls_range, recall_range);
}

/**
 * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
 * @lo: layout header containing the lsegs
 * @tmp_list: list head where doomed lsegs should go
 * @recall_range: optional recall range argument to match (may be NULL)
 * @seq: only invalidate lsegs obtained prior to this sequence (may be 0)
 *
 * Walk the list of lsegs in the layout header, and tear down any that should
 * be destroyed. If "recall_range" is specified then the segment must match
 * that range. If "seq" is non-zero, then only match segments that were handed
 * out at or before that sequence.
 *
 * Returns number of matching invalid lsegs remaining in list after scanning
 * it and purging them.
 */
int
pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                            struct list_head *tmp_list,
                            const struct pnfs_layout_range *recall_range,
                            u32 seq)
{
        struct pnfs_layout_segment *lseg, *next;
        int remaining = 0;

        dprintk("%s:Begin lo %p\n", __func__, lo);

        if (list_empty(&lo->plh_segs))
                return 0;
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                if (pnfs_match_lseg_recall(lseg, recall_range, seq)) {
                        dprintk("%s: freeing lseg %p iomode %d seq %u "
                                "offset %llu length %llu\n", __func__,
                                lseg, lseg->pls_range.iomode, lseg->pls_seq,
                                lseg->pls_range.offset, lseg->pls_range.length);
                        if (!mark_lseg_invalid(lseg, tmp_list))
                                remaining++;
                }
        dprintk("%s:Return %i\n", __func__, remaining);
        return remaining;
}

static void
pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
                struct list_head *free_me,
                const struct pnfs_layout_range *range,
                u32 seq)
{
        struct pnfs_layout_segment *lseg, *next;

        list_for_each_entry_safe(lseg, next, &lo->plh_return_segs, pls_list) {
                if (pnfs_match_lseg_recall(lseg, range, seq))
                        list_move_tail(&lseg->pls_list, free_me);
        }
}

/* note free_me must contain lsegs from a single layout_hdr */
void
pnfs_free_lseg_list(struct list_head *free_me)
{
        struct pnfs_layout_segment *lseg, *tmp;

        if (list_empty(free_me))
                return;

        list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
                list_del(&lseg->pls_list);
                pnfs_free_lseg(lseg);
        }
}

void
pnfs_destroy_layout(struct nfs_inode *nfsi)
{
        struct pnfs_layout_hdr *lo;
        LIST_HEAD(tmp_list);

        spin_lock(&nfsi->vfs_inode.i_lock);
        lo = nfsi->layout;
        if (lo) {
                pnfs_get_layout_hdr(lo);
                pnfs_mark_layout_stateid_invalid(lo, &tmp_list);
                pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
                pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
                spin_unlock(&nfsi->vfs_inode.i_lock);
                pnfs_free_lseg_list(&tmp_list);
                nfs_commit_inode(&nfsi->vfs_inode, 0);
                pnfs_put_layout_hdr(lo);
        } else
                spin_unlock(&nfsi->vfs_inode.i_lock);
}
EXPORT_SYMBOL_GPL(pnfs_destroy_layout);

static bool
pnfs_layout_add_bulk_destroy_list(struct inode *inode,
                struct list_head *layout_list)
{
        struct pnfs_layout_hdr *lo;
        bool ret = false;

        spin_lock(&inode->i_lock);
        lo = NFS_I(inode)->layout;
        if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) {
                pnfs_get_layout_hdr(lo);
                list_add(&lo->plh_bulk_destroy, layout_list);
                ret = true;
        }
        spin_unlock(&inode->i_lock);
        return ret;
}

/* Caller must hold rcu_read_lock and clp->cl_lock */
static int
pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
                struct nfs_server *server,
                struct list_head *layout_list)
        __must_hold(&clp->cl_lock)
        __must_hold(RCU)
{
        struct pnfs_layout_hdr *lo, *next;
        struct inode *inode;

        list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
                if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
                    test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) ||
                    !list_empty(&lo->plh_bulk_destroy))
                        continue;
                /* If the sb is being destroyed, just bail */
                if (!nfs_sb_active(server->super))
                        break;
                inode = igrab(lo->plh_inode);
                if (inode != NULL) {
                        list_del_init(&lo->plh_layouts);
                        if (pnfs_layout_add_bulk_destroy_list(inode,
                                                layout_list))
                                continue;
                        rcu_read_unlock();
                        spin_unlock(&clp->cl_lock);
                        iput(inode);
                } else {
                        rcu_read_unlock();
                        spin_unlock(&clp->cl_lock);
                        set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
                }
                nfs_sb_deactive(server->super);
                spin_lock(&clp->cl_lock);
                rcu_read_lock();
                return -EAGAIN;
        }
        return 0;
}

static int
pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
                bool is_bulk_recall)
{
        struct pnfs_layout_hdr *lo;
        struct inode *inode;
        LIST_HEAD(lseg_list);
        int ret = 0;

        while (!list_empty(layout_list)) {
                lo = list_entry(layout_list->next, struct pnfs_layout_hdr,
                                plh_bulk_destroy);
                dprintk("%s freeing layout for inode %lu\n", __func__,
                        lo->plh_inode->i_ino);
                inode = lo->plh_inode;

                pnfs_layoutcommit_inode(inode, false);

                spin_lock(&inode->i_lock);
                list_del_init(&lo->plh_bulk_destroy);
                if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
                        if (is_bulk_recall)
                                set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
                        ret = -EAGAIN;
                }
                spin_unlock(&inode->i_lock);
                pnfs_free_lseg_list(&lseg_list);
                /* Free all lsegs that are attached to commit buckets */
                nfs_commit_inode(inode, 0);
                pnfs_put_layout_hdr(lo);
                nfs_iput_and_deactive(inode);
        }
        return ret;
}

int
pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
                struct nfs_fsid *fsid,
                bool is_recall)
{
        struct nfs_server *server;
        LIST_HEAD(layout_list);

        spin_lock(&clp->cl_lock);
        rcu_read_lock();
restart:
        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
                if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0)
                        continue;
                if (pnfs_layout_bulk_destroy_byserver_locked(clp,
                                server,
                                &layout_list) != 0)
                        goto restart;
        }
        rcu_read_unlock();
        spin_unlock(&clp->cl_lock);

        if (list_empty(&layout_list))
                return 0;
        return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
}

int
pnfs_destroy_layouts_byclid(struct nfs_client *clp,
                bool is_recall)
{
        struct nfs_server *server;
        LIST_HEAD(layout_list);

        spin_lock(&clp->cl_lock);
        rcu_read_lock();
restart:
        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
                if (pnfs_layout_bulk_destroy_byserver_locked(clp,
                                        server,
                                        &layout_list) != 0)
                        goto restart;
        }
        rcu_read_unlock();
        spin_unlock(&clp->cl_lock);

        if (list_empty(&layout_list))
                return 0;
        return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
}

/*
 * Called by the state manager to remove all layouts established under an
 * expired lease.
 */
void
pnfs_destroy_all_layouts(struct nfs_client *clp)
{
        nfs4_deviceid_mark_client_invalid(clp);
        nfs4_deviceid_purge_client(clp);

        pnfs_destroy_layouts_byclid(clp, false);
}

/* update lo->plh_stateid with new if it is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
                        bool update_barrier)
{
        u32 oldseq, newseq, new_barrier = 0;

        oldseq = be32_to_cpu(lo->plh_stateid.seqid);
        newseq = be32_to_cpu(new->seqid);

        if (!pnfs_layout_is_valid(lo)) {
                nfs4_stateid_copy(&lo->plh_stateid, new);
                lo->plh_barrier = newseq;
                pnfs_clear_layoutreturn_info(lo);
                clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
                return;
        }
        if (pnfs_seqid_is_newer(newseq, oldseq)) {
                nfs4_stateid_copy(&lo->plh_stateid, new);
                /*
                 * Because of wraparound, we want to keep the barrier
                 * "close" to the current seqids.
                 */
                new_barrier = newseq - atomic_read(&lo->plh_outstanding);
        }
        if (update_barrier)
                new_barrier = be32_to_cpu(new->seqid);
        else if (new_barrier == 0)
                return;
        if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
                lo->plh_barrier = new_barrier;
}
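
/*
 * Example (editor's sketch): suppose lo->plh_stateid.seqid is 5 with
 * three LAYOUTGETs still outstanding, and a reply carries seqid 8.  The
 * stateid is updated and, with update_barrier false, the barrier becomes
 * 8 - 3 = 5, so the replies still in flight for seqids 6 and 7 are not
 * rejected as stale by pnfs_layout_stateid_blocked() below.
 */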

static bool
pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
                const nfs4_stateid *stateid)
{
        u32 seqid = be32_to_cpu(stateid->seqid);

        return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
}

/* Return true if layoutgets are currently blocked for this layout */
static bool
pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
{
        return lo->plh_block_lgets ||
                test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}

/*
 * Get layout from server.
 *    for now, assume that whole file layouts are requested.
 *    arg->offset: 0
 *    arg->length: all ones
 */
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
           struct nfs_open_context *ctx,
           nfs4_stateid *stateid,
           const struct pnfs_layout_range *range,
           long *timeout, gfp_t gfp_flags)
{
        struct inode *ino = lo->plh_inode;
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs4_layoutget *lgp;
        loff_t i_size;

        dprintk("--> %s\n", __func__);

        /*
         * Synchronously retrieve layout information from server and
         * store in lseg. If we race with a concurrent seqid morphing
         * op, then re-send the LAYOUTGET.
         */
        lgp = kzalloc(sizeof(*lgp), gfp_flags);
        if (lgp == NULL)
                return ERR_PTR(-ENOMEM);

        i_size = i_size_read(ino);

        lgp->args.minlength = PAGE_SIZE;
        if (lgp->args.minlength > range->length)
                lgp->args.minlength = range->length;
        if (range->iomode == IOMODE_READ) {
                if (range->offset >= i_size)
                        lgp->args.minlength = 0;
                else if (i_size - range->offset < lgp->args.minlength)
                        lgp->args.minlength = i_size - range->offset;
        }
        lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
        pnfs_copy_range(&lgp->args.range, range);
        lgp->args.type = server->pnfs_curr_ld->id;
        lgp->args.inode = ino;
        lgp->args.ctx = get_nfs_open_context(ctx);
        nfs4_stateid_copy(&lgp->args.stateid, stateid);
        lgp->gfp_flags = gfp_flags;
        lgp->cred = lo->plh_lc_cred;

        return nfs4_proc_layoutget(lgp, timeout, gfp_flags);
}

static void pnfs_clear_layoutcommit(struct inode *inode,
                struct list_head *head)
{
        struct nfs_inode *nfsi = NFS_I(inode);
        struct pnfs_layout_segment *lseg, *tmp;

        if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
                return;
        list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) {
                if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
                        continue;
                pnfs_lseg_dec_and_remove_zero(lseg, head);
        }
}

void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
                const nfs4_stateid *arg_stateid,
                const struct pnfs_layout_range *range,
                const nfs4_stateid *stateid)
{
        struct inode *inode = lo->plh_inode;
        LIST_HEAD(freeme);

        spin_lock(&inode->i_lock);
        if (!pnfs_layout_is_valid(lo) || !arg_stateid ||
            !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid))
                goto out_unlock;
        if (stateid) {
                u32 seq = be32_to_cpu(arg_stateid->seqid);

                pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
                pnfs_free_returned_lsegs(lo, &freeme, range, seq);
                pnfs_set_layout_stateid(lo, stateid, true);
        } else
                pnfs_mark_layout_stateid_invalid(lo, &freeme);
out_unlock:
        pnfs_clear_layoutreturn_waitbit(lo);
        spin_unlock(&inode->i_lock);
        pnfs_free_lseg_list(&freeme);
}

static bool
pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
                nfs4_stateid *stateid,
                enum pnfs_iomode *iomode)
{
        /* Serialise LAYOUTGET/LAYOUTRETURN */
        if (atomic_read(&lo->plh_outstanding) != 0)
                return false;
        if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
                return false;
        set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
        pnfs_get_layout_hdr(lo);
        if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
                if (stateid != NULL) {
                        nfs4_stateid_copy(stateid, &lo->plh_stateid);
                        if (lo->plh_return_seq != 0)
                                stateid->seqid = cpu_to_be32(lo->plh_return_seq);
                }
                if (iomode != NULL)
                        *iomode = lo->plh_return_iomode;
                pnfs_clear_layoutreturn_info(lo);
                return true;
        }
        if (stateid != NULL)
                nfs4_stateid_copy(stateid, &lo->plh_stateid);
        if (iomode != NULL)
                *iomode = IOMODE_ANY;
        return true;
}

static void
pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args,
                struct pnfs_layout_hdr *lo,
                const nfs4_stateid *stateid,
                enum pnfs_iomode iomode)
{
        struct inode *inode = lo->plh_inode;

        args->layout_type = NFS_SERVER(inode)->pnfs_curr_ld->id;
        args->inode = inode;
        args->range.iomode = iomode;
        args->range.offset = 0;
        args->range.length = NFS4_MAX_UINT64;
        args->layout = lo;
        nfs4_stateid_copy(&args->stateid, stateid);
}

static int
pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
                       enum pnfs_iomode iomode, bool sync)
{
        struct inode *ino = lo->plh_inode;
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
        struct nfs4_layoutreturn *lrp;
        int status = 0;

        lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
        if (unlikely(lrp == NULL)) {
                status = -ENOMEM;
                spin_lock(&ino->i_lock);
                pnfs_clear_layoutreturn_waitbit(lo);
                spin_unlock(&ino->i_lock);
                pnfs_put_layout_hdr(lo);
                goto out;
        }

        pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode);
        lrp->args.ld_private = &lrp->ld_private;
        lrp->clp = NFS_SERVER(ino)->nfs_client;
        lrp->cred = lo->plh_lc_cred;
        if (ld->prepare_layoutreturn)
                ld->prepare_layoutreturn(&lrp->args);

        status = nfs4_proc_layoutreturn(lrp, sync);
out:
        dprintk("<-- %s status: %d\n", __func__, status);
        return status;
}

/* Return true if layoutreturn is needed */
static bool
pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
{
        struct pnfs_layout_segment *s;

        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
                return false;

        /* Defer layoutreturn until all lsegs are done */
        list_for_each_entry(s, &lo->plh_segs, pls_list) {
                if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
                        return false;
        }

        return true;
}

static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct inode *inode = lo->plh_inode;

        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
                return;
        spin_lock(&inode->i_lock);
        if (pnfs_layout_need_return(lo)) {
                nfs4_stateid stateid;
                enum pnfs_iomode iomode;
                bool send;

                send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
                spin_unlock(&inode->i_lock);
                if (send) {
                        /* Send an async layoutreturn so we don't deadlock */
                        pnfs_send_layoutreturn(lo, &stateid, iomode, false);
                }
        } else
                spin_unlock(&inode->i_lock);
}

/*
 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
 * when the layout segment list is empty.
 *
 * Note that a pnfs_layout_hdr can exist with an empty layout segment
 * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
 * deviceid is marked invalid.
 */
int
_pnfs_return_layout(struct inode *ino)
{
        struct pnfs_layout_hdr *lo = NULL;
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_range range = {
                .iomode         = IOMODE_ANY,
                .offset         = 0,
                .length         = NFS4_MAX_UINT64,
        };
        LIST_HEAD(tmp_list);
        nfs4_stateid stateid;
        int status = 0;
        bool send, valid_layout;

        dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);

        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (!lo) {
                spin_unlock(&ino->i_lock);
                dprintk("NFS: %s no layout to return\n", __func__);
                goto out;
        }
        /* Reference matched in nfs4_layoutreturn_release */
        pnfs_get_layout_hdr(lo);
        /* Is there an outstanding layoutreturn ? */
        if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
                spin_unlock(&ino->i_lock);
                if (wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
                                        TASK_UNINTERRUPTIBLE))
                        goto out_put_layout_hdr;
                spin_lock(&ino->i_lock);
        }
        valid_layout = pnfs_layout_is_valid(lo);
        pnfs_clear_layoutcommit(ino, &tmp_list);
        pnfs_mark_matching_lsegs_return(lo, &tmp_list, &range, 0);

        if (NFS_SERVER(ino)->pnfs_curr_ld->return_range)
                NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);

        /* Don't send a LAYOUTRETURN if list was initially empty */
        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) ||
                        !valid_layout) {
                spin_unlock(&ino->i_lock);
                dprintk("NFS: %s no layout segments to return\n", __func__);
                goto out_put_layout_hdr;
        }

        send = pnfs_prepare_layoutreturn(lo, &stateid, NULL);
        spin_unlock(&ino->i_lock);
        if (send)
                status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
out_put_layout_hdr:
        pnfs_free_lseg_list(&tmp_list);
        pnfs_put_layout_hdr(lo);
out:
        dprintk("<-- %s status: %d\n", __func__, status);
        return status;
}

int
pnfs_commit_and_return_layout(struct inode *inode)
{
        struct pnfs_layout_hdr *lo;
        int ret;

        spin_lock(&inode->i_lock);
        lo = NFS_I(inode)->layout;
        if (lo == NULL) {
                spin_unlock(&inode->i_lock);
                return 0;
        }
        pnfs_get_layout_hdr(lo);
        /* Block new layoutgets and read/write to ds */
        lo->plh_block_lgets++;
        spin_unlock(&inode->i_lock);
        filemap_fdatawait(inode->i_mapping);
        ret = pnfs_layoutcommit_inode(inode, true);
        if (ret == 0)
                ret = _pnfs_return_layout(inode);
        spin_lock(&inode->i_lock);
        lo->plh_block_lgets--;
        spin_unlock(&inode->i_lock);
        pnfs_put_layout_hdr(lo);
        return ret;
}

bool pnfs_roc(struct inode *ino,
                struct nfs4_layoutreturn_args *args,
                struct nfs4_layoutreturn_res *res,
                const struct rpc_cred *cred)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct nfs_open_context *ctx;
        struct nfs4_state *state;
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_segment *lseg, *next;
        nfs4_stateid stateid;
        enum pnfs_iomode iomode = 0;
        bool layoutreturn = false, roc = false;
        bool skip_read = false;

        if (!nfs_have_layout(ino))
                return false;
retry:
        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (!lo || !pnfs_layout_is_valid(lo) ||
            test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
                lo = NULL;
                goto out_noroc;
        }
        pnfs_get_layout_hdr(lo);
        if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
                spin_unlock(&ino->i_lock);
                wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
                                TASK_UNINTERRUPTIBLE);
                pnfs_put_layout_hdr(lo);
                goto retry;
        }

        /* no roc if we hold a delegation */
        if (nfs4_check_delegation(ino, FMODE_READ)) {
                if (nfs4_check_delegation(ino, FMODE_WRITE))
                        goto out_noroc;
                skip_read = true;
        }

        list_for_each_entry(ctx, &nfsi->open_files, list) {
                state = ctx->state;
                if (state == NULL)
                        continue;
                /* Don't return layout if there is open file state */
                if (state->state & FMODE_WRITE)
                        goto out_noroc;
                if (state->state & FMODE_READ)
                        skip_read = true;
        }

        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) {
                if (skip_read && lseg->pls_range.iomode == IOMODE_READ)
                        continue;
                /* If we are sending layoutreturn, invalidate all valid lsegs */
                if (!test_and_clear_bit(NFS_LSEG_ROC, &lseg->pls_flags))
                        continue;
                /*
                 * Note: mark lseg for return so pnfs_layout_remove_lseg
                 * doesn't invalidate the layout for us.
                 */
                set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
                if (!mark_lseg_invalid(lseg, &lo->plh_return_segs))
                        continue;
                pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
        }

        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
                goto out_noroc;

        /* ROC in two conditions:
         * 1. there are ROC lsegs
         * 2. we don't send layoutreturn
         */
        /* lo ref dropped in pnfs_roc_release() */
        layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
        /* If the creds don't match, we can't compound the layoutreturn */
        if (!layoutreturn || cred != lo->plh_lc_cred)
                goto out_noroc;

        roc = layoutreturn;
        pnfs_init_layoutreturn_args(args, lo, &stateid, iomode);
        res->lrs_present = 0;
        layoutreturn = false;

out_noroc:
        spin_unlock(&ino->i_lock);
        pnfs_layoutcommit_inode(ino, true);
        if (roc) {
                struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
                if (ld->prepare_layoutreturn)
                        ld->prepare_layoutreturn(args);
                pnfs_put_layout_hdr(lo);
                return true;
        }
        if (layoutreturn)
                pnfs_send_layoutreturn(lo, &stateid, iomode, true);
        pnfs_put_layout_hdr(lo);
        return false;
}

void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
                struct nfs4_layoutreturn_res *res,
                int ret)
{
        struct pnfs_layout_hdr *lo = args->layout;
        struct inode *inode = args->inode;
        const nfs4_stateid *arg_stateid = NULL;
        const nfs4_stateid *res_stateid = NULL;
        struct nfs4_xdr_opaque_data *ld_private = args->ld_private;

        switch (ret) {
        case -NFS4ERR_NOMATCHING_LAYOUT:
                spin_lock(&inode->i_lock);
                if (pnfs_layout_is_valid(lo) &&
                    nfs4_stateid_match_other(&args->stateid, &lo->plh_stateid))
                        pnfs_set_plh_return_info(lo, args->range.iomode, 0);
                spin_unlock(&inode->i_lock);
                break;
        case 0:
                if (res->lrs_present)
                        res_stateid = &res->stateid;
                /* Fallthrough */
        default:
                arg_stateid = &args->stateid;
        }
        pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range,
                        res_stateid);
        if (ld_private && ld_private->ops && ld_private->ops->free)
                ld_private->ops->free(ld_private);
        pnfs_put_layout_hdr(lo);
        trace_nfs4_layoutreturn_on_close(args->inode, 0);
}

bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_hdr *lo;
        bool sleep = false;

        /* We might not have grabbed a reference on lo, so check the
         * NFS_LAYOUT_RETURN bit under i_lock. */
        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
                rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
                sleep = true;
        }
        spin_unlock(&ino->i_lock);
        return sleep;
}

/*
 * Compare two layout segments for sorting into layout cache.
 * We want to preferentially return RW over RO layouts, so ensure those
 * are seen first.
 */
static s64
pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
           const struct pnfs_layout_range *l2)
{
        s64 d;

        /* high offset > low offset */
        d = l1->offset - l2->offset;
        if (d)
                return d;

        /* short length > long length */
        d = l2->length - l1->length;
        if (d)
                return d;

        /* read > read/write */
        return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
}
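
/*
 * Sort-order example (editor's note): segments sort by ascending offset;
 * at equal offset the longer segment sorts first, and at equal offset
 * and length an IOMODE_RW segment sorts before an IOMODE_READ one, so
 * lookups preferentially find RW layouts:
 *
 *      {0, 100, RW} < {0, 100, READ} < {0, 50, READ} < {10, 100, RW}
 */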
1411
1412 static bool
1413 pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
1414                 const struct pnfs_layout_range *l2)
1415 {
1416         return pnfs_lseg_range_cmp(l1, l2) > 0;
1417 }
1418
1419 static bool
1420 pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
1421                 struct pnfs_layout_segment *old)
1422 {
1423         return false;
1424 }
1425
1426 void
1427 pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
1428                    struct pnfs_layout_segment *lseg,
1429                    bool (*is_after)(const struct pnfs_layout_range *,
1430                            const struct pnfs_layout_range *),
1431                    bool (*do_merge)(struct pnfs_layout_segment *,
1432                            struct pnfs_layout_segment *),
1433                    struct list_head *free_me)
1434 {
1435         struct pnfs_layout_segment *lp, *tmp;
1436
1437         dprintk("%s:Begin\n", __func__);
1438
1439         list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
1440                 if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
1441                         continue;
1442                 if (do_merge(lseg, lp)) {
1443                         mark_lseg_invalid(lp, free_me);
1444                         continue;
1445                 }
1446                 if (is_after(&lseg->pls_range, &lp->pls_range))
1447                         continue;
1448                 list_add_tail(&lseg->pls_list, &lp->pls_list);
1449                 dprintk("%s: inserted lseg %p "
1450                         "iomode %d offset %llu length %llu before "
1451                         "lp %p iomode %d offset %llu length %llu\n",
1452                         __func__, lseg, lseg->pls_range.iomode,
1453                         lseg->pls_range.offset, lseg->pls_range.length,
1454                         lp, lp->pls_range.iomode, lp->pls_range.offset,
1455                         lp->pls_range.length);
1456                 goto out;
1457         }
1458         list_add_tail(&lseg->pls_list, &lo->plh_segs);
1459         dprintk("%s: inserted lseg %p "
1460                 "iomode %d offset %llu length %llu at tail\n",
1461                 __func__, lseg, lseg->pls_range.iomode,
1462                 lseg->pls_range.offset, lseg->pls_range.length);
1463 out:
1464         pnfs_get_layout_hdr(lo);
1465
1466         dprintk("%s:Return\n", __func__);
1467 }
1468 EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);
1469
1470 static void
1471 pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
1472                    struct pnfs_layout_segment *lseg,
1473                    struct list_head *free_me)
1474 {
1475         struct inode *inode = lo->plh_inode;
1476         struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
1477
1478         if (ld->add_lseg != NULL)
1479                 ld->add_lseg(lo, lseg, free_me);
1480         else
1481                 pnfs_generic_layout_insert_lseg(lo, lseg,
1482                                 pnfs_lseg_range_is_after,
1483                                 pnfs_lseg_no_merge,
1484                                 free_me);
1485 }
1486
1487 static struct pnfs_layout_hdr *
1488 alloc_init_layout_hdr(struct inode *ino,
1489                       struct nfs_open_context *ctx,
1490                       gfp_t gfp_flags)
1491 {
1492         struct pnfs_layout_hdr *lo;
1493
1494         lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
1495         if (!lo)
1496                 return NULL;
1497         atomic_set(&lo->plh_refcount, 1);
1498         INIT_LIST_HEAD(&lo->plh_layouts);
1499         INIT_LIST_HEAD(&lo->plh_segs);
1500         INIT_LIST_HEAD(&lo->plh_return_segs);
1501         INIT_LIST_HEAD(&lo->plh_bulk_destroy);
1502         lo->plh_inode = ino;
1503         lo->plh_lc_cred = get_rpccred(ctx->cred);
1504         lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID;
1505         return lo;
1506 }
1507
1508 static struct pnfs_layout_hdr *
1509 pnfs_find_alloc_layout(struct inode *ino,
1510                        struct nfs_open_context *ctx,
1511                        gfp_t gfp_flags)
1512         __releases(&ino->i_lock)
1513         __acquires(&ino->i_lock)
1514 {
1515         struct nfs_inode *nfsi = NFS_I(ino);
1516         struct pnfs_layout_hdr *new = NULL;
1517
1518         dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
1519
1520         if (nfsi->layout != NULL)
1521                 goto out_existing;
1522         spin_unlock(&ino->i_lock);
1523         new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
1524         spin_lock(&ino->i_lock);
1525
1526         if (likely(nfsi->layout == NULL)) {     /* Won the race? */
1527                 nfsi->layout = new;
1528                 return new;
1529         } else if (new != NULL)
1530                 pnfs_free_layout_hdr(new);
1531 out_existing:
1532         pnfs_get_layout_hdr(nfsi->layout);
1533         return nfsi->layout;
1534 }
1535
1536 /*
1537  * iomode matching rules:
1538  * iomode       lseg    strict match
1539  *                      iomode
1540  * -----        -----   ------ -----
1541  * ANY          READ    N/A    true
1542  * ANY          RW      N/A    true
1543  * RW           READ    N/A    false
1544  * RW           RW      N/A    true
1545  * READ         READ    N/A    true
1546  * READ         RW      true   false
1547  * READ         RW      false  true
1548  */
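     /*
      * For example, a cached RW lseg can satisfy a READ request unless the
      * caller passed strict_iomode = true. Note that the lseg only has to
      * contain the first byte of the requested range (see range1 below);
      * the remainder of the request need not be covered.
      */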
1549 static bool
1550 pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
1551                  const struct pnfs_layout_range *range,
1552                  bool strict_iomode)
1553 {
1554         struct pnfs_layout_range range1;
1555
1556         if ((range->iomode == IOMODE_RW &&
1557              ls_range->iomode != IOMODE_RW) ||
1558             (range->iomode != ls_range->iomode &&
1559              strict_iomode) ||
1560             !pnfs_lseg_range_intersecting(ls_range, range))
1561                 return false;
1562
1563         /* range1 covers only the first byte in the range */
1564         range1 = *range;
1565         range1.length = 1;
1566         return pnfs_lseg_range_contained(ls_range, &range1);
1567 }
1568
1569 /*
1570  * lookup range in layout
1571  */
1572 static struct pnfs_layout_segment *
1573 pnfs_find_lseg(struct pnfs_layout_hdr *lo,
1574                 struct pnfs_layout_range *range,
1575                 bool strict_iomode)
1576 {
1577         struct pnfs_layout_segment *lseg, *ret = NULL;
1578
1579         dprintk("%s:Begin\n", __func__);
1580
1581         list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
1582                 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
1583                     !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
1584                     pnfs_lseg_range_match(&lseg->pls_range, range,
1585                                           strict_iomode)) {
1586                         ret = pnfs_get_lseg(lseg);
1587                         break;
1588                 }
1589         }
1590
1591         dprintk("%s:Return lseg %p ref %d\n",
1592                 __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
1593         return ret;
1594 }
1595
1596 /*
1597  * Use mdsthreshold hints set at each OPEN to determine if I/O should go
1598  * to the MDS or over pNFS
1599  *
1600  * The nfs_inode read_io and write_io fields are cumulative counters reset
1601  * when there are no layout segments. Note that in pnfs_update_layout iomode
1602  * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
1603  * WRITE request.
1604  *
1605  * A return of true means use MDS I/O.
1606  *
1607  * From rfc 5661:
1608  * If a file's size is smaller than the file size threshold, data accesses
1609  * SHOULD be sent to the metadata server.  If an I/O request has a length that
1610  * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
1611  * server.  If both file size and I/O size are provided, the client SHOULD
1612  * reach or exceed both thresholds before sending its read or write
1613  * requests to the data server.
1614  */
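     /*
      * Worked example (hypothetical threshold values): with THRESHOLD_RD
      * (rd_sz = 65536) and THRESHOLD_RD_IO (rd_io_sz = 32768) both set in
      * t->bm, a READ against a 16384-byte file with 8192 bytes of
      * accumulated read_io is below both thresholds, so the function
      * returns true and the I/O is sent to the MDS.
      */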
1615 static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
1616                                      struct inode *ino, int iomode)
1617 {
1618         struct nfs4_threshold *t = ctx->mdsthreshold;
1619         struct nfs_inode *nfsi = NFS_I(ino);
1620         loff_t fsize = i_size_read(ino);
1621         bool size = false, size_set = false, io = false, io_set = false, ret = false;
1622
1623         if (t == NULL)
1624                 return ret;
1625
1626         dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
1627                 __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);
1628
1629         switch (iomode) {
1630         case IOMODE_READ:
1631                 if (t->bm & THRESHOLD_RD) {
1632                         dprintk("%s fsize %llu\n", __func__, fsize);
1633                         size_set = true;
1634                         if (fsize < t->rd_sz)
1635                                 size = true;
1636                 }
1637                 if (t->bm & THRESHOLD_RD_IO) {
1638                         dprintk("%s nfsi->read_io %llu\n", __func__,
1639                                 nfsi->read_io);
1640                         io_set = true;
1641                         if (nfsi->read_io < t->rd_io_sz)
1642                                 io = true;
1643                 }
1644                 break;
1645         case IOMODE_RW:
1646                 if (t->bm & THRESHOLD_WR) {
1647                         dprintk("%s fsize %llu\n", __func__, fsize);
1648                         size_set = true;
1649                         if (fsize < t->wr_sz)
1650                                 size = true;
1651                 }
1652                 if (t->bm & THRESHOLD_WR_IO) {
1653                         dprintk("%s nfsi->write_io %llu\n", __func__,
1654                                 nfsi->write_io);
1655                         io_set = true;
1656                         if (nfsi->write_io < t->wr_io_sz)
1657                                 io = true;
1658                 }
1659                 break;
1660         }
1661         if (size_set && io_set) {
1662                 if (size && io)
1663                         ret = true;
1664         } else if (size || io)
1665                 ret = true;
1666
1667         dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
1668         return ret;
1669 }
1670
1671 static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
1672 {
1673         /*
1674          * send layoutcommit as it can hold up layoutreturn due to lseg
1675          * reference
1676          */
1677         pnfs_layoutcommit_inode(lo->plh_inode, false);
1678         return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
1679                                    nfs_wait_bit_killable,
1680                                    TASK_UNINTERRUPTIBLE);
1681 }
1682
1683 static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
1684 {
1685         unsigned long *bitlock = &lo->plh_flags;
1686
1687         clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
1688         smp_mb__after_atomic();
1689         wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
1690 }
1691
1692 /*
1693  * Layout segment is retrieved from the server if not cached.
1694  * The appropriate layout segment is referenced and returned to the caller.
1695  */
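     /*
      * Typical caller sketch (cf. pnfs_generic_pg_init_read() below):
      *
      *	lseg = pnfs_update_layout(inode, ctx, req_offset(req), rd_size,
      *				  IOMODE_READ, false, GFP_KERNEL);
      *	if (IS_ERR(lseg))
      *		<propagate the error>;
      *	else if (lseg == NULL)
      *		<fall back to I/O through the MDS>;
      *
      * The reference obtained here must be dropped with pnfs_put_lseg().
      */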
1696 struct pnfs_layout_segment *
1697 pnfs_update_layout(struct inode *ino,
1698                    struct nfs_open_context *ctx,
1699                    loff_t pos,
1700                    u64 count,
1701                    enum pnfs_iomode iomode,
1702                    bool strict_iomode,
1703                    gfp_t gfp_flags)
1704 {
1705         struct pnfs_layout_range arg = {
1706                 .iomode = iomode,
1707                 .offset = pos,
1708                 .length = count,
1709         };
1710         unsigned pg_offset;
1711         struct nfs_server *server = NFS_SERVER(ino);
1712         struct nfs_client *clp = server->nfs_client;
1713         struct pnfs_layout_hdr *lo = NULL;
1714         struct pnfs_layout_segment *lseg = NULL;
1715         nfs4_stateid stateid;
1716         long timeout = 0;
1717         unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
1718         bool first;
1719
1720         if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
1721                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1722                                  PNFS_UPDATE_LAYOUT_NO_PNFS);
1723                 goto out;
1724         }
1725
1726         if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
1727                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1728                                  PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
1729                 goto out;
1730         }
1731
1732         if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
1733                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1734                                  PNFS_UPDATE_LAYOUT_MDSTHRESH);
1735                 goto out;
1736         }
1737
1738 lookup_again:
1739         nfs4_client_recover_expired_lease(clp);
1740         first = false;
1741         spin_lock(&ino->i_lock);
1742         lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
1743         if (lo == NULL) {
1744                 spin_unlock(&ino->i_lock);
1745                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1746                                  PNFS_UPDATE_LAYOUT_NOMEM);
1747                 goto out;
1748         }
1749
1750         /* Do we even need to bother with this? */
1751         if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
1752                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1753                                  PNFS_UPDATE_LAYOUT_BULK_RECALL);
1754                 dprintk("%s matches recall, use MDS\n", __func__);
1755                 goto out_unlock;
1756         }
1757
1758         /* if LAYOUTGET already failed once we don't try again */
1759         if (pnfs_layout_io_test_failed(lo, iomode)) {
1760                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1761                                  PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
1762                 goto out_unlock;
1763         }
1764
1765         lseg = pnfs_find_lseg(lo, &arg, strict_iomode);
1766         if (lseg) {
1767                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1768                                 PNFS_UPDATE_LAYOUT_FOUND_CACHED);
1769                 goto out_unlock;
1770         }
1771
1772         if (!nfs4_valid_open_stateid(ctx->state)) {
1773                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1774                                 PNFS_UPDATE_LAYOUT_INVALID_OPEN);
1775                 goto out_unlock;
1776         }
1777
1778         /*
1779          * Choose a stateid for the LAYOUTGET. If we don't have a layout
1780          * stateid, or it has been invalidated, then we must use the open
1781          * stateid.
1782          */
1783         if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
1785                 /*
1786                  * The first layoutget for the file. Need to serialize per
1787                  * RFC 5661 Errata 3208.
1788                  */
1789                 if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
1790                                      &lo->plh_flags)) {
1791                         spin_unlock(&ino->i_lock);
1792                         wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
1793                                     TASK_UNINTERRUPTIBLE);
1794                         pnfs_put_layout_hdr(lo);
1795                         dprintk("%s retrying\n", __func__);
1796                         goto lookup_again;
1797                 }
1798
1799                 first = true;
1800                 if (nfs4_select_rw_stateid(ctx->state,
1801                                         iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ,
1802                                         NULL, &stateid, NULL) != 0) {
1803                         trace_pnfs_update_layout(ino, pos, count,
1804                                         iomode, lo, lseg,
1805                                         PNFS_UPDATE_LAYOUT_INVALID_OPEN);
1806                         goto out_unlock;
1807                 }
1808         } else {
1809                 nfs4_stateid_copy(&stateid, &lo->plh_stateid);
1810         }
1811
1812         /*
1813          * Because we free lsegs before sending LAYOUTRETURN, we need to wait
1814          * for LAYOUTRETURN even if first is true.
1815          */
1816         if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1817                 spin_unlock(&ino->i_lock);
1818                 dprintk("%s wait for layoutreturn\n", __func__);
1819                 if (pnfs_prepare_to_retry_layoutget(lo)) {
1820                         if (first)
1821                                 pnfs_clear_first_layoutget(lo);
1822                         pnfs_put_layout_hdr(lo);
1823                         dprintk("%s retrying\n", __func__);
1824                         trace_pnfs_update_layout(ino, pos, count, iomode, lo,
1825                                         lseg, PNFS_UPDATE_LAYOUT_RETRY);
1826                         goto lookup_again;
1827                 }
1828                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1829                                 PNFS_UPDATE_LAYOUT_RETURN);
1830                 goto out_put_layout_hdr;
1831         }
1832
1833         if (pnfs_layoutgets_blocked(lo)) {
1834                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1835                                 PNFS_UPDATE_LAYOUT_BLOCKED);
1836                 goto out_unlock;
1837         }
1838         atomic_inc(&lo->plh_outstanding);
1839         spin_unlock(&ino->i_lock);
1840
1841         if (list_empty(&lo->plh_layouts)) {
1842                 /* The lo must be on the clp list if there is any
1843                  * chance of a CB_LAYOUTRECALL(FILE) coming in.
1844                  */
1845                 spin_lock(&clp->cl_lock);
1846                 if (list_empty(&lo->plh_layouts))
1847                         list_add_tail(&lo->plh_layouts, &server->layouts);
1848                 spin_unlock(&clp->cl_lock);
1849         }
1850
1851         pg_offset = arg.offset & ~PAGE_MASK;
1852         if (pg_offset) {
1853                 arg.offset -= pg_offset;
1854                 arg.length += pg_offset;
1855         }
1856         if (arg.length != NFS4_MAX_UINT64)
1857                 arg.length = PAGE_ALIGN(arg.length);
1858
1859         lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
1860         trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1861                                  PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
1862         atomic_dec(&lo->plh_outstanding);
1863         if (IS_ERR(lseg)) {
1864                 switch (PTR_ERR(lseg)) {
1865                 case -EBUSY:
1866                         if (time_after(jiffies, giveup))
1867                                 lseg = NULL;
1868                         break;
1869                 case -ERECALLCONFLICT:
1870                         /* Huh? We hold no layouts, how is there a recall? */
1871                         if (first) {
1872                                 lseg = NULL;
1873                                 break;
1874                         }
1875                         /* Destroy the existing layout and start over */
1876                         if (time_after(jiffies, giveup))
1877                                 pnfs_destroy_layout(NFS_I(ino));
1878                         /* Fallthrough */
1879                 case -EAGAIN:
1880                         break;
1881                 case -ENODATA:
1882                         /* The server returned NFS4ERR_LAYOUTUNAVAILABLE */
1883                         pnfs_layout_set_fail_bit(
1884                                 lo, pnfs_iomode_to_fail_bit(iomode));
1885                         lseg = NULL;
1886                         goto out_put_layout_hdr;
1887                 default:
1888                         if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
1889                                 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
1890                                 lseg = NULL;
1891                         }
1892                         goto out_put_layout_hdr;
1893                 }
1894                 if (lseg) {
1895                         if (first)
1896                                 pnfs_clear_first_layoutget(lo);
1897                         trace_pnfs_update_layout(ino, pos, count,
1898                                 iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
1899                         pnfs_put_layout_hdr(lo);
1900                         goto lookup_again;
1901                 }
1902         } else {
1903                 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
1904         }
1905
1906 out_put_layout_hdr:
1907         if (first)
1908                 pnfs_clear_first_layoutget(lo);
1909         pnfs_put_layout_hdr(lo);
1910 out:
1911         dprintk("%s: inode %s/%llu pNFS layout segment %s for "
1912                         "(%s, offset: %llu, length: %llu)\n",
1913                         __func__, ino->i_sb->s_id,
1914                         (unsigned long long)NFS_FILEID(ino),
1915                         IS_ERR_OR_NULL(lseg) ? "not found" : "found",
1916                         iomode == IOMODE_RW ? "read/write" : "read-only",
1917                         (unsigned long long)pos,
1918                         (unsigned long long)count);
1919         return lseg;
1920 out_unlock:
1921         spin_unlock(&ino->i_lock);
1922         goto out_put_layout_hdr;
1923 }
1924 EXPORT_SYMBOL_GPL(pnfs_update_layout);
1925
1926 static bool
1927 pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
1928 {
1929         switch (range->iomode) {
1930         case IOMODE_READ:
1931         case IOMODE_RW:
1932                 break;
1933         default:
1934                 return false;
1935         }
1936         if (range->offset == NFS4_MAX_UINT64)
1937                 return false;
1938         if (range->length == 0)
1939                 return false;
1940         if (range->length != NFS4_MAX_UINT64 &&
1941             range->length > NFS4_MAX_UINT64 - range->offset)
1942                 return false;
1943         return true;
1944 }
1945
1946 struct pnfs_layout_segment *
1947 pnfs_layout_process(struct nfs4_layoutget *lgp)
1948 {
1949         struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
1950         struct nfs4_layoutget_res *res = &lgp->res;
1951         struct pnfs_layout_segment *lseg;
1952         struct inode *ino = lo->plh_inode;
1953         LIST_HEAD(free_me);
1954
1955         if (!pnfs_sanity_check_layout_range(&res->range))
1956                 return ERR_PTR(-EINVAL);
1957
1958         /* Inject layout blob into I/O device driver */
1959         lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
1960         if (IS_ERR_OR_NULL(lseg)) {
1961                 if (!lseg)
1962                         lseg = ERR_PTR(-ENOMEM);
1963
1964                 dprintk("%s: Could not allocate layout: error %ld\n",
1965                        __func__, PTR_ERR(lseg));
1966                 return lseg;
1967         }
1968
1969         pnfs_init_lseg(lo, lseg, &res->range, &res->stateid);
1970
1971         spin_lock(&ino->i_lock);
1972         if (pnfs_layoutgets_blocked(lo)) {
1973                 dprintk("%s forget reply due to state\n", __func__);
1974                 goto out_forget;
1975         }
1976
1977         if (!pnfs_layout_is_valid(lo)) {
1978                 /* We have a completely new layout */
1979                 pnfs_set_layout_stateid(lo, &res->stateid, true);
1980         } else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
1981                 /* existing state ID, make sure the sequence number matches. */
1982                 if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
1983                         dprintk("%s forget reply due to sequence\n", __func__);
1984                         goto out_forget;
1985                 }
1986                 pnfs_set_layout_stateid(lo, &res->stateid, false);
1987         } else {
1988                 /*
1989                  * We got an entirely new state ID.  Mark all segments for the
1990                  * inode invalid, and retry the layoutget
1991                  */
1992                 struct pnfs_layout_range range = {
1993                         .iomode = IOMODE_ANY,
1994                         .length = NFS4_MAX_UINT64,
1995                 };
1996                 pnfs_set_plh_return_info(lo, IOMODE_ANY, 0);
1997                 pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
1998                                                 &range, 0);
1999                 goto out_forget;
2000         }
2001
2002         pnfs_get_lseg(lseg);
2003         pnfs_layout_insert_lseg(lo, lseg, &free_me);
2004
2006         if (res->return_on_close)
2007                 set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
2008
2009         spin_unlock(&ino->i_lock);
2010         pnfs_free_lseg_list(&free_me);
2011         return lseg;
2012
2013 out_forget:
2014         spin_unlock(&ino->i_lock);
2015         lseg->pls_layout = lo;
2016         NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
2017         if (!pnfs_layout_is_valid(lo))
2018                 nfs_commit_inode(ino, 0);
2019         return ERR_PTR(-EAGAIN);
2020 }
2021
2022 /**
2023  * pnfs_mark_matching_lsegs_return - Free or return matching layout segments
2024  * @lo: pointer to layout header
2025  * @tmp_list: list header to be used with pnfs_free_lseg_list()
2026  * @return_range: describe layout segment ranges to be returned
      * @seq: stateid seqid to match
2027  *
2028  * This function is mainly intended for use by layoutrecall. It attempts
2029  * to free the layout segment immediately, or else to mark it for return
2030  * as soon as its reference count drops to zero.
      *
      * Returns the number of matching segments that were still in use and
      * so were marked for return rather than freed immediately.
2031  */
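     /*
      * Usage sketch (cf. pnfs_error_mark_layout_for_return() below): with
      * the inode's i_lock held,
      *
      *	if (!pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
      *					     &range, seq))
      *		<no segment is still in use: send LAYOUTRETURN now>;
      *	else
      *		<the return is deferred until the last pnfs_put_lseg()>;
      */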
2032 int
2033 pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
2034                                 struct list_head *tmp_list,
2035                                 const struct pnfs_layout_range *return_range,
2036                                 u32 seq)
2037 {
2038         struct pnfs_layout_segment *lseg, *next;
2039         int remaining = 0;
2040
2041         dprintk("%s:Begin lo %p\n", __func__, lo);
2042
2043         if (list_empty(&lo->plh_segs))
2044                 return 0;
2045
2046         assert_spin_locked(&lo->plh_inode->i_lock);
2047
2048         list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
2049                 if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
2050                         dprintk("%s: marking lseg %p iomode %d "
2051                                 "offset %llu length %llu\n", __func__,
2052                                 lseg, lseg->pls_range.iomode,
2053                                 lseg->pls_range.offset,
2054                                 lseg->pls_range.length);
2055                         if (mark_lseg_invalid(lseg, tmp_list))
2056                                 continue;
2057                         remaining++;
2058                         set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
2059                 }
2060
2061         if (remaining)
2062                 pnfs_set_plh_return_info(lo, return_range->iomode, seq);
2063
2064         return remaining;
2065 }
2066
2067 void pnfs_error_mark_layout_for_return(struct inode *inode,
2068                                        struct pnfs_layout_segment *lseg)
2069 {
2070         struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
2071         struct pnfs_layout_range range = {
2072                 .iomode = lseg->pls_range.iomode,
2073                 .offset = 0,
2074                 .length = NFS4_MAX_UINT64,
2075         };
2076         bool return_now = false;
2077
2078         spin_lock(&inode->i_lock);
2079         if (!pnfs_layout_is_valid(lo)) {
2080                 spin_unlock(&inode->i_lock);
2081                 return;
2082         }
2083         pnfs_set_plh_return_info(lo, range.iomode, 0);
2084         /*
2085          * mark all matching lsegs so that we are sure to have no live
2086          * segments at hand when sending layoutreturn. See pnfs_put_lseg()
2087          * for how it works.
2088          */
2089         if (!pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, &range, 0)) {
2090                 nfs4_stateid stateid;
2091                 enum pnfs_iomode iomode;
2092
2093                 return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
2094                 spin_unlock(&inode->i_lock);
2095                 if (return_now)
2096                         pnfs_send_layoutreturn(lo, &stateid, iomode, false);
2097         } else {
2098                 spin_unlock(&inode->i_lock);
2099                 nfs_commit_inode(inode, 0);
2100         }
2101 }
2102 EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
2103
2104 void
2105 pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio)
2106 {
2107         if (pgio->pg_lseg == NULL ||
2108             test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags))
2109                 return;
2110         pnfs_put_lseg(pgio->pg_lseg);
2111         pgio->pg_lseg = NULL;
2112 }
2113 EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout);
2114
2115 /*
2116  * Check for any intersection between the request and the pgio->pg_lseg,
2117  * and if none, put this pgio->pg_lseg away.
2118  */
2119 static void
2120 pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
2121 {
2122         if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) {
2123                 pnfs_put_lseg(pgio->pg_lseg);
2124                 pgio->pg_lseg = NULL;
2125         }
2126 }
2127
2128 void
2129 pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
2130 {
2131         u64 rd_size = req->wb_bytes;
2132
2133         pnfs_generic_pg_check_layout(pgio);
2134         pnfs_generic_pg_check_range(pgio, req);
2135         if (pgio->pg_lseg == NULL) {
2136                 if (pgio->pg_dreq == NULL)
2137                         rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
2138                 else
2139                         rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
2140
2141                 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
2142                                                    req->wb_context,
2143                                                    req_offset(req),
2144                                                    rd_size,
2145                                                    IOMODE_READ,
2146                                                    false,
2147                                                    GFP_KERNEL);
2148                 if (IS_ERR(pgio->pg_lseg)) {
2149                         pgio->pg_error = PTR_ERR(pgio->pg_lseg);
2150                         pgio->pg_lseg = NULL;
2151                         return;
2152                 }
2153         }
2154         /* If no lseg, fall back to read through mds */
2155         if (pgio->pg_lseg == NULL)
2156                 nfs_pageio_reset_read_mds(pgio);
2158 }
2159 EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
2160
2161 void
2162 pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
2163                            struct nfs_page *req, u64 wb_size)
2164 {
2165         pnfs_generic_pg_check_layout(pgio);
2166         pnfs_generic_pg_check_range(pgio, req);
2167         if (pgio->pg_lseg == NULL) {
2168                 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
2169                                                    req->wb_context,
2170                                                    req_offset(req),
2171                                                    wb_size,
2172                                                    IOMODE_RW,
2173                                                    false,
2174                                                    GFP_NOFS);
2175                 if (IS_ERR(pgio->pg_lseg)) {
2176                         pgio->pg_error = PTR_ERR(pgio->pg_lseg);
2177                         pgio->pg_lseg = NULL;
2178                         return;
2179                 }
2180         }
2181         /* If no lseg, fall back to write through mds */
2182         if (pgio->pg_lseg == NULL)
2183                 nfs_pageio_reset_write_mds(pgio);
2184 }
2185 EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
2186
2187 void
2188 pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
2189 {
2190         if (desc->pg_lseg) {
2191                 pnfs_put_lseg(desc->pg_lseg);
2192                 desc->pg_lseg = NULL;
2193         }
2194 }
2195 EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
2196
2197 /*
2198  * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
2199  * of bytes (maximum @req->wb_bytes) that can be coalesced.
2200  */
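     /*
      * E.g. if nfs_generic_pg_test() would allow 8192 bytes but the current
      * layout segment ends 4096 bytes past req_offset(req), only 4096 is
      * returned; a request starting at or beyond the segment end cannot be
      * coalesced at all and yields 0.
      */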
2201 size_t
2202 pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
2203                      struct nfs_page *prev, struct nfs_page *req)
2204 {
2205         unsigned int size;
2206         u64 seg_end, req_start, seg_left;
2207
2208         size = nfs_generic_pg_test(pgio, prev, req);
2209         if (!size)
2210                 return 0;
2211
2212         /*
2213          * 'size' contains the number of bytes left in the current page (up
2214          * to the original size asked for in @req->wb_bytes).
2215          *
2216          * Calculate how many bytes are left in the layout segment
2217          * and if there are less bytes than 'size', return that instead.
2218          *
2219          * Please also note that 'seg_end' is actually the offset of the
2220          * first byte that lies outside the pnfs_layout_range. FIXME?
2221          *
2222          */
2223         if (pgio->pg_lseg) {
2224                 seg_end = pnfs_end_offset(pgio->pg_lseg->pls_range.offset,
2225                                      pgio->pg_lseg->pls_range.length);
2226                 req_start = req_offset(req);
2227
2228                 /* start of request is past the last byte of this segment */
2229                 if (req_start >= seg_end)
2230                         return 0;
2231
2232                 /* adjust 'size' iff there are fewer bytes left in the
2233                  * segment than what nfs_generic_pg_test returned */
2234                 seg_left = seg_end - req_start;
2235                 if (seg_left < size)
2236                         size = (unsigned int)seg_left;
2237         }
2238
2239         return size;
2240 }
2241 EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
2242
2243 int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
2244 {
2245         struct nfs_pageio_descriptor pgio;
2246
2247         /* Resend all requests through the MDS */
2248         nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
2249                               hdr->completion_ops);
2250         set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
2251         return nfs_pageio_resend(&pgio, hdr);
2252 }
2253 EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
2254
2255 static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
2256 {
2258         dprintk("pnfs write error = %d\n", hdr->pnfs_error);
2259         if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
2260             PNFS_LAYOUTRET_ON_ERROR) {
2261                 pnfs_return_layout(hdr->inode);
2262         }
2263         if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
2264                 hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
2265 }
2266
2267 /*
2268  * Called by non rpc-based layout drivers
2269  */
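     /*
      * A layout driver using its own (non-RPC) transport sets
      * hdr->pnfs_error from its completion path before calling this: on
      * success the layoutcommit range is extended to hdr->mds_offset +
      * hdr->res.count, on error the I/O is resent through the MDS.
      */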
2270 void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
2271 {
2272         if (likely(!hdr->pnfs_error)) {
2273                 pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
2274                                 hdr->mds_offset + hdr->res.count);
2275                 hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
2276         }
2277         trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
2278         if (unlikely(hdr->pnfs_error))
2279                 pnfs_ld_handle_write_error(hdr);
2280         hdr->mds_ops->rpc_release(hdr);
2281 }
2282 EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
2283
2284 static void
2285 pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
2286                 struct nfs_pgio_header *hdr)
2287 {
2288         struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2289
2290         if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2291                 list_splice_tail_init(&hdr->pages, &mirror->pg_list);
2292                 nfs_pageio_reset_write_mds(desc);
2293                 mirror->pg_recoalesce = 1;
2294         }
2295         hdr->completion_ops->completion(hdr);
2296 }
2297
2298 static enum pnfs_try_status
2299 pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
2300                         const struct rpc_call_ops *call_ops,
2301                         struct pnfs_layout_segment *lseg,
2302                         int how)
2303 {
2304         struct inode *inode = hdr->inode;
2305         enum pnfs_try_status trypnfs;
2306         struct nfs_server *nfss = NFS_SERVER(inode);
2307
2308         hdr->mds_ops = call_ops;
2309
2310         dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
2311                 inode->i_ino, hdr->args.count, hdr->args.offset, how);
2312         trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
2313         if (trypnfs != PNFS_NOT_ATTEMPTED)
2314                 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
2315         dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
2316         return trypnfs;
2317 }
2318
2319 static void
2320 pnfs_do_write(struct nfs_pageio_descriptor *desc,
2321               struct nfs_pgio_header *hdr, int how)
2322 {
2323         const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
2324         struct pnfs_layout_segment *lseg = desc->pg_lseg;
2325         enum pnfs_try_status trypnfs;
2326
2327         trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
2328         switch (trypnfs) {
2329         case PNFS_NOT_ATTEMPTED:
2330                 pnfs_write_through_mds(desc, hdr);
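                     /* Fallthrough */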
2331         case PNFS_ATTEMPTED:
2332                 break;
2333         case PNFS_TRY_AGAIN:
2334                 /* cleanup hdr and prepare to redo pnfs */
2335                 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2336                         struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2337                         list_splice_init(&hdr->pages, &mirror->pg_list);
2338                         mirror->pg_recoalesce = 1;
2339                 }
2340                 hdr->mds_ops->rpc_release(hdr);
2341         }
2342 }
2343
2344 static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
2345 {
2346         pnfs_put_lseg(hdr->lseg);
2347         nfs_pgio_header_free(hdr);
2348 }
2349
2350 int
2351 pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
2352 {
2353         struct nfs_pgio_header *hdr;
2354         int ret;
2355
2356         hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
2357         if (!hdr) {
2358                 desc->pg_error = -ENOMEM;
2359                 return desc->pg_error;
2360         }
2361         nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
2362
2363         hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
2364         ret = nfs_generic_pgio(desc, hdr);
2365         if (!ret)
2366                 pnfs_do_write(desc, hdr, desc->pg_ioflags);
2367
2368         return ret;
2369 }
2370 EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
2371
2372 int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
2373 {
2374         struct nfs_pageio_descriptor pgio;
2375
2376         /* Resend all requests through the MDS */
2377         nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
2378         return nfs_pageio_resend(&pgio, hdr);
2379 }
2380 EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
2381
2382 static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
2383 {
2384         dprintk("pnfs read error = %d\n", hdr->pnfs_error);
2385         if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
2386             PNFS_LAYOUTRET_ON_ERROR) {
2387                 pnfs_return_layout(hdr->inode);
2388         }
2389         if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
2390                 hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
2391 }
2392
2393 /*
2394  * Called by non rpc-based layout drivers
2395  */
2396 void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
2397 {
2398         if (likely(!hdr->pnfs_error))
2399                 hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
2400         trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
2401         if (unlikely(hdr->pnfs_error))
2402                 pnfs_ld_handle_read_error(hdr);
2403         hdr->mds_ops->rpc_release(hdr);
2404 }
2405 EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
2406
2407 static void
2408 pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
2409                 struct nfs_pgio_header *hdr)
2410 {
2411         struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2412
2413         if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2414                 list_splice_tail_init(&hdr->pages, &mirror->pg_list);
2415                 nfs_pageio_reset_read_mds(desc);
2416                 mirror->pg_recoalesce = 1;
2417         }
2418         hdr->completion_ops->completion(hdr);
2419 }
2420
2421 /*
2422  * Call the appropriate parallel I/O subsystem read function.
2423  */
2424 static enum pnfs_try_status
2425 pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
2426                        const struct rpc_call_ops *call_ops,
2427                        struct pnfs_layout_segment *lseg)
2428 {
2429         struct inode *inode = hdr->inode;
2430         struct nfs_server *nfss = NFS_SERVER(inode);
2431         enum pnfs_try_status trypnfs;
2432
2433         hdr->mds_ops = call_ops;
2434
2435         dprintk("%s: Reading ino:%lu %u@%llu\n",
2436                 __func__, inode->i_ino, hdr->args.count, hdr->args.offset);
2437
2438         trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
2439         if (trypnfs != PNFS_NOT_ATTEMPTED)
2440                 nfs_inc_stats(inode, NFSIOS_PNFS_READ);
2441         dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
2442         return trypnfs;
2443 }
2444
2445 /* Resend all requests through pnfs. */
2446 void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
2447 {
2448         struct nfs_pageio_descriptor pgio;
2449
2450         if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2451                 /* Prevent deadlocks with layoutreturn! */
2452                 pnfs_put_lseg(hdr->lseg);
2453                 hdr->lseg = NULL;
2454
2455                 nfs_pageio_init_read(&pgio, hdr->inode, false,
2456                                         hdr->completion_ops);
2457                 hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
2458         }
2459 }
2460 EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
2461
2462 static void
2463 pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
2464 {
2465         const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
2466         struct pnfs_layout_segment *lseg = desc->pg_lseg;
2467         enum pnfs_try_status trypnfs;
2468
2469         trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
2470         switch (trypnfs) {
2471         case PNFS_NOT_ATTEMPTED:
2472                 pnfs_read_through_mds(desc, hdr);
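                     /* Fallthrough */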
2473         case PNFS_ATTEMPTED:
2474                 break;
2475         case PNFS_TRY_AGAIN:
2476                 /* cleanup hdr and prepare to redo pnfs */
2477                 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2478                         struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2479                         list_splice_init(&hdr->pages, &mirror->pg_list);
2480                         mirror->pg_recoalesce = 1;
2481                 }
2482                 hdr->mds_ops->rpc_release(hdr);
2483         }
2484 }
2485
2486 static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
2487 {
2488         pnfs_put_lseg(hdr->lseg);
2489         nfs_pgio_header_free(hdr);
2490 }
2491
2492 int
2493 pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
2494 {
2495         struct nfs_pgio_header *hdr;
2496         int ret;
2497
2498         hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
2499         if (!hdr) {
2500                 desc->pg_error = -ENOMEM;
2501                 return desc->pg_error;
2502         }
2503         nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
2504         hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
2505         ret = nfs_generic_pgio(desc, hdr);
2506         if (!ret)
2507                 pnfs_do_read(desc, hdr);
2508         return ret;
2509 }
2510 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
2511
2512 static void pnfs_clear_layoutcommitting(struct inode *inode)
2513 {
2514         unsigned long *bitlock = &NFS_I(inode)->flags;
2515
2516         clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
2517         smp_mb__after_atomic();
2518         wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
2519 }
2520
2521 /*
2522  * There can be multiple RW segments.
2523  */
2524 static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
2525 {
2526         struct pnfs_layout_segment *lseg;
2527
2528         list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
2529                 if (lseg->pls_range.iomode == IOMODE_RW &&
2530                     test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
2531                         list_add(&lseg->pls_lc_list, listp);
2532         }
2533 }
2534
2535 static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
2536 {
2537         struct pnfs_layout_segment *lseg, *tmp;
2538
2539         /* Matched by references in pnfs_set_layoutcommit */
2540         list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
2541                 list_del_init(&lseg->pls_lc_list);
2542                 pnfs_put_lseg(lseg);
2543         }
2544
2545         pnfs_clear_layoutcommitting(inode);
2546 }
2547
2548 void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
2549 {
2550         pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
2551 }
2552 EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
2553
2554 void
2555 pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg,
2556                 loff_t end_pos)
2557 {
2558         struct nfs_inode *nfsi = NFS_I(inode);
2559         bool mark_as_dirty = false;
2560
2561         spin_lock(&inode->i_lock);
2562         if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
2563                 nfsi->layout->plh_lwb = end_pos;
2564                 mark_as_dirty = true;
2565                 dprintk("%s: Set layoutcommit for inode %lu ",
2566                         __func__, inode->i_ino);
2567         } else if (end_pos > nfsi->layout->plh_lwb)
2568                 nfsi->layout->plh_lwb = end_pos;
2569         if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) {
2570                 /* references matched in nfs4_layoutcommit_release */
2571                 pnfs_get_lseg(lseg);
2572         }
2573         spin_unlock(&inode->i_lock);
2574         dprintk("%s: lseg %p end_pos %llu\n",
2575                 __func__, lseg, nfsi->layout->plh_lwb);
2576
2577         /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
2578          * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
2579         if (mark_as_dirty)
2580                 mark_inode_dirty_sync(inode);
2581 }
2582 EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
2583
2584 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
2585 {
2586         struct nfs_server *nfss = NFS_SERVER(data->args.inode);
2587
2588         if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
2589                 nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
2590         pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list);
2591 }
2592
2593 /*
2594  * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
2595  * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
2596  * data to disk to allow the server to recover the data if it crashes.
2597  * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
2598  * is off and a COMMIT is sent to a data server, or when WRITEs to a
2599  * data server return NFS_DATA_SYNC.
2600  */
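     /*
      * Sketch of the typical sequence: after a DS WRITE comes back
      * NFS_DATA_SYNC, the layout driver records the range via
      *
      *	pnfs_set_layoutcommit(inode, lseg, offset + count);
      *
      * and the LAYOUTCOMMIT itself is later issued by
      * pnfs_layoutcommit_inode(inode, sync), e.g. from pnfs_generic_sync().
      */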
2601 int
2602 pnfs_layoutcommit_inode(struct inode *inode, bool sync)
2603 {
2604         struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
2605         struct nfs4_layoutcommit_data *data;
2606         struct nfs_inode *nfsi = NFS_I(inode);
2607         loff_t end_pos;
2608         int status;
2609
2610         if (!pnfs_layoutcommit_outstanding(inode))
2611                 return 0;
2612
2613         dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
2614
2615         status = -EAGAIN;
2616         if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
2617                 if (!sync)
2618                         goto out;
2619                 status = wait_on_bit_lock_action(&nfsi->flags,
2620                                 NFS_INO_LAYOUTCOMMITTING,
2621                                 nfs_wait_bit_killable,
2622                                 TASK_KILLABLE);
2623                 if (status)
2624                         goto out;
2625         }
2626
2627         status = -ENOMEM;
2628         /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
2629         data = kzalloc(sizeof(*data), GFP_NOFS);
2630         if (!data)
2631                 goto clear_layoutcommitting;
2632
2633         status = 0;
2634         spin_lock(&inode->i_lock);
2635         if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
2636                 goto out_unlock;
2637
2638         INIT_LIST_HEAD(&data->lseg_list);
2639         pnfs_list_write_lseg(inode, &data->lseg_list);
2640
2641         end_pos = nfsi->layout->plh_lwb;
2642
2643         nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
2644         spin_unlock(&inode->i_lock);
2645
2646         data->args.inode = inode;
2647         data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
2648         nfs_fattr_init(&data->fattr);
2649         data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
2650         data->res.fattr = &data->fattr;
2651         if (end_pos != 0)
2652                 data->args.lastbytewritten = end_pos - 1;
2653         else
2654                 data->args.lastbytewritten = U64_MAX;
2655         data->res.server = NFS_SERVER(inode);
2656
2657         if (ld->prepare_layoutcommit) {
2658                 status = ld->prepare_layoutcommit(&data->args);
2659                 if (status) {
2660                         put_rpccred(data->cred);
2661                         spin_lock(&inode->i_lock);
2662                         set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
2663                         if (end_pos > nfsi->layout->plh_lwb)
2664                                 nfsi->layout->plh_lwb = end_pos;
2665                         goto out_unlock;
2666                 }
2667         }
2668
2670         status = nfs4_proc_layoutcommit(data, sync);
2671 out:
2672         if (status)
2673                 mark_inode_dirty_sync(inode);
2674         dprintk("<-- %s status %d\n", __func__, status);
2675         return status;
2676 out_unlock:
2677         spin_unlock(&inode->i_lock);
2678         kfree(data);
2679 clear_layoutcommitting:
2680         pnfs_clear_layoutcommitting(inode);
2681         goto out;
2682 }
2683 EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
2684
2685 int
2686 pnfs_generic_sync(struct inode *inode, bool datasync)
2687 {
2688         return pnfs_layoutcommit_inode(inode, true);
2689 }
2690 EXPORT_SYMBOL_GPL(pnfs_generic_sync);
2691
2692 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
2693 {
2694         struct nfs4_threshold *thp;
2695
2696         thp = kzalloc(sizeof(*thp), GFP_NOFS);
2697         if (!thp) {
2698                 dprintk("%s mdsthreshold allocation failed\n", __func__);
2699                 return NULL;
2700         }
2701         return thp;
2702 }
2703
2704 #if IS_ENABLED(CONFIG_NFS_V4_2)
2705 int
2706 pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
2707 {
2708         struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
2709         struct nfs_server *server = NFS_SERVER(inode);
2710         struct nfs_inode *nfsi = NFS_I(inode);
2711         struct nfs42_layoutstat_data *data;
2712         struct pnfs_layout_hdr *hdr;
2713         int status = 0;
2714
2715         if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
2716                 goto out;
2717
2718         if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
2719                 goto out;
2720
2721         if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
2722                 goto out;
2723
2724         spin_lock(&inode->i_lock);
2725         if (!NFS_I(inode)->layout) {
2726                 spin_unlock(&inode->i_lock);
2727                 goto out_clear_layoutstats;
2728         }
2729         hdr = NFS_I(inode)->layout;
2730         pnfs_get_layout_hdr(hdr);
2731         spin_unlock(&inode->i_lock);
2732
2733         data = kzalloc(sizeof(*data), gfp_flags);
2734         if (!data) {
2735                 status = -ENOMEM;
2736                 goto out_put;
2737         }
2738
2739         data->args.fh = NFS_FH(inode);
2740         data->args.inode = inode;
2741         status = ld->prepare_layoutstats(&data->args);
2742         if (status)
2743                 goto out_free;
2744
2745         status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);
2746
2747 out:
2748         dprintk("%s returns %d\n", __func__, status);
2749         return status;
2750
2751 out_free:
2752         kfree(data);
2753 out_put:
2754         pnfs_put_layout_hdr(hdr);
2755 out_clear_layoutstats:
2756         smp_mb__before_atomic();
2757         clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
2758         smp_mb__after_atomic();
2759         goto out;
2760 }
2761 EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
2762 #endif
2763
2764 unsigned int layoutstats_timer;
2765 module_param(layoutstats_timer, uint, 0644);
2766 EXPORT_SYMBOL_GPL(layoutstats_timer);