GNU Linux-libre 4.19.211-gnu1
[releases.git] / fs / xfs / scrub / scrub.c
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * Copyright (C) 2017 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <darrick.wong@oracle.com>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_btree.h"
14 #include "xfs_bit.h"
15 #include "xfs_log_format.h"
16 #include "xfs_trans.h"
17 #include "xfs_sb.h"
18 #include "xfs_inode.h"
19 #include "xfs_icache.h"
20 #include "xfs_itable.h"
21 #include "xfs_alloc.h"
22 #include "xfs_alloc_btree.h"
23 #include "xfs_bmap.h"
24 #include "xfs_bmap_btree.h"
25 #include "xfs_ialloc.h"
26 #include "xfs_ialloc_btree.h"
27 #include "xfs_refcount.h"
28 #include "xfs_refcount_btree.h"
29 #include "xfs_rmap.h"
30 #include "xfs_rmap_btree.h"
31 #include "xfs_quota.h"
32 #include "xfs_qm.h"
33 #include "xfs_errortag.h"
34 #include "xfs_error.h"
35 #include "xfs_log.h"
36 #include "xfs_trans_priv.h"
37 #include "scrub/xfs_scrub.h"
38 #include "scrub/scrub.h"
39 #include "scrub/common.h"
40 #include "scrub/trace.h"
41 #include "scrub/btree.h"
42 #include "scrub/repair.h"
43
44 /*
45  * Online Scrub and Repair
46  *
47  * Traditionally, XFS (the kernel driver) did not know how to check or
48  * repair on-disk data structures.  That task was left to the xfs_check
49  * and xfs_repair tools, both of which require taking the filesystem
50  * offline for a thorough but time consuming examination.  Online
51  * scrub & repair, on the other hand, enables us to check the metadata
52  * for obvious errors while carefully stepping around the filesystem's
53  * ongoing operations, locking rules, etc.
54  *
55  * Given that most XFS metadata consist of records stored in a btree,
56  * most of the checking functions iterate the btree blocks themselves
57  * looking for irregularities.  When a record block is encountered, each
58  * record can be checked for obviously bad values.  Record values can
59  * also be cross-referenced against other btrees to look for potential
60  * misunderstandings between pieces of metadata.
61  *
62  * It is expected that the checkers responsible for per-AG metadata
63  * structures will lock the AG headers (AGI, AGF, AGFL), iterate the
64  * metadata structure, and perform any relevant cross-referencing before
65  * unlocking the AG and returning the results to userspace.  These
66  * scrubbers must not keep an AG locked for too long to avoid tying up
67  * the block and inode allocators.
68  *
69  * Block maps and b-trees rooted in an inode present a special challenge
70  * because they can involve extents from any AG.  The general scrubber
71  * structure of lock -> check -> xref -> unlock still holds, but AG
72  * locking order rules /must/ be obeyed to avoid deadlocks.  The
73  * ordering rule, of course, is that we must lock in increasing AG
74  * order.  Helper functions are provided to track which AG headers we've
75  * already locked.  If we detect an imminent locking order violation, we
76  * can signal a potential deadlock, in which case the scrubber can jump
77  * out to the top level, lock all the AGs in order, and retry the scrub.
78  *
79  * For file data (directories, extended attributes, symlinks) scrub, we
80  * can simply lock the inode and walk the data.  For btree data
81  * (directories and attributes) we follow the same btree-scrubbing
82  * strategy outlined previously to check the records.
83  *
84  * We use a bit of trickery with transactions to avoid buffer deadlocks
85  * if there is a cycle in the metadata.  The basic problem is that
86  * travelling down a btree involves locking the current buffer at each
87  * tree level.  If a pointer should somehow point back to a buffer that
88  * we've already examined, we will deadlock due to the second buffer
89  * locking attempt.  Note however that grabbing a buffer in transaction
90  * context links the locked buffer to the transaction.  If we try to
91  * re-grab the buffer in the context of the same transaction, we avoid
92  * the second lock attempt and continue.  Between the verifier and the
93  * scrubber, something will notice that something is amiss and report
94  * the corruption.  Therefore, each scrubber will allocate an empty
95  * transaction, attach buffers to it, and cancel the transaction at the
96  * end of the scrub run.  Cancelling a non-dirty transaction simply
97  * unlocks the buffers.
98  *
99  * There are four pieces of data that scrub can communicate to
100  * userspace.  The first is the error code (errno), which can be used to
101  * communicate operational errors in performing the scrub.  There are
102  * also three flags that can be set in the scrub context.  If the data
103  * structure itself is corrupt, the CORRUPT flag will be set.  If
104  * the metadata is correct but otherwise suboptimal, the PREEN flag
105  * will be set.
106  *
107  * We perform secondary validation of filesystem metadata by
108  * cross-referencing every record with all other available metadata.
109  * For example, for block mapping extents, we verify that there are no
110  * records in the free space and inode btrees corresponding to that
111  * space extent and that there is a corresponding entry in the reverse
112  * mapping btree.  Inconsistent metadata is noted by setting the
113  * XCORRUPT flag; btree query function errors are noted by setting the
114  * XFAIL flag and deleting the cursor to prevent further attempts to
115  * cross-reference with a defective btree.
116  *
117  * If a piece of metadata proves corrupt or suboptimal, the userspace
118  * program can ask the kernel to apply some tender loving care (TLC) to
119  * the metadata object by setting the REPAIR flag and re-calling the
120  * scrub ioctl.  "Corruption" is defined by metadata violating the
121  * on-disk specification; operations cannot continue if the violation is
122  * left untreated.  It is possible for XFS to continue if an object is
123  * "suboptimal", however performance may be degraded.  Repairs are
124  * usually performed by rebuilding the metadata entirely out of
125  * redundant metadata.  Optimizing, on the other hand, can sometimes be
126  * done without rebuilding entire structures.
127  *
128  * Generally speaking, the repair code has the following code structure:
129  * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock.
130  * The first check helps us figure out if we need to rebuild or simply
131  * optimize the structure so that the rebuild knows what to do.  The
132  * second check evaluates the completeness of the repair; that is what
133  * is reported to userspace.
134  *
135  * A quick note on symbol prefixes:
136  * - "xfs_" are general XFS symbols.
137  * - "xchk_" are symbols related to metadata checking.
138  * - "xrep_" are symbols related to metadata repair.
139  * - "xfs_scrub_" are symbols that tie online fsck to the rest of XFS.
140  */
141
/*
 * Scrub probe.
 *
 * Userspace (xfs_scrub) calls this to find out whether the running
 * kernel is willing to scrub, and optionally repair, a given
 * mountpoint.  By the time we get here the ioctl inputs have been
 * validated and the filesystem has been set up for the operation, so
 * there is nothing left to examine: we only check whether the scrub
 * should terminate early and hand the result straight back.  The
 * returned errno and structure state tell userspace, in broad terms,
 * whether scrub/repair are supported.
 *
 * Returns the error filled in by xchk_should_terminate() if it asks us
 * to stop, otherwise 0.
 */
static int
xchk_probe(
	struct xfs_scrub	*sc)
{
	int			stop_error = 0;

	return xchk_should_terminate(sc, &stop_error) ? stop_error : 0;
}
163
164 /* Scrub setup and teardown */
165
166 /* Free all the resources and finish the transactions. */
167 STATIC int
168 xchk_teardown(
169         struct xfs_scrub        *sc,
170         struct xfs_inode        *ip_in,
171         int                     error)
172 {
173         xchk_ag_free(sc, &sc->sa);
174         if (sc->tp) {
175                 if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
176                         error = xfs_trans_commit(sc->tp);
177                 else
178                         xfs_trans_cancel(sc->tp);
179                 sc->tp = NULL;
180         }
181         if (sc->ip) {
182                 if (sc->ilock_flags)
183                         xfs_iunlock(sc->ip, sc->ilock_flags);
184                 if (sc->ip != ip_in &&
185                     !xfs_internal_inum(sc->mp, sc->ip->i_ino))
186                         xfs_irele(sc->ip);
187                 sc->ip = NULL;
188         }
189         if (sc->has_quotaofflock)
190                 mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
191         if (sc->buf) {
192                 kmem_free(sc->buf);
193                 sc->buf = NULL;
194         }
195         return error;
196 }
197
198 /* Scrubbing dispatch. */
199
200 static const struct xchk_meta_ops meta_scrub_ops[] = {
201         [XFS_SCRUB_TYPE_PROBE] = {      /* ioctl presence test */
202                 .type   = ST_NONE,
203                 .setup  = xchk_setup_fs,
204                 .scrub  = xchk_probe,
205                 .repair = xrep_probe,
206         },
207         [XFS_SCRUB_TYPE_SB] = {         /* superblock */
208                 .type   = ST_PERAG,
209                 .setup  = xchk_setup_fs,
210                 .scrub  = xchk_superblock,
211                 .repair = xrep_superblock,
212         },
213         [XFS_SCRUB_TYPE_AGF] = {        /* agf */
214                 .type   = ST_PERAG,
215                 .setup  = xchk_setup_fs,
216                 .scrub  = xchk_agf,
217                 .repair = xrep_agf,
218         },
219         [XFS_SCRUB_TYPE_AGFL]= {        /* agfl */
220                 .type   = ST_PERAG,
221                 .setup  = xchk_setup_fs,
222                 .scrub  = xchk_agfl,
223                 .repair = xrep_agfl,
224         },
225         [XFS_SCRUB_TYPE_AGI] = {        /* agi */
226                 .type   = ST_PERAG,
227                 .setup  = xchk_setup_fs,
228                 .scrub  = xchk_agi,
229                 .repair = xrep_agi,
230         },
231         [XFS_SCRUB_TYPE_BNOBT] = {      /* bnobt */
232                 .type   = ST_PERAG,
233                 .setup  = xchk_setup_ag_allocbt,
234                 .scrub  = xchk_bnobt,
235                 .repair = xrep_notsupported,
236         },
237         [XFS_SCRUB_TYPE_CNTBT] = {      /* cntbt */
238                 .type   = ST_PERAG,
239                 .setup  = xchk_setup_ag_allocbt,
240                 .scrub  = xchk_cntbt,
241                 .repair = xrep_notsupported,
242         },
243         [XFS_SCRUB_TYPE_INOBT] = {      /* inobt */
244                 .type   = ST_PERAG,
245                 .setup  = xchk_setup_ag_iallocbt,
246                 .scrub  = xchk_inobt,
247                 .repair = xrep_notsupported,
248         },
249         [XFS_SCRUB_TYPE_FINOBT] = {     /* finobt */
250                 .type   = ST_PERAG,
251                 .setup  = xchk_setup_ag_iallocbt,
252                 .scrub  = xchk_finobt,
253                 .has    = xfs_sb_version_hasfinobt,
254                 .repair = xrep_notsupported,
255         },
256         [XFS_SCRUB_TYPE_RMAPBT] = {     /* rmapbt */
257                 .type   = ST_PERAG,
258                 .setup  = xchk_setup_ag_rmapbt,
259                 .scrub  = xchk_rmapbt,
260                 .has    = xfs_sb_version_hasrmapbt,
261                 .repair = xrep_notsupported,
262         },
263         [XFS_SCRUB_TYPE_REFCNTBT] = {   /* refcountbt */
264                 .type   = ST_PERAG,
265                 .setup  = xchk_setup_ag_refcountbt,
266                 .scrub  = xchk_refcountbt,
267                 .has    = xfs_sb_version_hasreflink,
268                 .repair = xrep_notsupported,
269         },
270         [XFS_SCRUB_TYPE_INODE] = {      /* inode record */
271                 .type   = ST_INODE,
272                 .setup  = xchk_setup_inode,
273                 .scrub  = xchk_inode,
274                 .repair = xrep_notsupported,
275         },
276         [XFS_SCRUB_TYPE_BMBTD] = {      /* inode data fork */
277                 .type   = ST_INODE,
278                 .setup  = xchk_setup_inode_bmap,
279                 .scrub  = xchk_bmap_data,
280                 .repair = xrep_notsupported,
281         },
282         [XFS_SCRUB_TYPE_BMBTA] = {      /* inode attr fork */
283                 .type   = ST_INODE,
284                 .setup  = xchk_setup_inode_bmap,
285                 .scrub  = xchk_bmap_attr,
286                 .repair = xrep_notsupported,
287         },
288         [XFS_SCRUB_TYPE_BMBTC] = {      /* inode CoW fork */
289                 .type   = ST_INODE,
290                 .setup  = xchk_setup_inode_bmap,
291                 .scrub  = xchk_bmap_cow,
292                 .repair = xrep_notsupported,
293         },
294         [XFS_SCRUB_TYPE_DIR] = {        /* directory */
295                 .type   = ST_INODE,
296                 .setup  = xchk_setup_directory,
297                 .scrub  = xchk_directory,
298                 .repair = xrep_notsupported,
299         },
300         [XFS_SCRUB_TYPE_XATTR] = {      /* extended attributes */
301                 .type   = ST_INODE,
302                 .setup  = xchk_setup_xattr,
303                 .scrub  = xchk_xattr,
304                 .repair = xrep_notsupported,
305         },
306         [XFS_SCRUB_TYPE_SYMLINK] = {    /* symbolic link */
307                 .type   = ST_INODE,
308                 .setup  = xchk_setup_symlink,
309                 .scrub  = xchk_symlink,
310                 .repair = xrep_notsupported,
311         },
312         [XFS_SCRUB_TYPE_PARENT] = {     /* parent pointers */
313                 .type   = ST_INODE,
314                 .setup  = xchk_setup_parent,
315                 .scrub  = xchk_parent,
316                 .repair = xrep_notsupported,
317         },
318         [XFS_SCRUB_TYPE_RTBITMAP] = {   /* realtime bitmap */
319                 .type   = ST_FS,
320                 .setup  = xchk_setup_rt,
321                 .scrub  = xchk_rtbitmap,
322                 .has    = xfs_sb_version_hasrealtime,
323                 .repair = xrep_notsupported,
324         },
325         [XFS_SCRUB_TYPE_RTSUM] = {      /* realtime summary */
326                 .type   = ST_FS,
327                 .setup  = xchk_setup_rt,
328                 .scrub  = xchk_rtsummary,
329                 .has    = xfs_sb_version_hasrealtime,
330                 .repair = xrep_notsupported,
331         },
332         [XFS_SCRUB_TYPE_UQUOTA] = {     /* user quota */
333                 .type   = ST_FS,
334                 .setup  = xchk_setup_quota,
335                 .scrub  = xchk_quota,
336                 .repair = xrep_notsupported,
337         },
338         [XFS_SCRUB_TYPE_GQUOTA] = {     /* group quota */
339                 .type   = ST_FS,
340                 .setup  = xchk_setup_quota,
341                 .scrub  = xchk_quota,
342                 .repair = xrep_notsupported,
343         },
344         [XFS_SCRUB_TYPE_PQUOTA] = {     /* project quota */
345                 .type   = ST_FS,
346                 .setup  = xchk_setup_quota,
347                 .scrub  = xchk_quota,
348                 .repair = xrep_notsupported,
349         },
350 };
351
352 /* This isn't a stable feature, warn once per day. */
353 static inline void
354 xchk_experimental_warning(
355         struct xfs_mount        *mp)
356 {
357         static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT(
358                         "xchk_warning", 86400 * HZ, 1);
359         ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE);
360
361         if (__ratelimit(&scrub_warning))
362                 xfs_alert(mp,
363 "EXPERIMENTAL online scrub feature in use. Use at your own risk!");
364 }
365
366 static int
367 xchk_validate_inputs(
368         struct xfs_mount                *mp,
369         struct xfs_scrub_metadata       *sm)
370 {
371         int                             error;
372         const struct xchk_meta_ops      *ops;
373
374         error = -EINVAL;
375         /* Check our inputs. */
376         sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
377         if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN)
378                 goto out;
379         /* sm_reserved[] must be zero */
380         if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved)))
381                 goto out;
382
383         error = -ENOENT;
384         /* Do we know about this type of metadata? */
385         if (sm->sm_type >= XFS_SCRUB_TYPE_NR)
386                 goto out;
387         ops = &meta_scrub_ops[sm->sm_type];
388         if (ops->setup == NULL || ops->scrub == NULL)
389                 goto out;
390         /* Does this fs even support this type of metadata? */
391         if (ops->has && !ops->has(&mp->m_sb))
392                 goto out;
393
394         error = -EINVAL;
395         /* restricting fields must be appropriate for type */
396         switch (ops->type) {
397         case ST_NONE:
398         case ST_FS:
399                 if (sm->sm_ino || sm->sm_gen || sm->sm_agno)
400                         goto out;
401                 break;
402         case ST_PERAG:
403                 if (sm->sm_ino || sm->sm_gen ||
404                     sm->sm_agno >= mp->m_sb.sb_agcount)
405                         goto out;
406                 break;
407         case ST_INODE:
408                 if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino))
409                         goto out;
410                 break;
411         default:
412                 goto out;
413         }
414
415         error = -EOPNOTSUPP;
416         /*
417          * We won't scrub any filesystem that doesn't have the ability
418          * to record unwritten extents.  The option was made default in
419          * 2003, removed from mkfs in 2007, and cannot be disabled in
420          * v5, so if we find a filesystem without this flag it's either
421          * really old or totally unsupported.  Avoid it either way.
422          * We also don't support v1-v3 filesystems, which aren't
423          * mountable.
424          */
425         if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
426                 goto out;
427
428         /*
429          * We only want to repair read-write v5+ filesystems.  Defer the check
430          * for ops->repair until after our scrub confirms that we need to
431          * perform repairs so that we avoid failing due to not supporting
432          * repairing an object that doesn't need repairs.
433          */
434         if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
435                 error = -EOPNOTSUPP;
436                 if (!xfs_sb_version_hascrc(&mp->m_sb))
437                         goto out;
438
439                 error = -EROFS;
440                 if (mp->m_flags & XFS_MOUNT_RDONLY)
441                         goto out;
442         }
443
444         error = 0;
445 out:
446         return error;
447 }
448
449 #ifdef CONFIG_XFS_ONLINE_REPAIR
450 static inline void xchk_postmortem(struct xfs_scrub *sc)
451 {
452         /*
453          * Userspace asked us to repair something, we repaired it, rescanned
454          * it, and the rescan says it's still broken.  Scream about this in
455          * the system logs.
456          */
457         if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
458             (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
459                                  XFS_SCRUB_OFLAG_XCORRUPT)))
460                 xrep_failure(sc->mp);
461 }
462 #else
463 static inline void xchk_postmortem(struct xfs_scrub *sc)
464 {
465         /*
466          * Userspace asked us to scrub something, it's broken, and we have no
467          * way of fixing it.  Scream in the logs.
468          */
469         if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
470                                 XFS_SCRUB_OFLAG_XCORRUPT))
471                 xfs_alert_ratelimited(sc->mp,
472                                 "Corruption detected during scrub.");
473 }
474 #endif /* CONFIG_XFS_ONLINE_REPAIR */
475
476 /* Dispatch metadata scrubbing. */
477 int
478 xfs_scrub_metadata(
479         struct xfs_inode                *ip,
480         struct xfs_scrub_metadata       *sm)
481 {
482         struct xfs_scrub                sc;
483         struct xfs_mount                *mp = ip->i_mount;
484         bool                            try_harder = false;
485         bool                            already_fixed = false;
486         int                             error = 0;
487
488         BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
489                 (sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR));
490
491         trace_xchk_start(ip, sm, error);
492
493         /* Forbidden if we are shut down or mounted norecovery. */
494         error = -ESHUTDOWN;
495         if (XFS_FORCED_SHUTDOWN(mp))
496                 goto out;
497         error = -ENOTRECOVERABLE;
498         if (mp->m_flags & XFS_MOUNT_NORECOVERY)
499                 goto out;
500
501         error = xchk_validate_inputs(mp, sm);
502         if (error)
503                 goto out;
504
505         xchk_experimental_warning(mp);
506
507 retry_op:
508         /* Set up for the operation. */
509         memset(&sc, 0, sizeof(sc));
510         sc.mp = ip->i_mount;
511         sc.sm = sm;
512         sc.ops = &meta_scrub_ops[sm->sm_type];
513         sc.try_harder = try_harder;
514         sc.sa.agno = NULLAGNUMBER;
515         error = sc.ops->setup(&sc, ip);
516         if (error)
517                 goto out_teardown;
518
519         /* Scrub for errors. */
520         error = sc.ops->scrub(&sc);
521         if (!try_harder && error == -EDEADLOCK) {
522                 /*
523                  * Scrubbers return -EDEADLOCK to mean 'try harder'.
524                  * Tear down everything we hold, then set up again with
525                  * preparation for worst-case scenarios.
526                  */
527                 error = xchk_teardown(&sc, ip, 0);
528                 if (error)
529                         goto out;
530                 try_harder = true;
531                 goto retry_op;
532         } else if (error)
533                 goto out_teardown;
534
535         if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !already_fixed) {
536                 bool needs_fix;
537
538                 /* Let debug users force us into the repair routines. */
539                 if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
540                         sc.sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
541
542                 needs_fix = (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
543                                                 XFS_SCRUB_OFLAG_XCORRUPT |
544                                                 XFS_SCRUB_OFLAG_PREEN));
545                 /*
546                  * If userspace asked for a repair but it wasn't necessary,
547                  * report that back to userspace.
548                  */
549                 if (!needs_fix) {
550                         sc.sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
551                         goto out_nofix;
552                 }
553
554                 /*
555                  * If it's broken, userspace wants us to fix it, and we haven't
556                  * already tried to fix it, then attempt a repair.
557                  */
558                 error = xrep_attempt(ip, &sc, &already_fixed);
559                 if (error == -EAGAIN) {
560                         if (sc.try_harder)
561                                 try_harder = true;
562                         error = xchk_teardown(&sc, ip, 0);
563                         if (error) {
564                                 xrep_failure(mp);
565                                 goto out;
566                         }
567                         goto retry_op;
568                 }
569         }
570
571 out_nofix:
572         xchk_postmortem(&sc);
573 out_teardown:
574         error = xchk_teardown(&sc, ip, error);
575 out:
576         trace_xchk_done(ip, sm, error);
577         if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
578                 sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
579                 error = 0;
580         }
581         return error;
582 }