GNU Linux-libre 4.19.209-gnu1
[releases.git] / fs / f2fs / gc.c
1 /*
2  * fs/f2fs/gc.c
3  *
4  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5  *             http://www.samsung.com/
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  */
11 #include <linux/fs.h>
12 #include <linux/module.h>
13 #include <linux/backing-dev.h>
14 #include <linux/init.h>
15 #include <linux/f2fs_fs.h>
16 #include <linux/kthread.h>
17 #include <linux/delay.h>
18 #include <linux/freezer.h>
19
20 #include "f2fs.h"
21 #include "node.h"
22 #include "segment.h"
23 #include "gc.h"
24 #include <trace/events/f2fs.h>
25
26 static int gc_thread_func(void *data)
27 {
28         struct f2fs_sb_info *sbi = data;
29         struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
30         wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
31         unsigned int wait_ms;
32
33         wait_ms = gc_th->min_sleep_time;
34
35         set_freezable();
36         do {
37                 wait_event_interruptible_timeout(*wq,
38                                 kthread_should_stop() || freezing(current) ||
39                                 gc_th->gc_wake,
40                                 msecs_to_jiffies(wait_ms));
41
42                 /* give it a try one time */
43                 if (gc_th->gc_wake)
44                         gc_th->gc_wake = 0;
45
46                 if (try_to_freeze())
47                         continue;
48                 if (kthread_should_stop())
49                         break;
50
51                 if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
52                         increase_sleep_time(gc_th, &wait_ms);
53                         continue;
54                 }
55
56                 if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
57                         f2fs_show_injection_info(FAULT_CHECKPOINT);
58                         f2fs_stop_checkpoint(sbi, false);
59                 }
60
61                 if (!sb_start_write_trylock(sbi->sb))
62                         continue;
63
64                 /*
65                  * [GC triggering condition]
66                  * 0. GC is not conducted currently.
67                  * 1. There are enough dirty segments.
68                  * 2. IO subsystem is idle by checking the # of writeback pages.
69                  * 3. IO subsystem is idle by checking the # of requests in
70                  *    bdev's request list.
71                  *
72                  * Note) We have to avoid triggering GCs frequently.
73                  * Because it is possible that some segments can be
74                  * invalidated soon after by user update or deletion.
75                  * So, I'd like to wait some time to collect dirty segments.
76                  */
77                 if (sbi->gc_mode == GC_URGENT) {
78                         wait_ms = gc_th->urgent_sleep_time;
79                         mutex_lock(&sbi->gc_mutex);
80                         goto do_gc;
81                 }
82
83                 if (!mutex_trylock(&sbi->gc_mutex))
84                         goto next;
85
86                 if (!is_idle(sbi)) {
87                         increase_sleep_time(gc_th, &wait_ms);
88                         mutex_unlock(&sbi->gc_mutex);
89                         goto next;
90                 }
91
92                 if (has_enough_invalid_blocks(sbi))
93                         decrease_sleep_time(gc_th, &wait_ms);
94                 else
95                         increase_sleep_time(gc_th, &wait_ms);
96 do_gc:
97                 stat_inc_bggc_count(sbi);
98
99                 /* if return value is not zero, no victim was selected */
100                 if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO))
101                         wait_ms = gc_th->no_gc_sleep_time;
102
103                 trace_f2fs_background_gc(sbi->sb, wait_ms,
104                                 prefree_segments(sbi), free_segments(sbi));
105
106                 /* balancing f2fs's metadata periodically */
107                 f2fs_balance_fs_bg(sbi);
108 next:
109                 sb_end_write(sbi->sb);
110
111         } while (!kthread_should_stop());
112         return 0;
113 }
114
115 int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
116 {
117         struct f2fs_gc_kthread *gc_th;
118         dev_t dev = sbi->sb->s_bdev->bd_dev;
119         int err = 0;
120
121         gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
122         if (!gc_th) {
123                 err = -ENOMEM;
124                 goto out;
125         }
126
127         gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
128         gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
129         gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
130         gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
131
132         gc_th->gc_wake= 0;
133
134         sbi->gc_thread = gc_th;
135         init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
136         sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
137                         "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
138         if (IS_ERR(gc_th->f2fs_gc_task)) {
139                 err = PTR_ERR(gc_th->f2fs_gc_task);
140                 kfree(gc_th);
141                 sbi->gc_thread = NULL;
142         }
143 out:
144         return err;
145 }
146
147 void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
148 {
149         struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
150         if (!gc_th)
151                 return;
152         kthread_stop(gc_th->f2fs_gc_task);
153         kfree(gc_th);
154         sbi->gc_thread = NULL;
155 }
156
157 static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type)
158 {
159         int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
160
161         switch (sbi->gc_mode) {
162         case GC_IDLE_CB:
163                 gc_mode = GC_CB;
164                 break;
165         case GC_IDLE_GREEDY:
166         case GC_URGENT:
167                 gc_mode = GC_GREEDY;
168                 break;
169         }
170         return gc_mode;
171 }
172
173 static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
174                         int type, struct victim_sel_policy *p)
175 {
176         struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
177
178         if (p->alloc_mode == SSR) {
179                 p->gc_mode = GC_GREEDY;
180                 p->dirty_segmap = dirty_i->dirty_segmap[type];
181                 p->max_search = dirty_i->nr_dirty[type];
182                 p->ofs_unit = 1;
183         } else {
184                 p->gc_mode = select_gc_type(sbi, gc_type);
185                 p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
186                 p->max_search = dirty_i->nr_dirty[DIRTY];
187                 p->ofs_unit = sbi->segs_per_sec;
188         }
189
190         /* we need to check every dirty segments in the FG_GC case */
191         if (gc_type != FG_GC &&
192                         (sbi->gc_mode != GC_URGENT) &&
193                         p->max_search > sbi->max_victim_search)
194                 p->max_search = sbi->max_victim_search;
195
196         /* let's select beginning hot/small space first in no_heap mode*/
197         if (test_opt(sbi, NOHEAP) &&
198                 (type == CURSEG_HOT_DATA || IS_NODESEG(type)))
199                 p->offset = 0;
200         else
201                 p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
202 }
203
204 static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
205                                 struct victim_sel_policy *p)
206 {
207         /* SSR allocates in a segment unit */
208         if (p->alloc_mode == SSR)
209                 return sbi->blocks_per_seg;
210         if (p->gc_mode == GC_GREEDY)
211                 return 2 * sbi->blocks_per_seg * p->ofs_unit;
212         else if (p->gc_mode == GC_CB)
213                 return UINT_MAX;
214         else /* No other gc_mode */
215                 return 0;
216 }
217
218 static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
219 {
220         struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
221         unsigned int secno;
222
223         /*
224          * If the gc_type is FG_GC, we can select victim segments
225          * selected by background GC before.
226          * Those segments guarantee they have small valid blocks.
227          */
228         for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
229                 if (sec_usage_check(sbi, secno))
230                         continue;
231                 clear_bit(secno, dirty_i->victim_secmap);
232                 return GET_SEG_FROM_SEC(sbi, secno);
233         }
234         return NULL_SEGNO;
235 }
236
237 static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
238 {
239         struct sit_info *sit_i = SIT_I(sbi);
240         unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
241         unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
242         unsigned long long mtime = 0;
243         unsigned int vblocks;
244         unsigned char age = 0;
245         unsigned char u;
246         unsigned int i;
247
248         for (i = 0; i < sbi->segs_per_sec; i++)
249                 mtime += get_seg_entry(sbi, start + i)->mtime;
250         vblocks = get_valid_blocks(sbi, segno, true);
251
252         mtime = div_u64(mtime, sbi->segs_per_sec);
253         vblocks = div_u64(vblocks, sbi->segs_per_sec);
254
255         u = (vblocks * 100) >> sbi->log_blocks_per_seg;
256
257         /* Handle if the system time has changed by the user */
258         if (mtime < sit_i->min_mtime)
259                 sit_i->min_mtime = mtime;
260         if (mtime > sit_i->max_mtime)
261                 sit_i->max_mtime = mtime;
262         if (sit_i->max_mtime != sit_i->min_mtime)
263                 age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
264                                 sit_i->max_mtime - sit_i->min_mtime);
265
266         return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
267 }
268
269 static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
270                         unsigned int segno, struct victim_sel_policy *p)
271 {
272         if (p->alloc_mode == SSR)
273                 return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
274
275         /* alloc_mode == LFS */
276         if (p->gc_mode == GC_GREEDY)
277                 return get_valid_blocks(sbi, segno, true);
278         else
279                 return get_cb_cost(sbi, segno);
280 }
281
/*
 * Count the set bits of @addr within the bit range [offset, offset + len).
 */
static unsigned int count_bits(const unsigned long *addr,
				unsigned int offset, unsigned int len)
{
	const unsigned int end = offset + len;
	unsigned int pos, nset = 0;

	for (pos = offset; pos < end; pos++) {
		if (test_bit(pos, addr))
			nset++;
	}
	return nset;
}
293
294 /*
295  * This function is called from two paths.
296  * One is garbage collection and the other is SSR segment selection.
297  * When it is called during GC, it just gets a victim segment
298  * and it does not remove it from dirty seglist.
299  * When it is called from SSR segment selection, it finds a segment
300  * which has minimum valid blocks and removes it from dirty seglist.
301  */
302 static int get_victim_by_default(struct f2fs_sb_info *sbi,
303                 unsigned int *result, int gc_type, int type, char alloc_mode)
304 {
305         struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
306         struct sit_info *sm = SIT_I(sbi);
307         struct victim_sel_policy p;
308         unsigned int secno, last_victim;
309         unsigned int last_segment = MAIN_SEGS(sbi);
310         unsigned int nsearched = 0;
311
312         mutex_lock(&dirty_i->seglist_lock);
313
314         p.alloc_mode = alloc_mode;
315         select_policy(sbi, gc_type, type, &p);
316
317         p.min_segno = NULL_SEGNO;
318         p.min_cost = get_max_cost(sbi, &p);
319
320         if (*result != NULL_SEGNO) {
321                 if (get_valid_blocks(sbi, *result, false) &&
322                         !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
323                         p.min_segno = *result;
324                 goto out;
325         }
326
327         if (p.max_search == 0)
328                 goto out;
329
330         last_victim = sm->last_victim[p.gc_mode];
331         if (p.alloc_mode == LFS && gc_type == FG_GC) {
332                 p.min_segno = check_bg_victims(sbi);
333                 if (p.min_segno != NULL_SEGNO)
334                         goto got_it;
335         }
336
337         while (1) {
338                 unsigned long cost;
339                 unsigned int segno;
340
341                 segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
342                 if (segno >= last_segment) {
343                         if (sm->last_victim[p.gc_mode]) {
344                                 last_segment =
345                                         sm->last_victim[p.gc_mode];
346                                 sm->last_victim[p.gc_mode] = 0;
347                                 p.offset = 0;
348                                 continue;
349                         }
350                         break;
351                 }
352
353                 p.offset = segno + p.ofs_unit;
354                 if (p.ofs_unit > 1) {
355                         p.offset -= segno % p.ofs_unit;
356                         nsearched += count_bits(p.dirty_segmap,
357                                                 p.offset - p.ofs_unit,
358                                                 p.ofs_unit);
359                 } else {
360                         nsearched++;
361                 }
362
363                 secno = GET_SEC_FROM_SEG(sbi, segno);
364
365                 if (sec_usage_check(sbi, secno))
366                         goto next;
367                 if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
368                         goto next;
369
370                 cost = get_gc_cost(sbi, segno, &p);
371
372                 if (p.min_cost > cost) {
373                         p.min_segno = segno;
374                         p.min_cost = cost;
375                 }
376 next:
377                 if (nsearched >= p.max_search) {
378                         if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
379                                 sm->last_victim[p.gc_mode] = last_victim + 1;
380                         else
381                                 sm->last_victim[p.gc_mode] = segno + 1;
382                         sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi);
383                         break;
384                 }
385         }
386         if (p.min_segno != NULL_SEGNO) {
387 got_it:
388                 if (p.alloc_mode == LFS) {
389                         secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
390                         if (gc_type == FG_GC)
391                                 sbi->cur_victim_sec = secno;
392                         else
393                                 set_bit(secno, dirty_i->victim_secmap);
394                 }
395                 *result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
396
397                 trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
398                                 sbi->cur_victim_sec,
399                                 prefree_segments(sbi), free_segments(sbi));
400         }
401 out:
402         mutex_unlock(&dirty_i->seglist_lock);
403
404         return (p.min_segno == NULL_SEGNO) ? 0 : 1;
405 }
406
407 static const struct victim_selection default_v_ops = {
408         .get_victim = get_victim_by_default,
409 };
410
411 static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
412 {
413         struct inode_entry *ie;
414
415         ie = radix_tree_lookup(&gc_list->iroot, ino);
416         if (ie)
417                 return ie->inode;
418         return NULL;
419 }
420
421 static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
422 {
423         struct inode_entry *new_ie;
424
425         if (inode == find_gc_inode(gc_list, inode->i_ino)) {
426                 iput(inode);
427                 return;
428         }
429         new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS);
430         new_ie->inode = inode;
431
432         f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
433         list_add_tail(&new_ie->list, &gc_list->ilist);
434 }
435
436 static void put_gc_inode(struct gc_inode_list *gc_list)
437 {
438         struct inode_entry *ie, *next_ie;
439         list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
440                 radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
441                 iput(ie->inode);
442                 list_del(&ie->list);
443                 kmem_cache_free(f2fs_inode_entry_slab, ie);
444         }
445 }
446
447 static int check_valid_map(struct f2fs_sb_info *sbi,
448                                 unsigned int segno, int offset)
449 {
450         struct sit_info *sit_i = SIT_I(sbi);
451         struct seg_entry *sentry;
452         int ret;
453
454         down_read(&sit_i->sentry_lock);
455         sentry = get_seg_entry(sbi, segno);
456         ret = f2fs_test_bit(offset, sentry->cur_valid_map);
457         up_read(&sit_i->sentry_lock);
458         return ret;
459 }
460
461 /*
462  * This function compares node address got in summary with that in NAT.
463  * On validity, copy that node with cold status, otherwise (invalid node)
464  * ignore that.
465  */
466 static void gc_node_segment(struct f2fs_sb_info *sbi,
467                 struct f2fs_summary *sum, unsigned int segno, int gc_type)
468 {
469         struct f2fs_summary *entry;
470         block_t start_addr;
471         int off;
472         int phase = 0;
473         bool fggc = (gc_type == FG_GC);
474
475         start_addr = START_BLOCK(sbi, segno);
476
477 next_step:
478         entry = sum;
479
480         if (fggc && phase == 2)
481                 atomic_inc(&sbi->wb_sync_req[NODE]);
482
483         for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
484                 nid_t nid = le32_to_cpu(entry->nid);
485                 struct page *node_page;
486                 struct node_info ni;
487
488                 /* stop BG_GC if there is not enough free sections. */
489                 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
490                         return;
491
492                 if (check_valid_map(sbi, segno, off) == 0)
493                         continue;
494
495                 if (phase == 0) {
496                         f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
497                                                         META_NAT, true);
498                         continue;
499                 }
500
501                 if (phase == 1) {
502                         f2fs_ra_node_page(sbi, nid);
503                         continue;
504                 }
505
506                 /* phase == 2 */
507                 node_page = f2fs_get_node_page(sbi, nid);
508                 if (IS_ERR(node_page))
509                         continue;
510
511                 /* block may become invalid during f2fs_get_node_page */
512                 if (check_valid_map(sbi, segno, off) == 0) {
513                         f2fs_put_page(node_page, 1);
514                         continue;
515                 }
516
517                 if (f2fs_get_node_info(sbi, nid, &ni)) {
518                         f2fs_put_page(node_page, 1);
519                         continue;
520                 }
521
522                 if (ni.blk_addr != start_addr + off) {
523                         f2fs_put_page(node_page, 1);
524                         continue;
525                 }
526
527                 f2fs_move_node_page(node_page, gc_type);
528                 stat_inc_node_blk_count(sbi, 1, gc_type);
529         }
530
531         if (++phase < 3)
532                 goto next_step;
533
534         if (fggc)
535                 atomic_dec(&sbi->wb_sync_req[NODE]);
536 }
537
538 /*
539  * Calculate start block index indicating the given node offset.
540  * Be careful, caller should give this node offset only indicating direct node
541  * blocks. If any node offsets, which point the other types of node blocks such
542  * as indirect or double indirect node blocks, are given, it must be a caller's
543  * bug.
544  */
545 block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
546 {
547         unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
548         unsigned int bidx;
549
550         if (node_ofs == 0)
551                 return 0;
552
553         if (node_ofs <= 2) {
554                 bidx = node_ofs - 1;
555         } else if (node_ofs <= indirect_blks) {
556                 int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
557                 bidx = node_ofs - 2 - dec;
558         } else {
559                 int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
560                 bidx = node_ofs - 5 - dec;
561         }
562         return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode);
563 }
564
565 static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
566                 struct node_info *dni, block_t blkaddr, unsigned int *nofs)
567 {
568         struct page *node_page;
569         nid_t nid;
570         unsigned int ofs_in_node;
571         block_t source_blkaddr;
572
573         nid = le32_to_cpu(sum->nid);
574         ofs_in_node = le16_to_cpu(sum->ofs_in_node);
575
576         node_page = f2fs_get_node_page(sbi, nid);
577         if (IS_ERR(node_page))
578                 return false;
579
580         if (f2fs_get_node_info(sbi, nid, dni)) {
581                 f2fs_put_page(node_page, 1);
582                 return false;
583         }
584
585         if (sum->version != dni->version) {
586                 f2fs_msg(sbi->sb, KERN_WARNING,
587                                 "%s: valid data with mismatched node version.",
588                                 __func__);
589                 set_sbi_flag(sbi, SBI_NEED_FSCK);
590         }
591
592         *nofs = ofs_of_node(node_page);
593         source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node);
594         f2fs_put_page(node_page, 1);
595
596         if (source_blkaddr != blkaddr)
597                 return false;
598         return true;
599 }
600
601 static int ra_data_block(struct inode *inode, pgoff_t index)
602 {
603         struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
604         struct address_space *mapping = inode->i_mapping;
605         struct dnode_of_data dn;
606         struct page *page;
607         struct extent_info ei = {0, 0, 0};
608         struct f2fs_io_info fio = {
609                 .sbi = sbi,
610                 .ino = inode->i_ino,
611                 .type = DATA,
612                 .temp = COLD,
613                 .op = REQ_OP_READ,
614                 .op_flags = 0,
615                 .encrypted_page = NULL,
616                 .in_list = false,
617                 .retry = false,
618         };
619         int err;
620
621         page = f2fs_grab_cache_page(mapping, index, true);
622         if (!page)
623                 return -ENOMEM;
624
625         if (f2fs_lookup_extent_cache(inode, index, &ei)) {
626                 dn.data_blkaddr = ei.blk + index - ei.fofs;
627                 goto got_it;
628         }
629
630         set_new_dnode(&dn, inode, NULL, NULL, 0);
631         err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
632         if (err)
633                 goto put_page;
634         f2fs_put_dnode(&dn);
635
636         if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
637                                                 DATA_GENERIC))) {
638                 err = -EFSCORRUPTED;
639                 goto put_page;
640         }
641 got_it:
642         /* read page */
643         fio.page = page;
644         fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
645
646         fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi),
647                                         dn.data_blkaddr,
648                                         FGP_LOCK | FGP_CREAT, GFP_NOFS);
649         if (!fio.encrypted_page) {
650                 err = -ENOMEM;
651                 goto put_page;
652         }
653
654         err = f2fs_submit_page_bio(&fio);
655         if (err)
656                 goto put_encrypted_page;
657         f2fs_put_page(fio.encrypted_page, 0);
658         f2fs_put_page(page, 1);
659         return 0;
660 put_encrypted_page:
661         f2fs_put_page(fio.encrypted_page, 1);
662 put_page:
663         f2fs_put_page(page, 1);
664         return err;
665 }
666
667 /*
668  * Move data block via META_MAPPING while keeping locked data page.
669  * This can be used to move blocks, aka LBAs, directly on disk.
670  */
671 static void move_data_block(struct inode *inode, block_t bidx,
672                                 int gc_type, unsigned int segno, int off)
673 {
674         struct f2fs_io_info fio = {
675                 .sbi = F2FS_I_SB(inode),
676                 .ino = inode->i_ino,
677                 .type = DATA,
678                 .temp = COLD,
679                 .op = REQ_OP_READ,
680                 .op_flags = 0,
681                 .encrypted_page = NULL,
682                 .in_list = false,
683                 .retry = false,
684         };
685         struct dnode_of_data dn;
686         struct f2fs_summary sum;
687         struct node_info ni;
688         struct page *page, *mpage;
689         block_t newaddr;
690         int err;
691         bool lfs_mode = test_opt(fio.sbi, LFS);
692
693         /* do not read out */
694         page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
695         if (!page)
696                 return;
697
698         if (!check_valid_map(F2FS_I_SB(inode), segno, off))
699                 goto out;
700
701         if (f2fs_is_atomic_file(inode)) {
702                 F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
703                 F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
704                 goto out;
705         }
706
707         if (f2fs_is_pinned_file(inode)) {
708                 f2fs_pin_file_control(inode, true);
709                 goto out;
710         }
711
712         set_new_dnode(&dn, inode, NULL, NULL, 0);
713         err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
714         if (err)
715                 goto out;
716
717         if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
718                 ClearPageUptodate(page);
719                 goto put_out;
720         }
721
722         /*
723          * don't cache encrypted data into meta inode until previous dirty
724          * data were writebacked to avoid racing between GC and flush.
725          */
726         f2fs_wait_on_page_writeback(page, DATA, true);
727
728         err = f2fs_get_node_info(fio.sbi, dn.nid, &ni);
729         if (err)
730                 goto put_out;
731
732         set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
733
734         /* read page */
735         fio.page = page;
736         fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
737
738         if (lfs_mode)
739                 down_write(&fio.sbi->io_order_lock);
740
741         f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
742                                         &sum, CURSEG_COLD_DATA, NULL, false);
743
744         fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
745                                 newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
746         if (!fio.encrypted_page) {
747                 err = -ENOMEM;
748                 goto recover_block;
749         }
750
751         mpage = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
752                                         fio.old_blkaddr, FGP_LOCK, GFP_NOFS);
753         if (mpage) {
754                 bool updated = false;
755
756                 if (PageUptodate(mpage)) {
757                         memcpy(page_address(fio.encrypted_page),
758                                         page_address(mpage), PAGE_SIZE);
759                         updated = true;
760                 }
761                 f2fs_put_page(mpage, 1);
762                 invalidate_mapping_pages(META_MAPPING(fio.sbi),
763                                         fio.old_blkaddr, fio.old_blkaddr);
764                 if (updated)
765                         goto write_page;
766         }
767
768         err = f2fs_submit_page_bio(&fio);
769         if (err)
770                 goto put_page_out;
771
772         /* write page */
773         lock_page(fio.encrypted_page);
774
775         if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
776                 err = -EIO;
777                 goto put_page_out;
778         }
779         if (unlikely(!PageUptodate(fio.encrypted_page))) {
780                 err = -EIO;
781                 goto put_page_out;
782         }
783
784 write_page:
785         set_page_dirty(fio.encrypted_page);
786         f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true);
787         if (clear_page_dirty_for_io(fio.encrypted_page))
788                 dec_page_count(fio.sbi, F2FS_DIRTY_META);
789
790         set_page_writeback(fio.encrypted_page);
791         ClearPageError(page);
792
793         /* allocate block address */
794         f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
795
796         fio.op = REQ_OP_WRITE;
797         fio.op_flags = REQ_SYNC;
798         fio.new_blkaddr = newaddr;
799         f2fs_submit_page_write(&fio);
800         if (fio.retry) {
801                 if (PageWriteback(fio.encrypted_page))
802                         end_page_writeback(fio.encrypted_page);
803                 goto put_page_out;
804         }
805
806         f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE);
807
808         f2fs_update_data_blkaddr(&dn, newaddr);
809         set_inode_flag(inode, FI_APPEND_WRITE);
810         if (page->index == 0)
811                 set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
812 put_page_out:
813         f2fs_put_page(fio.encrypted_page, 1);
814 recover_block:
815         if (lfs_mode)
816                 up_write(&fio.sbi->io_order_lock);
817         if (err)
818                 f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
819                                                                 true, true);
820 put_out:
821         f2fs_put_dnode(&dn);
822 out:
823         f2fs_put_page(page, 1);
824 }
825
826 static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
827                                                         unsigned int segno, int off)
828 {
829         struct page *page;
830
831         page = f2fs_get_lock_data_page(inode, bidx, true);
832         if (IS_ERR(page))
833                 return;
834
835         if (!check_valid_map(F2FS_I_SB(inode), segno, off))
836                 goto out;
837
838         if (f2fs_is_atomic_file(inode)) {
839                 F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
840                 F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
841                 goto out;
842         }
843         if (f2fs_is_pinned_file(inode)) {
844                 if (gc_type == FG_GC)
845                         f2fs_pin_file_control(inode, true);
846                 goto out;
847         }
848
849         if (gc_type == BG_GC) {
850                 if (PageWriteback(page))
851                         goto out;
852                 set_page_dirty(page);
853                 set_cold_data(page);
854         } else {
855                 struct f2fs_io_info fio = {
856                         .sbi = F2FS_I_SB(inode),
857                         .ino = inode->i_ino,
858                         .type = DATA,
859                         .temp = COLD,
860                         .op = REQ_OP_WRITE,
861                         .op_flags = REQ_SYNC,
862                         .old_blkaddr = NULL_ADDR,
863                         .page = page,
864                         .encrypted_page = NULL,
865                         .need_lock = LOCK_REQ,
866                         .io_type = FS_GC_DATA_IO,
867                 };
868                 bool is_dirty = PageDirty(page);
869                 int err;
870
871 retry:
872                 set_page_dirty(page);
873                 f2fs_wait_on_page_writeback(page, DATA, true);
874                 if (clear_page_dirty_for_io(page)) {
875                         inode_dec_dirty_pages(inode);
876                         f2fs_remove_dirty_inode(inode);
877                 }
878
879                 set_cold_data(page);
880
881                 err = f2fs_do_write_data_page(&fio);
882                 if (err) {
883                         clear_cold_data(page);
884                         if (err == -ENOMEM) {
885                                 congestion_wait(BLK_RW_ASYNC, HZ/50);
886                                 goto retry;
887                         }
888                         if (is_dirty)
889                                 set_page_dirty(page);
890                 }
891         }
892 out:
893         f2fs_put_page(page, 1);
894 }
895
896 /*
897  * This function tries to get parent node of victim data block, and identifies
898  * data block validity. If the block is valid, copy that with cold status and
899  * modify parent node.
900  * If the parent node is not valid or the data block address is different,
901  * the victim data block is ignored.
902  */
903 static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
904                 struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
905 {
906         struct super_block *sb = sbi->sb;
907         struct f2fs_summary *entry;
908         block_t start_addr;
909         int off;
910         int phase = 0;
911
912         start_addr = START_BLOCK(sbi, segno);
913
914 next_step:
915         entry = sum;
916
917         for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
918                 struct page *data_page;
919                 struct inode *inode;
920                 struct node_info dni; /* dnode info for the data */
921                 unsigned int ofs_in_node, nofs;
922                 block_t start_bidx;
923                 nid_t nid = le32_to_cpu(entry->nid);
924
925                 /* stop BG_GC if there is not enough free sections. */
926                 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
927                         return;
928
929                 if (check_valid_map(sbi, segno, off) == 0)
930                         continue;
931
932                 if (phase == 0) {
933                         f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
934                                                         META_NAT, true);
935                         continue;
936                 }
937
938                 if (phase == 1) {
939                         f2fs_ra_node_page(sbi, nid);
940                         continue;
941                 }
942
943                 /* Get an inode by ino with checking validity */
944                 if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
945                         continue;
946
947                 if (phase == 2) {
948                         f2fs_ra_node_page(sbi, dni.ino);
949                         continue;
950                 }
951
952                 ofs_in_node = le16_to_cpu(entry->ofs_in_node);
953
954                 if (phase == 3) {
955                         inode = f2fs_iget(sb, dni.ino);
956                         if (IS_ERR(inode) || is_bad_inode(inode))
957                                 continue;
958
959                         if (!down_write_trylock(
960                                 &F2FS_I(inode)->i_gc_rwsem[WRITE])) {
961                                 iput(inode);
962                                 sbi->skipped_gc_rwsem++;
963                                 continue;
964                         }
965
966                         start_bidx = f2fs_start_bidx_of_node(nofs, inode) +
967                                                                 ofs_in_node;
968
969                         if (f2fs_post_read_required(inode)) {
970                                 int err = ra_data_block(inode, start_bidx);
971
972                                 up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
973                                 if (err) {
974                                         iput(inode);
975                                         continue;
976                                 }
977                                 add_gc_inode(gc_list, inode);
978                                 continue;
979                         }
980
981                         data_page = f2fs_get_read_data_page(inode,
982                                                 start_bidx, REQ_RAHEAD, true);
983                         up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
984                         if (IS_ERR(data_page)) {
985                                 iput(inode);
986                                 continue;
987                         }
988
989                         f2fs_put_page(data_page, 0);
990                         add_gc_inode(gc_list, inode);
991                         continue;
992                 }
993
994                 /* phase 4 */
995                 inode = find_gc_inode(gc_list, dni.ino);
996                 if (inode) {
997                         struct f2fs_inode_info *fi = F2FS_I(inode);
998                         bool locked = false;
999
1000                         if (S_ISREG(inode->i_mode)) {
1001                                 if (!down_write_trylock(&fi->i_gc_rwsem[READ])) {
1002                                         sbi->skipped_gc_rwsem++;
1003                                         continue;
1004                                 }
1005                                 if (!down_write_trylock(
1006                                                 &fi->i_gc_rwsem[WRITE])) {
1007                                         sbi->skipped_gc_rwsem++;
1008                                         up_write(&fi->i_gc_rwsem[READ]);
1009                                         continue;
1010                                 }
1011                                 locked = true;
1012
1013                                 /* wait for all inflight aio data */
1014                                 inode_dio_wait(inode);
1015                         }
1016
1017                         start_bidx = f2fs_start_bidx_of_node(nofs, inode)
1018                                                                 + ofs_in_node;
1019                         if (f2fs_post_read_required(inode))
1020                                 move_data_block(inode, start_bidx, gc_type,
1021                                                                 segno, off);
1022                         else
1023                                 move_data_page(inode, start_bidx, gc_type,
1024                                                                 segno, off);
1025
1026                         if (locked) {
1027                                 up_write(&fi->i_gc_rwsem[WRITE]);
1028                                 up_write(&fi->i_gc_rwsem[READ]);
1029                         }
1030
1031                         stat_inc_data_blk_count(sbi, 1, gc_type);
1032                 }
1033         }
1034
1035         if (++phase < 5)
1036                 goto next_step;
1037 }
1038
1039 static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
1040                         int gc_type)
1041 {
1042         struct sit_info *sit_i = SIT_I(sbi);
1043         int ret;
1044
1045         down_write(&sit_i->sentry_lock);
1046         ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
1047                                               NO_CHECK_TYPE, LFS);
1048         up_write(&sit_i->sentry_lock);
1049         return ret;
1050 }
1051
1052 static int do_garbage_collect(struct f2fs_sb_info *sbi,
1053                                 unsigned int start_segno,
1054                                 struct gc_inode_list *gc_list, int gc_type)
1055 {
1056         struct page *sum_page;
1057         struct f2fs_summary_block *sum;
1058         struct blk_plug plug;
1059         unsigned int segno = start_segno;
1060         unsigned int end_segno = start_segno + sbi->segs_per_sec;
1061         int seg_freed = 0;
1062         unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
1063                                                 SUM_TYPE_DATA : SUM_TYPE_NODE;
1064
1065         /* readahead multi ssa blocks those have contiguous address */
1066         if (sbi->segs_per_sec > 1)
1067                 f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
1068                                         sbi->segs_per_sec, META_SSA, true);
1069
1070         /* reference all summary page */
1071         while (segno < end_segno) {
1072                 sum_page = f2fs_get_sum_page(sbi, segno++);
1073                 unlock_page(sum_page);
1074         }
1075
1076         blk_start_plug(&plug);
1077
1078         for (segno = start_segno; segno < end_segno; segno++) {
1079
1080                 /* find segment summary of victim */
1081                 sum_page = find_get_page(META_MAPPING(sbi),
1082                                         GET_SUM_BLOCK(sbi, segno));
1083                 f2fs_put_page(sum_page, 0);
1084
1085                 if (get_valid_blocks(sbi, segno, false) == 0)
1086                         goto freed;
1087                 if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi)))
1088                         goto next;
1089
1090                 sum = page_address(sum_page);
1091                 if (type != GET_SUM_TYPE((&sum->footer))) {
1092                         f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent segment (%u) "
1093                                 "type [%d, %d] in SSA and SIT",
1094                                 segno, type, GET_SUM_TYPE((&sum->footer)));
1095                         set_sbi_flag(sbi, SBI_NEED_FSCK);
1096                         goto next;
1097                 }
1098
1099                 /*
1100                  * this is to avoid deadlock:
1101                  * - lock_page(sum_page)         - f2fs_replace_block
1102                  *  - check_valid_map()            - down_write(sentry_lock)
1103                  *   - down_read(sentry_lock)     - change_curseg()
1104                  *                                  - lock_page(sum_page)
1105                  */
1106                 if (type == SUM_TYPE_NODE)
1107                         gc_node_segment(sbi, sum->entries, segno, gc_type);
1108                 else
1109                         gc_data_segment(sbi, sum->entries, gc_list, segno,
1110                                                                 gc_type);
1111
1112                 stat_inc_seg_count(sbi, type, gc_type);
1113
1114 freed:
1115                 if (gc_type == FG_GC &&
1116                                 get_valid_blocks(sbi, segno, false) == 0)
1117                         seg_freed++;
1118 next:
1119                 f2fs_put_page(sum_page, 0);
1120         }
1121
1122         if (gc_type == FG_GC)
1123                 f2fs_submit_merged_write(sbi,
1124                                 (type == SUM_TYPE_NODE) ? NODE : DATA);
1125
1126         blk_finish_plug(&plug);
1127
1128         stat_inc_call_count(sbi->stat_info);
1129
1130         return seg_freed;
1131 }
1132
1133 int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
1134                         bool background, unsigned int segno)
1135 {
1136         int gc_type = sync ? FG_GC : BG_GC;
1137         int sec_freed = 0, seg_freed = 0, total_freed = 0;
1138         int ret = 0;
1139         struct cp_control cpc;
1140         unsigned int init_segno = segno;
1141         struct gc_inode_list gc_list = {
1142                 .ilist = LIST_HEAD_INIT(gc_list.ilist),
1143                 .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
1144         };
1145         unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC];
1146         unsigned long long first_skipped;
1147         unsigned int skipped_round = 0, round = 0;
1148
1149         trace_f2fs_gc_begin(sbi->sb, sync, background,
1150                                 get_pages(sbi, F2FS_DIRTY_NODES),
1151                                 get_pages(sbi, F2FS_DIRTY_DENTS),
1152                                 get_pages(sbi, F2FS_DIRTY_IMETA),
1153                                 free_sections(sbi),
1154                                 free_segments(sbi),
1155                                 reserved_segments(sbi),
1156                                 prefree_segments(sbi));
1157
1158         cpc.reason = __get_cp_reason(sbi);
1159         sbi->skipped_gc_rwsem = 0;
1160         first_skipped = last_skipped;
1161 gc_more:
1162         if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) {
1163                 ret = -EINVAL;
1164                 goto stop;
1165         }
1166         if (unlikely(f2fs_cp_error(sbi))) {
1167                 ret = -EIO;
1168                 goto stop;
1169         }
1170
1171         if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) {
1172                 /*
1173                  * For example, if there are many prefree_segments below given
1174                  * threshold, we can make them free by checkpoint. Then, we
1175                  * secure free segments which doesn't need fggc any more.
1176                  */
1177                 if (prefree_segments(sbi)) {
1178                         ret = f2fs_write_checkpoint(sbi, &cpc);
1179                         if (ret)
1180                                 goto stop;
1181                 }
1182                 if (has_not_enough_free_secs(sbi, 0, 0))
1183                         gc_type = FG_GC;
1184         }
1185
1186         /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
1187         if (gc_type == BG_GC && !background) {
1188                 ret = -EINVAL;
1189                 goto stop;
1190         }
1191         if (!__get_victim(sbi, &segno, gc_type)) {
1192                 ret = -ENODATA;
1193                 goto stop;
1194         }
1195
1196         seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
1197         if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
1198                 sec_freed++;
1199         total_freed += seg_freed;
1200
1201         if (gc_type == FG_GC) {
1202                 if (sbi->skipped_atomic_files[FG_GC] > last_skipped ||
1203                                                 sbi->skipped_gc_rwsem)
1204                         skipped_round++;
1205                 last_skipped = sbi->skipped_atomic_files[FG_GC];
1206                 round++;
1207         }
1208
1209         if (gc_type == FG_GC)
1210                 sbi->cur_victim_sec = NULL_SEGNO;
1211
1212         if (sync)
1213                 goto stop;
1214
1215         if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
1216                 if (skipped_round <= MAX_SKIP_GC_COUNT ||
1217                                         skipped_round * 2 < round) {
1218                         segno = NULL_SEGNO;
1219                         goto gc_more;
1220                 }
1221
1222                 if (first_skipped < last_skipped &&
1223                                 (last_skipped - first_skipped) >
1224                                                 sbi->skipped_gc_rwsem) {
1225                         f2fs_drop_inmem_pages_all(sbi, true);
1226                         segno = NULL_SEGNO;
1227                         goto gc_more;
1228                 }
1229                 if (gc_type == FG_GC)
1230                         ret = f2fs_write_checkpoint(sbi, &cpc);
1231         }
1232 stop:
1233         SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
1234         SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;
1235
1236         trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed,
1237                                 get_pages(sbi, F2FS_DIRTY_NODES),
1238                                 get_pages(sbi, F2FS_DIRTY_DENTS),
1239                                 get_pages(sbi, F2FS_DIRTY_IMETA),
1240                                 free_sections(sbi),
1241                                 free_segments(sbi),
1242                                 reserved_segments(sbi),
1243                                 prefree_segments(sbi));
1244
1245         mutex_unlock(&sbi->gc_mutex);
1246
1247         put_gc_inode(&gc_list);
1248
1249         if (sync && !ret)
1250                 ret = sec_freed ? 0 : -EAGAIN;
1251         return ret;
1252 }
1253
1254 void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
1255 {
1256         DIRTY_I(sbi)->v_ops = &default_v_ops;
1257
1258         sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES;
1259
1260         /* give warm/cold data area from slower device */
1261         if (f2fs_is_multi_device(sbi) && sbi->segs_per_sec == 1)
1262                 SIT_I(sbi)->last_victim[ALLOC_NEXT] =
1263                                 GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
1264 }