2 * Copyright (C) International Business Machines Corp., 2000-2004
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * jfs_imap.c: inode allocation map manager
23 * Each AG has a simple lock which is used to control the serialization of
24 * the AG level lists. This lock should be taken first whenever an AG
25 * level list will be modified or accessed.
27 * Each IAG is locked by obtaining the buffer for the IAG page.
29 * There is also an inode lock for the inode map inode. A read lock needs to
30 * be taken whenever an IAG is read from the map or the global level
31 * information is read. A write lock needs to be taken whenever the global
32 * level information is modified or an atomic operation needs to be used.
34 * If more than one IAG is read at one time, the read lock may not
35 * be given up until all of the IAG's are read. Otherwise, a deadlock
36 * may occur when trying to obtain the read lock while another thread
37 * holding the read lock is waiting on the IAG already being held.
39 * The control page of the inode map is read into memory by diMount().
40 * Thereafter it should only be modified in memory and then it will be
41 * written out when the filesystem is unmounted by diUnmount().
45 #include <linux/buffer_head.h>
46 #include <linux/pagemap.h>
47 #include <linux/quotaops.h>
48 #include <linux/slab.h>
50 #include "jfs_incore.h"
51 #include "jfs_inode.h"
52 #include "jfs_filsys.h"
53 #include "jfs_dinode.h"
56 #include "jfs_metapage.h"
57 #include "jfs_superblock.h"
58 #include "jfs_debug.h"
63 /* iag free list lock: wrappers around the mutex imap->im_freelock */
64 #define IAGFREE_LOCK_INIT(imap) mutex_init(&(imap)->im_freelock)
65 #define IAGFREE_LOCK(imap) mutex_lock(&(imap)->im_freelock)
66 #define IAGFREE_UNLOCK(imap) mutex_unlock(&(imap)->im_freelock)
68 /* per ag iag list locks: wrappers around the mutex array imap->im_aglock[] */
69 #define AG_LOCK_INIT(imap,index) mutex_init(&((imap)->im_aglock[(index)]))
70 #define AG_LOCK(imap,agno) mutex_lock(&(imap)->im_aglock[(agno)])
71 #define AG_UNLOCK(imap,agno) mutex_unlock(&(imap)->im_aglock[(agno)])
/* Forward declarations of the file-local (static) helper routines. */
76 static int diAllocAG(struct inomap *, int, bool, struct inode *);
77 static int diAllocAny(struct inomap *, int, bool, struct inode *);
78 static int diAllocBit(struct inomap *, struct iag *, int);
79 static int diAllocExt(struct inomap *, int, struct inode *);
80 static int diAllocIno(struct inomap *, int, struct inode *);
81 static int diFindFree(u32, int);
82 static int diNewExt(struct inomap *, struct iag *, int);
83 static int diNewIAG(struct inomap *, int *, int, struct metapage **);
84 static void duplicateIXtree(struct super_block *, s64, int, s64 *);
86 static int diIAGRead(struct inomap * imap, int, struct metapage **);
87 static int copy_from_dinode(struct dinode *, struct inode *);
88 static void copy_to_dinode(struct dinode *, struct inode *);
93 * FUNCTION: initialize the incore inode map control structures for
94 * a fileset or aggregate init time.
96 * the inode map's control structure (dinomap) is
97 * brought in from disk and placed in virtual memory.
100 * ipimap - pointer to inode map inode for the aggregate or fileset.
104 * -ENOMEM - insufficient free virtual memory.
/*
 * diMount() - set up the in-memory inode map control structure
 * (struct inomap) for the inode map inode ipimap.
 *
 * Visible steps: kmalloc() the inomap, read the control page at IMAPBLKNO
 * with read_metapage(), convert each little-endian dinomap_disk field to
 * CPU order (the global fields plus the inofree/extfree/numinos/numfree
 * entry of each of the MAXAG allocation groups), release the page,
 * initialise the iag free list mutex and the per-AG mutexes, and point
 * imap and ipimap at each other.
 *
 * NOTE(review): this listing is missing lines (local declarations, braces,
 * NULL checks and error returns); confirm the failure paths against the
 * complete source.
 */
107 int diMount(struct inode *ipimap)
112 struct dinomap_disk *dinom_le;
115 * allocate/initialize the in-memory inode map control structure
117 /* allocate the in-memory inode map control structure. */
118 imap = kmalloc(sizeof(struct inomap), GFP_KERNEL);
120 jfs_err("diMount: kmalloc returned NULL!");
124 /* read the on-disk inode map control structure. */
126 mp = read_metapage(ipimap,
127 IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
134 /* copy the on-disk version to the in-memory version. */
135 dinom_le = (struct dinomap_disk *) mp->data;
136 imap->im_freeiag = le32_to_cpu(dinom_le->in_freeiag);
137 imap->im_nextiag = le32_to_cpu(dinom_le->in_nextiag);
138 atomic_set(&imap->im_numinos, le32_to_cpu(dinom_le->in_numinos));
139 atomic_set(&imap->im_numfree, le32_to_cpu(dinom_le->in_numfree));
140 imap->im_nbperiext = le32_to_cpu(dinom_le->in_nbperiext);
141 imap->im_l2nbperiext = le32_to_cpu(dinom_le->in_l2nbperiext);
142 for (index = 0; index < MAXAG; index++) {
143 imap->im_agctl[index].inofree =
144 le32_to_cpu(dinom_le->in_agctl[index].inofree);
145 imap->im_agctl[index].extfree =
146 le32_to_cpu(dinom_le->in_agctl[index].extfree);
147 imap->im_agctl[index].numinos =
148 le32_to_cpu(dinom_le->in_agctl[index].numinos);
149 imap->im_agctl[index].numfree =
150 le32_to_cpu(dinom_le->in_agctl[index].numfree);
153 /* release the buffer. */
154 release_metapage(mp);
157 * allocate/initialize inode allocation map locks
159 /* allocate and init iag free list lock */
160 IAGFREE_LOCK_INIT(imap);
162 /* allocate and init ag list locks */
163 for (index = 0; index < MAXAG; index++) {
164 AG_LOCK_INIT(imap, index);
167 /* bind the inode map inode and inode map control structure
170 imap->im_ipimap = ipimap;
171 JFS_IP(ipimap)->i_imap = imap;
180 * FUNCTION: write to disk the incore inode map control structures for
181 * a fileset or aggregate at unmount time.
184 * ipimap - pointer to inode map inode for the aggregate or fileset.
188 * -ENOMEM - insufficient free virtual memory.
/*
 * diUnmount() - tear down the in-memory inode map of ipimap at unmount.
 *
 * Unless mounterror is set or isReadOnly(ipimap) is true, the statement
 * guarded by that test runs first. That statement is not visible in this
 * listing; presumably it is the diSync() write-back - TODO confirm.
 * The page cache of the map inode is then dropped with
 * truncate_inode_pages() and the i_imap pointer is cleared.
 *
 * NOTE(review): the line that frees the control structure is not visible
 * here; only the comment announcing it survives.
 */
191 int diUnmount(struct inode *ipimap, int mounterror)
193 struct inomap *imap = JFS_IP(ipimap)->i_imap;
196 * update the on-disk inode map control structure
199 if (!(mounterror || isReadOnly(ipimap)))
203 * Invalidate the page cache buffers
205 truncate_inode_pages(ipimap->i_mapping, 0);
208 * free in-memory control structure
211 JFS_IP(ipimap)->i_imap = NULL;
/*
 * diSync() - copy the in-memory inode map control data back into its
 * on-disk control page.
 *
 * Gets the metapage at IMAPBLKNO with get_metapage(), stores every inomap
 * field into the little-endian dinomap_disk layout (the reverse of the
 * conversion done in diMount()), then flushes the dirty pages of the map
 * with filemap_write_and_wait() and calls diWriteSpecial(ipimap, 0).
 *
 * NOTE(review): the call that writes the control metapage itself and the
 * return statements are not visible in this listing.
 */
220 int diSync(struct inode *ipimap)
222 struct dinomap_disk *dinom_le;
223 struct inomap *imp = JFS_IP(ipimap)->i_imap;
228 * write imap global conrol page
230 /* read the on-disk inode map control structure */
231 mp = get_metapage(ipimap,
232 IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
235 jfs_err("diSync: get_metapage failed!");
239 /* copy the in-memory version to the on-disk version */
240 dinom_le = (struct dinomap_disk *) mp->data;
241 dinom_le->in_freeiag = cpu_to_le32(imp->im_freeiag);
242 dinom_le->in_nextiag = cpu_to_le32(imp->im_nextiag);
243 dinom_le->in_numinos = cpu_to_le32(atomic_read(&imp->im_numinos));
244 dinom_le->in_numfree = cpu_to_le32(atomic_read(&imp->im_numfree));
245 dinom_le->in_nbperiext = cpu_to_le32(imp->im_nbperiext);
246 dinom_le->in_l2nbperiext = cpu_to_le32(imp->im_l2nbperiext);
247 for (index = 0; index < MAXAG; index++) {
248 dinom_le->in_agctl[index].inofree =
249 cpu_to_le32(imp->im_agctl[index].inofree);
250 dinom_le->in_agctl[index].extfree =
251 cpu_to_le32(imp->im_agctl[index].extfree);
252 dinom_le->in_agctl[index].numinos =
253 cpu_to_le32(imp->im_agctl[index].numinos);
254 dinom_le->in_agctl[index].numfree =
255 cpu_to_le32(imp->im_agctl[index].numfree);
258 /* write out the control structure */
262 * write out dirty pages of imap
264 filemap_write_and_wait(ipimap->i_mapping);
266 diWriteSpecial(ipimap, 0);
275 * FUNCTION: initialize an incore inode from disk.
277 * on entry, the specified incore inode should itself
278 * specify the disk inode number corresponding to the
279 * incore inode (i.e. i_number should be initialized).
281 * this routine handles incore inode initialization for
282 * both "special" and "regular" inodes. special inodes
283 * are those required early in the mount process and
284 * require special handling since much of the file system
285 * is not yet initialized. these "special" inodes are
286 * identified by a NULL inode map inode pointer and are
287 * actually initialized by a call to diReadSpecial().
289 * for regular inodes, the iag describing the disk inode
290 * is read from disk to determine the inode extent address
291 * for the disk inode. with the inode extent address in
292 * hand, the page of the extent that contains the disk
293 * inode is read and the disk inode is copied to the
297 * ip - pointer to incore inode to be initialized from disk.
302 * -ENOMEM - insufficient memory
/*
 * diRead() - fill in the in-memory inode ip from its on-disk inode.
 *
 * The IAG covering ip->i_ino is read while the inode map read lock is
 * held, and the inode extent for the inode is looked up in it. An extent
 * whose length differs from im_nbperiext, or whose address is 0, is
 * rejected after releasing the IAG page. Otherwise the page holding the
 * disk inode is read, di_number is compared with i_ino, and
 * copy_from_dinode() populates ip. agstart is taken from the IAG and
 * active_ag is set to -1.
 *
 * rel_inode is the index of the inode within its page; when the extent
 * does not start on a page boundary (block_offset != 0) that index is
 * adjusted using inodes_left.
 *
 * NOTE(review): several declarations, branches and return statements are
 * missing from this listing.
 */
305 int diRead(struct inode *ip)
307 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
308 int iagno, ino, extno, rc;
309 struct inode *ipimap;
317 unsigned long pageno;
320 jfs_info("diRead: ino = %ld", ip->i_ino);
322 ipimap = sbi->ipimap;
323 JFS_IP(ip)->ipimap = ipimap;
325 /* determine the iag number for this inode (number) */
326 iagno = INOTOIAG(ip->i_ino);
329 imap = JFS_IP(ipimap)->i_imap;
330 IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
331 rc = diIAGRead(imap, iagno, &mp);
332 IREAD_UNLOCK(ipimap);
334 jfs_err("diRead: diIAGRead returned %d", rc);
338 iagp = (struct iag *) mp->data;
340 /* determine inode extent that holds the disk inode */
341 ino = ip->i_ino & (INOSPERIAG - 1);
342 extno = ino >> L2INOSPEREXT;
344 if ((lengthPXD(&iagp->inoext[extno]) != imap->im_nbperiext) ||
345 (addressPXD(&iagp->inoext[extno]) == 0)) {
346 release_metapage(mp);
350 /* get disk block number of the page within the inode extent
351 * that holds the disk inode.
353 blkno = INOPBLK(&iagp->inoext[extno], ino, sbi->l2nbperpage);
355 /* get the ag for the iag */
356 agstart = le64_to_cpu(iagp->agstart);
358 release_metapage(mp);
360 rel_inode = (ino & (INOSPERPAGE - 1));
361 pageno = blkno >> sbi->l2nbperpage;
363 if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) {
365 * OS/2 didn't always align inode extents on page boundaries
368 (sbi->nbperpage - block_offset) << sbi->l2niperblk;
370 if (rel_inode < inodes_left)
371 rel_inode += block_offset << sbi->l2niperblk;
374 rel_inode -= inodes_left;
378 /* read the page of disk inode */
379 mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
381 jfs_err("diRead: read_metapage failed");
385 /* locate the disk inode requested */
386 dp = (struct dinode *) mp->data;
389 if (ip->i_ino != le32_to_cpu(dp->di_number)) {
390 jfs_error(ip->i_sb, "i_ino != di_number\n");
392 } else if (le32_to_cpu(dp->di_nlink) == 0)
395 /* copy the disk inode to the in-memory inode */
396 rc = copy_from_dinode(dp, ip);
398 release_metapage(mp);
400 /* set the ag for the inode */
401 JFS_IP(ip)->agstart = agstart;
402 JFS_IP(ip)->active_ag = -1;
409 * NAME: diReadSpecial()
411 * FUNCTION: initialize a 'special' inode from disk.
413 * this routine handles aggregate level inodes. The
414 * inode cache cannot differentiate between the
415 * aggregate inodes and the filesystem inodes, so we
416 * handle these here. We don't actually use the aggregate
417 * inode map, since these inodes are at a fixed location
418 * and in some cases the aggregate inode map isn't initialized
422 * sb - filesystem superblock
423 * inum - aggregate inode number
424 * secondary - 1 if secondary aggregate inode table
427 * new inode - success
/*
 * diReadSpecial() - build an in-memory inode for aggregate inode inum by
 * reading it directly from an aggregate inode table page.
 *
 * Two table locations appear below: the extent recorded in sbi->ait2
 * (paired with sbi->ipaimap2) and the fixed AITBL_OFF location (paired
 * with sbi->ipaimap). The test choosing between them is not visible in
 * this listing; presumably it is the secondary argument - TODO confirm.
 * The page index is advanced by inum >> 3 and the dinode is selected with
 * inum % 8 (8 inodes per 4K page).
 *
 * After a successful copy_from_dinode() the mapping gets
 * jfs_metapage_aops and a GFP_NOFS mask, S_NOQUOTA is set, and the inode
 * is made to look hashed with hlist_add_fake(). When inum is FILESYSTEM_I
 * and the inode belongs to sbi->ipaimap, gengen and inostamp are cached in
 * the superblock info.
 */
430 struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
432 struct jfs_sb_info *sbi = JFS_SBI(sb);
440 jfs_err("diReadSpecial: new_inode returned NULL!");
445 address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
446 JFS_IP(ip)->ipimap = sbi->ipaimap2;
448 address = AITBL_OFF >> L2PSIZE;
449 JFS_IP(ip)->ipimap = sbi->ipaimap;
452 ASSERT(inum < INOSPEREXT);
456 address += inum >> 3; /* 8 inodes per 4K page */
458 /* read the page of fixed disk inode (AIT) in raw mode */
459 mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
461 set_nlink(ip, 1); /* Don't want iput() deleting it */
466 /* get the pointer to the disk inode of interest */
467 dp = (struct dinode *) (mp->data);
468 dp += inum % 8; /* 8 inodes per 4K page */
470 /* copy on-disk inode to in-memory inode */
471 if ((copy_from_dinode(dp, ip)) != 0) {
472 /* handle bad return by returning NULL for ip */
473 set_nlink(ip, 1); /* Don't want iput() deleting it */
475 /* release the page */
476 release_metapage(mp);
481 ip->i_mapping->a_ops = &jfs_metapage_aops;
482 mapping_set_gfp_mask(ip->i_mapping, GFP_NOFS);
484 /* Allocations to metadata inodes should not affect quotas */
485 ip->i_flags |= S_NOQUOTA;
487 if ((inum == FILESYSTEM_I) && (JFS_IP(ip)->ipimap == sbi->ipaimap)) {
488 sbi->gengen = le32_to_cpu(dp->di_gengen);
489 sbi->inostamp = le32_to_cpu(dp->di_inostamp);
492 /* release the page */
493 release_metapage(mp);
496 * __mark_inode_dirty expects inodes to be hashed. Since we don't
497 * want special inodes in the fileset inode space, we make them
498 * appear hashed, but do not put on any lists. hlist_del()
499 * will work fine and require no locking.
501 hlist_add_fake(&ip->i_hash);
507 * NAME: diWriteSpecial()
509 * FUNCTION: Write the special inode to disk
513 * secondary - 1 if secondary aggregate inode table
515 * RETURN VALUES: none
/*
 * diWriteSpecial() - copy the special (aggregate) inode ip into its slot
 * in the aggregate inode table page.
 *
 * The page is located the same way diReadSpecial() does it: a base
 * address taken either from sbi->ait2 or from AITBL_OFF, advanced by
 * inum >> 3, with the slot chosen by inum % 8. The base inode is copied
 * with copy_to_dinode() and 288 bytes of i_xtroot are copied into
 * di_xtroot. For FILESYSTEM_I the gengen value of the superblock info is
 * stored as well.
 *
 * NOTE(review): the test selecting the table and the call that writes the
 * page back are not visible in this listing.
 */
518 void diWriteSpecial(struct inode *ip, int secondary)
520 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
523 ino_t inum = ip->i_ino;
527 address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
529 address = AITBL_OFF >> L2PSIZE;
531 ASSERT(inum < INOSPEREXT);
533 address += inum >> 3; /* 8 inodes per 4K page */
535 /* read the page of fixed disk inode (AIT) in raw mode */
536 mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
538 jfs_err("diWriteSpecial: failed to read aggregate inode extent!");
542 /* get the pointer to the disk inode of interest */
543 dp = (struct dinode *) (mp->data);
544 dp += inum % 8; /* 8 inodes per 4K page */
546 /* copy in-memory inode to on-disk inode */
547 copy_to_dinode(dp, ip);
548 memcpy(&dp->di_xtroot, &JFS_IP(ip)->i_xtroot, 288);
550 if (inum == FILESYSTEM_I)
551 dp->di_gengen = cpu_to_le32(sbi->gengen);
558 * NAME: diFreeSpecial()
560 * FUNCTION: Free allocated space for special inode
/*
 * diFreeSpecial() - release a special inode.
 *
 * A NULL ip is reported with jfs_err(). Otherwise the mapping of the inode
 * is written back and waited on, and its pages are then dropped with
 * truncate_inode_pages().
 *
 * NOTE(review): the NULL test itself and whatever follows the truncate are
 * not visible in this listing.
 */
562 void diFreeSpecial(struct inode *ip)
565 jfs_err("diFreeSpecial called with NULL ip!");
568 filemap_write_and_wait(ip->i_mapping);
569 truncate_inode_pages(ip->i_mapping, 0);
578 * FUNCTION: write the on-disk inode portion of the in-memory inode
579 * to its corresponding on-disk inode.
581 * on entry, the specified incore inode should itself
582 * specify the disk inode number corresponding to the
583 * incore inode (i.e. i_number should be initialized).
585 * the inode contains the inode extent address for the disk
586 * inode. with the inode extent address in hand, the
587 * page of the extent that contains the disk inode is
588 * read and the disk inode portion of the incore inode
589 * is copied to the disk inode.
592 * tid - transaction id
593 * ip - pointer to incore inode to be written to the inode extent.
/*
 * diWrite() - copy the in-memory inode ip into its on-disk inode page
 * under transaction tid.
 *
 * The inode extent descriptor jfs_ip->ixpxd is validated (non-zero
 * address, length equal to im_nbperiext), the page holding the disk inode
 * is read through ipimap, and a tlckINODE | tlckENTRY transaction lock is
 * taken on that page (on ipimap, not on ip). The copies visible below are:
 *   - directory with xtlid set: the xtree root is copied from i_xtroot
 *     into di_dirtable, one linelock vector at a time, and the XAD_NEW and
 *     XAD_EXTENDED flags are cleared on the on-disk copy;
 *   - blid set: the xtree root or the dtree root is copied into the
 *     dinode, again one linelock vector at a time;
 *   - symlink with i_size < IDATASIZE: i_inline is copied to
 *     di_fastsymlink;
 *   - COMMIT_Inlineea set: i_inline_ea is copied to di_inlineea and the
 *     flag is cleared;
 *   - base inode: copy_to_dinode(), plus 96 bytes of i_dirtable when
 *     COMMIT_Dirtable was set.
 *
 * dioffset is the byte offset of the dinode inside its page; the linelock
 * vector offsets are that byte offset shifted by L2INODESLOTSIZE.
 *
 * NOTE(review): many lines (declarations, braces, lv length and index
 * updates, returns) are missing from this listing.
 */
599 int diWrite(tid_t tid, struct inode *ip)
601 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
602 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
610 unsigned long pageno;
613 struct inode *ipimap;
616 struct tlock *ditlck, *tlck;
617 struct linelock *dilinelock, *ilinelock;
621 ipimap = jfs_ip->ipimap;
623 ino = ip->i_ino & (INOSPERIAG - 1);
625 if (!addressPXD(&(jfs_ip->ixpxd)) ||
626 (lengthPXD(&(jfs_ip->ixpxd)) !=
627 JFS_IP(ipimap)->i_imap->im_nbperiext)) {
628 jfs_error(ip->i_sb, "ixpxd invalid\n");
633 * read the page of disk inode containing the specified inode:
635 /* compute the block address of the page */
636 blkno = INOPBLK(&(jfs_ip->ixpxd), ino, sbi->l2nbperpage);
638 rel_inode = (ino & (INOSPERPAGE - 1));
639 pageno = blkno >> sbi->l2nbperpage;
641 if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) {
643 * OS/2 didn't always align inode extents on page boundaries
646 (sbi->nbperpage - block_offset) << sbi->l2niperblk;
648 if (rel_inode < inodes_left)
649 rel_inode += block_offset << sbi->l2niperblk;
652 rel_inode -= inodes_left;
655 /* read the page of disk inode */
657 mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
661 /* get the pointer to the disk inode */
662 dp = (struct dinode *) mp->data;
665 dioffset = (ino & (INOSPERPAGE - 1)) << L2DISIZE;
668 * acquire transaction lock on the on-disk inode;
669 * N.B. tlock is acquired on ipimap not ip;
672 txLock(tid, ipimap, mp, tlckINODE | tlckENTRY)) == NULL)
674 dilinelock = (struct linelock *) & ditlck->lock;
677 * copy btree root from in-memory inode to on-disk inode
679 * (tlock is taken from inline B+-tree root in in-memory
680 * inode when the B+-tree root is updated, which is pointed
681 * by jfs_ip->blid as well as being on tx tlock list)
683 * further processing of btree root is based on the copy
684 * in in-memory inode, where txLog() will log from, and,
685 * for xtree root, txUpdateMap() will update map and reset
689 if (S_ISDIR(ip->i_mode) && (lid = jfs_ip->xtlid)) {
691 * This is the special xtree inside the directory for storing
692 * the directory table
698 tlck = lid_to_tlock(lid);
699 assert(tlck->type & tlckXTREE);
700 tlck->type |= tlckBTROOT;
702 ilinelock = (struct linelock *) & tlck->lock;
705 * copy xtree root from inode to dinode:
707 p = &jfs_ip->i_xtroot;
708 xp = (xtpage_t *) &dp->di_dirtable;
710 for (n = 0; n < ilinelock->index; n++, lv++) {
711 memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
712 lv->length << L2XTSLOTSIZE);
715 /* reset on-disk (metadata page) xtree XAD_NEW bit */
716 xad = &xp->xad[XTENTRYSTART];
717 for (n = XTENTRYSTART;
718 n < le16_to_cpu(xp->header.nextindex); n++, xad++)
719 if (xad->flag & (XAD_NEW | XAD_EXTENDED))
720 xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
723 if ((lid = jfs_ip->blid) == 0)
727 tlck = lid_to_tlock(lid);
729 tlck->type |= tlckBTROOT;
731 ilinelock = (struct linelock *) & tlck->lock;
734 * regular file: 16 byte (XAD slot) granularity
736 if (type & tlckXTREE) {
741 * copy xtree root from inode to dinode:
743 p = &jfs_ip->i_xtroot;
746 for (n = 0; n < ilinelock->index; n++, lv++) {
747 memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
748 lv->length << L2XTSLOTSIZE);
751 /* reset on-disk (metadata page) xtree XAD_NEW bit */
752 xad = &xp->xad[XTENTRYSTART];
753 for (n = XTENTRYSTART;
754 n < le16_to_cpu(xp->header.nextindex); n++, xad++)
755 if (xad->flag & (XAD_NEW | XAD_EXTENDED))
756 xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
759 * directory: 32 byte (directory entry slot) granularity
761 else if (type & tlckDTREE) {
765 * copy dtree root from inode to dinode:
767 p = (dtpage_t *) &jfs_ip->i_dtroot;
768 xp = (dtpage_t *) & dp->di_dtroot;
770 for (n = 0; n < ilinelock->index; n++, lv++) {
771 memcpy(&xp->slot[lv->offset], &p->slot[lv->offset],
772 lv->length << L2DTSLOTSIZE);
775 jfs_err("diWrite: UFO tlock");
780 * copy inline symlink from in-memory inode to on-disk inode
782 if (S_ISLNK(ip->i_mode) && ip->i_size < IDATASIZE) {
783 lv = & dilinelock->lv[dilinelock->index];
784 lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE;
786 memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE);
790 * copy inline data from in-memory inode to on-disk inode:
791 * 128 byte slot granularity
793 if (test_cflag(COMMIT_Inlineea, ip)) {
794 lv = & dilinelock->lv[dilinelock->index];
795 lv->offset = (dioffset + 3 * 128) >> L2INODESLOTSIZE;
797 memcpy(&dp->di_inlineea, jfs_ip->i_inline_ea, INODESLOTSIZE);
800 clear_cflag(COMMIT_Inlineea, ip);
804 * lock/copy inode base: 128 byte slot granularity
806 lv = & dilinelock->lv[dilinelock->index];
807 lv->offset = dioffset >> L2INODESLOTSIZE;
808 copy_to_dinode(dp, ip);
809 if (test_and_clear_cflag(COMMIT_Dirtable, ip)) {
811 memcpy(&dp->di_dirtable, &jfs_ip->i_dirtable, 96);
816 /* release the buffer holding the updated on-disk inode.
817 * the buffer will be later written by commit processing.
828 * FUNCTION: free a specified inode from the inode working map
829 * for a fileset or aggregate.
831 * if the inode to be freed represents the first (only)
832 * free inode within the iag, the iag will be placed on
833 * the ag free inode list.
835 * freeing the inode will cause the inode extent to be
836 * freed if the inode is the only allocated inode within
837 * the extent. in this case all the disk resource backing
838 * up the inode extent will be freed. in addition, the iag
839 * will be placed on the ag extent free list if the extent
840 * is the first free extent in the iag. if freeing the
841 * extent also means that no free inodes will exist for
842 * the iag, the iag will also be removed from the ag free
845 * the iag describing the inode will be freed if the extent
846 * is to be freed and it is the only backed extent within
847 * the iag. in this case, the iag will be removed from the
848 * ag free extent list and ag free inode list and placed on
849 * the inode map's free iag list.
851 * a careful update approach is used to provide consistency
852 * in the face of updates to multiple buffers. under this
853 * approach, all required buffers are obtained before making
854 * any updates and are held until all updates are complete.
857 * ip - inode to be freed.
/*
 * diFree() - release inode ip in the inode allocation map.
 *
 * The IAG containing the inode is read while the inode map read lock is
 * held, and the bit of the inode is located in wmap[extno]. The error
 * paths below release the IAG page, the read lock and the AG lock. Two
 * outcomes are visible:
 *   - the inode extent is kept: the bit is cleared in wmap, the IAG is
 *     linked at the head of the AG inode free list if it had no free
 *     inodes, and the free counts at IAG, AG and map level are incremented;
 *   - the inode extent is freed: the neighbouring IAGs on the AG extent
 *     and inode free lists are read first (amp, bmp, cmp, dmp), the lists,
 *     summary maps and counts are then updated, and a COMMIT_FORCE
 *     transaction records the freed extent.
 *
 * amp is the buffer for the IAG reached through the forward link fwd: it
 * is the metapage pointer handed to diIAGRead() in the three fwd lookups
 * and is dereferenced as amp->data immediately after each of them.
 *
 * NOTE(review): many lines (declarations, braces, the AG_LOCK call, error
 * returns, goto labels) are missing from this listing.
 */
863 int diFree(struct inode *ip)
866 ino_t inum = ip->i_ino;
867 struct iag *iagp, *aiagp, *biagp, *ciagp, *diagp;
868 struct metapage *mp, *amp, *bmp, *cmp, *dmp;
869 int iagno, ino, extno, bitno, sword, agno;
872 struct inode *ipimap = JFS_SBI(ip->i_sb)->ipimap;
873 struct inomap *imap = JFS_IP(ipimap)->i_imap;
876 struct inode *iplist[3];
878 struct pxd_lock *pxdlock;
881 * This is just to suppress compiler warnings. The same logic that
882 * references these variables is used to initialize them.
884 aiagp = biagp = ciagp = diagp = NULL;
886 /* get the iag number containing the inode.
888 iagno = INOTOIAG(inum);
890 /* make sure that the iag is contained within
893 if (iagno >= imap->im_nextiag) {
894 print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4,
896 jfs_error(ip->i_sb, "inum = %d, iagno = %d, nextiag = %d\n",
897 (uint) inum, iagno, imap->im_nextiag);
901 /* get the allocation group for this ino.
903 agno = BLKTOAG(JFS_IP(ip)->agstart, JFS_SBI(ip->i_sb));
905 /* Lock the AG specific inode map information
909 /* Obtain read lock in imap inode. Don't release it until we have
910 * read all of the IAG's that we are going to.
912 IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
916 if ((rc = diIAGRead(imap, iagno, &mp))) {
917 IREAD_UNLOCK(ipimap);
918 AG_UNLOCK(imap, agno);
921 iagp = (struct iag *) mp->data;
923 /* get the inode number and extent number of the inode within
924 * the iag and the inode number within the extent.
926 ino = inum & (INOSPERIAG - 1);
927 extno = ino >> L2INOSPEREXT;
928 bitno = ino & (INOSPEREXT - 1);
929 mask = HIGHORDER >> bitno;
931 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
932 jfs_error(ip->i_sb, "wmap shows inode already free\n");
935 if (!addressPXD(&iagp->inoext[extno])) {
936 release_metapage(mp);
937 IREAD_UNLOCK(ipimap);
938 AG_UNLOCK(imap, agno);
939 jfs_error(ip->i_sb, "invalid inoext\n");
943 /* compute the bitmap for the extent reflecting the freed inode.
945 bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask;
947 if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) {
948 release_metapage(mp);
949 IREAD_UNLOCK(ipimap);
950 AG_UNLOCK(imap, agno);
951 jfs_error(ip->i_sb, "numfree > numinos\n");
955 * inode extent still has some inodes or below low water mark:
956 * keep the inode extent;
959 imap->im_agctl[agno].numfree < 96 ||
960 (imap->im_agctl[agno].numfree < 288 &&
961 (((imap->im_agctl[agno].numfree * 100) /
962 imap->im_agctl[agno].numinos) <= 25))) {
963 /* if the iag currently has no free inodes (i.e.,
964 * the inode being freed is the first free inode of iag),
965 * insert the iag at head of the inode free list for the ag.
967 if (iagp->nfreeinos == 0) {
968 /* check if there are any iags on the ag inode
969 * free list. if so, read the first one so that
970 * we can link the current iag onto the list at
973 if ((fwd = imap->im_agctl[agno].inofree) >= 0) {
974 /* read the iag that currently is the head
977 if ((rc = diIAGRead(imap, fwd, &amp))) {
978 IREAD_UNLOCK(ipimap);
979 AG_UNLOCK(imap, agno);
980 release_metapage(mp);
983 aiagp = (struct iag *) amp->data;
985 /* make current head point back to the iag.
987 aiagp->inofreeback = cpu_to_le32(iagno);
992 /* iag points forward to current head and iag
993 * becomes the new head of the list.
996 cpu_to_le32(imap->im_agctl[agno].inofree);
997 iagp->inofreeback = cpu_to_le32(-1);
998 imap->im_agctl[agno].inofree = iagno;
1000 IREAD_UNLOCK(ipimap);
1002 /* update the free inode summary map for the extent if
1003 * freeing the inode means the extent will now have free
1004 * inodes (i.e., the inode being freed is the first free
1007 if (iagp->wmap[extno] == cpu_to_le32(ONES)) {
1008 sword = extno >> L2EXTSPERSUM;
1009 bitno = extno & (EXTSPERSUM - 1);
1010 iagp->inosmap[sword] &=
1011 cpu_to_le32(~(HIGHORDER >> bitno));
1014 /* update the bitmap.
1016 iagp->wmap[extno] = cpu_to_le32(bitmap);
1018 /* update the free inode counts at the iag, ag and
1021 le32_add_cpu(&iagp->nfreeinos, 1);
1022 imap->im_agctl[agno].numfree += 1;
1023 atomic_inc(&imap->im_numfree);
1025 /* release the AG inode map lock
1027 AG_UNLOCK(imap, agno);
1037 * inode extent has become free and above low water mark:
1038 * free the inode extent;
1042 * prepare to update iag list(s) (careful update step 1)
1044 amp = bmp = cmp = dmp = NULL;
1047 /* check if the iag currently has no free extents. if so,
1048 * it will be placed on the head of the ag extent free list.
1050 if (iagp->nfreeexts == 0) {
1051 /* check if the ag extent free list has any iags.
1052 * if so, read the iag at the head of the list now.
1053 * this (head) iag will be updated later to reflect
1054 * the addition of the current iag at the head of
1057 if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
1058 if ((rc = diIAGRead(imap, fwd, &amp)))
1060 aiagp = (struct iag *) amp->data;
1063 /* iag has free extents. check if the addition of a free
1064 * extent will cause all extents to be free within this
1065 * iag. if so, the iag will be removed from the ag extent
1066 * free list and placed on the inode map's free iag list.
1068 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
1069 /* in preparation for removing the iag from the
1070 * ag extent free list, read the iags preceding
1071 * and following the iag on the ag extent free
1074 if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
1075 if ((rc = diIAGRead(imap, fwd, &amp)))
1077 aiagp = (struct iag *) amp->data;
1080 if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
1081 if ((rc = diIAGRead(imap, back, &bmp)))
1083 biagp = (struct iag *) bmp->data;
1088 /* remove the iag from the ag inode free list if freeing
1089 * this extent cause the iag to have no free inodes.
1091 if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
1092 int inofreeback = le32_to_cpu(iagp->inofreeback);
1093 int inofreefwd = le32_to_cpu(iagp->inofreefwd);
1095 /* in preparation for removing the iag from the
1096 * ag inode free list, read the iags preceding
1097 * and following the iag on the ag inode free
1098 * list. before reading these iags, we must make
1099 * sure that we already don't have them in hand
1100 * from up above, since re-reading an iag (buffer)
1101 * we are currently holding would cause a deadlock.
1103 if (inofreefwd >= 0) {
1105 if (inofreefwd == fwd)
1106 ciagp = (struct iag *) amp->data;
1107 else if (inofreefwd == back)
1108 ciagp = (struct iag *) bmp->data;
1111 diIAGRead(imap, inofreefwd, &cmp)))
1113 ciagp = (struct iag *) cmp->data;
1115 assert(ciagp != NULL);
1118 if (inofreeback >= 0) {
1119 if (inofreeback == fwd)
1120 diagp = (struct iag *) amp->data;
1121 else if (inofreeback == back)
1122 diagp = (struct iag *) bmp->data;
1125 diIAGRead(imap, inofreeback, &dmp)))
1127 diagp = (struct iag *) dmp->data;
1129 assert(diagp != NULL);
1133 IREAD_UNLOCK(ipimap);
1136 * invalidate any page of the inode extent freed from buffer cache;
1138 freepxd = iagp->inoext[extno];
1139 invalidate_pxd_metapages(ip, freepxd);
1142 * update iag list(s) (careful update step 2)
1144 /* add the iag to the ag extent free list if this is the
1145 * first free extent for the iag.
1147 if (iagp->nfreeexts == 0) {
1149 aiagp->extfreeback = cpu_to_le32(iagno);
1152 cpu_to_le32(imap->im_agctl[agno].extfree);
1153 iagp->extfreeback = cpu_to_le32(-1);
1154 imap->im_agctl[agno].extfree = iagno;
1156 /* remove the iag from the ag extent list if all extents
1157 * are now free and place it on the inode map iag free list.
1159 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
1161 aiagp->extfreeback = iagp->extfreeback;
1164 biagp->extfreefwd = iagp->extfreefwd;
1166 imap->im_agctl[agno].extfree =
1167 le32_to_cpu(iagp->extfreefwd);
1169 iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1);
1172 iagp->iagfree = cpu_to_le32(imap->im_freeiag);
1173 imap->im_freeiag = iagno;
1174 IAGFREE_UNLOCK(imap);
1178 /* remove the iag from the ag inode free list if freeing
1179 * this extent causes the iag to have no free inodes.
1181 if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
1182 if ((int) le32_to_cpu(iagp->inofreefwd) >= 0)
1183 ciagp->inofreeback = iagp->inofreeback;
1185 if ((int) le32_to_cpu(iagp->inofreeback) >= 0)
1186 diagp->inofreefwd = iagp->inofreefwd;
1188 imap->im_agctl[agno].inofree =
1189 le32_to_cpu(iagp->inofreefwd);
1191 iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1);
1194 /* update the inode extent address and working map
1195 * to reflect the free extent.
1196 * the permanent map should have been updated already
1197 * for the inode being freed.
1199 if (iagp->pmap[extno] != 0) {
1200 jfs_error(ip->i_sb, "the pmap does not show inode free\n");
1202 iagp->wmap[extno] = 0;
1203 PXDlength(&iagp->inoext[extno], 0);
1204 PXDaddress(&iagp->inoext[extno], 0);
1206 /* update the free extent and free inode summary maps
1207 * to reflect the freed extent.
1208 * the inode summary map is marked to indicate no inodes
1209 * available for the freed extent.
1211 sword = extno >> L2EXTSPERSUM;
1212 bitno = extno & (EXTSPERSUM - 1);
1213 mask = HIGHORDER >> bitno;
1214 iagp->inosmap[sword] |= cpu_to_le32(mask);
1215 iagp->extsmap[sword] &= cpu_to_le32(~mask);
1217 /* update the number of free inodes and number of free extents
1220 le32_add_cpu(&iagp->nfreeinos, -(INOSPEREXT - 1));
1221 le32_add_cpu(&iagp->nfreeexts, 1);
1223 /* update the number of free inodes and backed inodes
1224 * at the ag and inode map level.
1226 imap->im_agctl[agno].numfree -= (INOSPEREXT - 1);
1227 imap->im_agctl[agno].numinos -= INOSPEREXT;
1228 atomic_sub(INOSPEREXT - 1, &imap->im_numfree);
1229 atomic_sub(INOSPEREXT, &imap->im_numinos);
1232 write_metapage(amp);
1234 write_metapage(bmp);
1236 write_metapage(cmp);
1238 write_metapage(dmp);
1241 * start transaction to update block allocation map
1242 * for the inode extent freed;
1244 * N.B. AG_LOCK is released and iag will be released below, and
1245 * other thread may allocate inode from/reusing the ixad freed
1246 * BUT with new/different backing inode extent from the extent
1247 * to be freed by the transaction;
1249 tid = txBegin(ipimap->i_sb, COMMIT_FORCE);
1250 mutex_lock(&JFS_IP(ipimap)->commit_mutex);
1252 /* acquire tlock of the iag page of the freed ixad
1253 * to force the page NOHOMEOK (even though no data is
1254 * logged from the iag page) until NOREDOPAGE|FREEXTENT log
1255 * for the free of the extent is committed;
1256 * write FREEXTENT|NOREDOPAGE log record
1257 * N.B. linelock is overlaid as freed extent descriptor;
1259 tlck = txLock(tid, ipimap, mp, tlckINODE | tlckFREE);
1260 pxdlock = (struct pxd_lock *) & tlck->lock;
1261 pxdlock->flag = mlckFREEPXD;
1262 pxdlock->pxd = freepxd;
1270 * logredo needs the IAG number and IAG extent index in order
1271 * to ensure that the IMap is consistent. The least disruptive
1272 * way to pass these values through to the transaction manager
1273 * is in the iplist array.
1275 * It's not pretty, but it works.
1277 iplist[1] = (struct inode *) (size_t)iagno;
1278 iplist[2] = (struct inode *) (size_t)extno;
1280 rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE);
1283 mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
1285 /* unlock the AG inode map information */
1286 AG_UNLOCK(imap, agno);
1291 IREAD_UNLOCK(ipimap);
1294 release_metapage(amp);
1296 release_metapage(bmp);
1298 release_metapage(cmp);
1300 release_metapage(dmp);
1302 AG_UNLOCK(imap, agno);
1304 release_metapage(mp);
1310 * There are several places in the diAlloc* routines where we initialize
1314 diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp)
1316 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
1318 ip->i_ino = (iagno << L2INOSPERIAG) + ino;
1319 jfs_ip->ixpxd = iagp->inoext[extno];
1320 jfs_ip->agstart = le64_to_cpu(iagp->agstart);
1321 jfs_ip->active_ag = -1;
1326 * NAME: diAlloc(pip,dir,ip)
1328 * FUNCTION: allocate a disk inode from the inode working map
1329 * for a fileset or aggregate.
1332 * pip - pointer to incore inode for the parent inode.
1333 * dir - 'true' if the new disk inode is for a directory.
1334 * ip - pointer to a new inode
1338 * -ENOSPC - insufficient disk resources.
1341 int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1343 int rc, ino, iagno, addext, extno, bitno, sword;
1344 int nwords, rem, i, agno, dn_numag;
1345 u32 mask, inosmap, extsmap;
1346 struct inode *ipimap;
1347 struct metapage *mp;
1350 struct inomap *imap;
1352 /* get the pointers to the inode map inode and the
1353 * corresponding imap control structure.
1355 ipimap = JFS_SBI(pip->i_sb)->ipimap;
1356 imap = JFS_IP(ipimap)->i_imap;
1357 JFS_IP(ip)->ipimap = ipimap;
1358 JFS_IP(ip)->fileset = FILESYSTEM_I;
1360 /* for a directory, the allocation policy is to start
1361 * at the ag level using the preferred ag.
1364 agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap);
1365 AG_LOCK(imap, agno);
1369 /* for files, the policy starts off by trying to allocate from
1370 * the same iag containing the parent disk inode:
1371 * try to allocate the new disk inode close to the parent disk
1372 * inode, using parent disk inode number + 1 as the allocation
1373 * hint. (we use a left-to-right policy to attempt to avoid
1374 * moving backward on the disk.) compute the hint within the
1375 * file system and the iag.
1378 /* get the ag number of this iag */
1379 agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb));
1380 dn_numag = JFS_SBI(pip->i_sb)->bmap->db_numag;
1381 if (agno < 0 || agno > dn_numag)
1384 if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) {
1386 * There is an open file actively growing. We want to
1387 * allocate new inodes from a different ag to avoid
1388 * fragmentation problems.
1390 agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap);
1391 AG_LOCK(imap, agno);
1395 inum = pip->i_ino + 1;
1396 ino = inum & (INOSPERIAG - 1);
1398 /* back off the hint if it is outside of the iag */
1402 /* lock the AG inode map information */
1403 AG_LOCK(imap, agno);
1405 /* Get read lock on imap inode */
1406 IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
1408 /* get the iag number and read the iag */
1409 iagno = INOTOIAG(inum);
1410 if ((rc = diIAGRead(imap, iagno, &mp))) {
1411 IREAD_UNLOCK(ipimap);
1412 AG_UNLOCK(imap, agno);
1415 iagp = (struct iag *) mp->data;
1417 /* determine if new inode extent is allowed to be added to the iag.
1418 * new inode extent can be added to the iag if the ag
1419 * has less than 32 free disk inodes and the iag has free extents.
1421 addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts);
1424 * try to allocate from the IAG
1426 /* check if the inode may be allocated from the iag
1427 * (i.e. the inode has free inodes or new extent can be added).
1429 if (iagp->nfreeinos || addext) {
1430 /* determine the extent number of the hint.
1432 extno = ino >> L2INOSPEREXT;
1434 /* check if the extent containing the hint has backed
1435 * inodes. if so, try to allocate within this extent.
1437 if (addressPXD(&iagp->inoext[extno])) {
1438 bitno = ino & (INOSPEREXT - 1);
1440 diFindFree(le32_to_cpu(iagp->wmap[extno]),
1443 ino = (extno << L2INOSPEREXT) + bitno;
1445 /* a free inode (bit) was found within this
1446 * extent, so allocate it.
1448 rc = diAllocBit(imap, iagp, ino);
1449 IREAD_UNLOCK(ipimap);
1453 /* set the results of the allocation
1454 * and write the iag.
1456 diInitInode(ip, iagno, ino, extno,
1458 mark_metapage_dirty(mp);
1460 release_metapage(mp);
1462 /* free the AG lock and return.
1464 AG_UNLOCK(imap, agno);
1471 EXTSPERIAG - 1) ? 0 : extno + 1;
1475 * no free inodes within the extent containing the hint.
1477 * try to allocate from the backed extents following
1478 * hint or, if appropriate (i.e. addext is true), allocate
1479 * an extent of free inodes at or following the extent
1480 * containing the hint.
1482 * the free inode and free extent summary maps are used
1483 * here, so determine the starting summary map position
1484 * and the number of words we'll have to examine. again,
1485 * the approach is to allocate following the hint, so we
1486 * might have to initially ignore prior bits of the summary
1487 * map that represent extents prior to the extent containing
1488 * the hint and later revisit these bits.
1490 bitno = extno & (EXTSPERSUM - 1);
1491 nwords = (bitno == 0) ? SMAPSZ : SMAPSZ + 1;
1492 sword = extno >> L2EXTSPERSUM;
1494 /* mask any prior bits for the starting words of the
1497 mask = (bitno == 0) ? 0 : (ONES << (EXTSPERSUM - bitno));
1498 inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask;
1499 extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask;
1501 /* scan the free inode and free extent summary maps for
1504 for (i = 0; i < nwords; i++) {
1505 /* check if this word of the free inode summary
1506 * map describes an extent with free inodes.
1509 /* an extent with free inodes has been
1510 * found. determine the extent number
1511 * and the inode number within the extent.
1513 rem = diFindFree(inosmap, 0);
1514 extno = (sword << L2EXTSPERSUM) + rem;
1515 rem = diFindFree(le32_to_cpu(iagp->wmap[extno]),
1517 if (rem >= INOSPEREXT) {
1518 IREAD_UNLOCK(ipimap);
1519 release_metapage(mp);
1520 AG_UNLOCK(imap, agno);
1522 "can't find free bit in wmap\n");
1526 /* determine the inode number within the
1527 * iag and allocate the inode from the
1530 ino = (extno << L2INOSPEREXT) + rem;
1531 rc = diAllocBit(imap, iagp, ino);
1532 IREAD_UNLOCK(ipimap);
1536 /* set the results of the allocation
1537 * and write the iag.
1539 diInitInode(ip, iagno, ino, extno,
1541 mark_metapage_dirty(mp);
1543 release_metapage(mp);
1545 /* free the AG lock and return.
1547 AG_UNLOCK(imap, agno);
1552 /* check if we may allocate an extent of free
1553 * inodes and whether this word of the free
1554 * extents summary map describes a free extent.
1556 if (addext && ~extsmap) {
1557 /* a free extent has been found. determine
1558 * the extent number.
1560 rem = diFindFree(extsmap, 0);
1561 extno = (sword << L2EXTSPERSUM) + rem;
1563 /* allocate an extent of free inodes.
1565 if ((rc = diNewExt(imap, iagp, extno))) {
1566 /* if there is no disk space for a
1567 * new extent, try to allocate the
1568 * disk inode from somewhere else.
1575 /* set the results of the allocation
1576 * and write the iag.
1578 diInitInode(ip, iagno,
1579 extno << L2INOSPEREXT,
1581 mark_metapage_dirty(mp);
1583 release_metapage(mp);
1584 /* free the imap inode & the AG lock & return.
1586 IREAD_UNLOCK(ipimap);
1587 AG_UNLOCK(imap, agno);
1591 /* move on to the next set of summary map words.
1593 sword = (sword == SMAPSZ - 1) ? 0 : sword + 1;
1594 inosmap = le32_to_cpu(iagp->inosmap[sword]);
1595 extsmap = le32_to_cpu(iagp->extsmap[sword]);
1598 /* unlock imap inode */
1599 IREAD_UNLOCK(ipimap);
1601 /* nothing doing in this iag, so release it. */
1602 release_metapage(mp);
1606 * try to allocate anywhere within the same AG as the parent inode.
1608 rc = diAllocAG(imap, agno, dir, ip);
1610 AG_UNLOCK(imap, agno);
1616 * try to allocate in any AG.
1618 return (diAllocAny(imap, agno, dir, ip));
1623 * NAME: diAllocAG(imap,agno,dir,ip)
1625 * FUNCTION: allocate a disk inode from the allocation group.
1627 * this routine first determines if a new extent of free
1628 * inodes should be added for the allocation group, with
1629 * the current request satisfied from this extent. if this
1630 * is the case, an attempt will be made to do just that. if
1631 * this attempt fails or it has been determined that a new
1632 * extent should not be added, an attempt is made to satisfy
1633 * the request by allocating an existing (backed) free inode
1634 * from the allocation group.
1636 * PRE CONDITION: Already have the AG lock for this AG.
1639 * imap - pointer to inode map control structure.
1640 * agno - allocation group to allocate from.
1641 * dir - 'true' if the new disk inode is for a directory.
1642 * ip - pointer to the new inode to be filled in on successful return
1643 * with the disk inode number allocated, its extent address
1644 * and the start of the ag.
1648 * -ENOSPC - insufficient disk resources.
1652 diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
1654 int rc, addext, numfree, numinos;
1656 /* get the number of free and the number of backed disk
1657 * inodes currently within the ag.
1659 numfree = imap->im_agctl[agno].numfree;
1660 numinos = imap->im_agctl[agno].numinos;
1662 if (numfree > numinos) {
1663 jfs_error(ip->i_sb, "numfree > numinos\n");
1667 /* determine if we should allocate a new extent of free inodes
1668 * within the ag: for directory inodes, add a new extent
1669 * if there are a small number of free inodes or number of free
1670 * inodes is a small percentage of the number of backed inodes.
1673 addext = (numfree < 64 ||
1675 && ((numfree * 100) / numinos) <= 20));
1677 addext = (numfree == 0);
1680 * try to allocate a new extent of free inodes.
1683 /* if free space is not available for this new extent, try
1684 * below to allocate a free and existing (already backed)
1685 * inode from the ag.
1687 if ((rc = diAllocExt(imap, agno, ip)) != -ENOSPC)
1692 * try to allocate an existing free inode from the ag.
1694 return (diAllocIno(imap, agno, ip));
1699 * NAME: diAllocAny(imap,agno,dir,iap)
1701 * FUNCTION: allocate a disk inode from any other allocation group.
1703 * this routine is called when an allocation attempt within
1704 * the primary allocation group has failed. if attempts to
1705 * allocate an inode from any allocation group other than the
1706 * specified primary group.
1709 * imap - pointer to inode map control structure.
1710 * agno - primary allocation group (to avoid).
1711 * dir - 'true' if the new disk inode is for a directory.
1712 * ip - pointer to a new inode to be filled in on successful return
1713 * with the disk inode number allocated, its extent address
1714 * and the start of the ag.
1718 * -ENOSPC - insufficient disk resources.
1722 diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip)
1725 int maxag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag;
1728 /* try to allocate from the ags following agno up to
1729 * the maximum ag number.
1731 for (ag = agno + 1; ag <= maxag; ag++) {
1734 rc = diAllocAG(imap, ag, dir, ip);
1736 AG_UNLOCK(imap, ag);
1742 /* try to allocate from the ags in front of agno.
1744 for (ag = 0; ag < agno; ag++) {
1747 rc = diAllocAG(imap, ag, dir, ip);
1749 AG_UNLOCK(imap, ag);
1755 /* no free disk inodes.
1762 * NAME: diAllocIno(imap,agno,ip)
1764 * FUNCTION: allocate a disk inode from the allocation group's free
1765 * inode list, returning an error if this free list is
1766 * empty (i.e. no iags on the list).
1768 * allocation occurs from the first iag on the list using
1769 * the iag's free inode summary map to find the leftmost
1770 * free inode in the iag.
1772 * PRE CONDITION: Already have AG lock for this AG.
1775 * imap - pointer to inode map control structure.
1776 * agno - allocation group.
1777 * ip - pointer to new inode to be filled in on successful return
1778 * with the disk inode number allocated, its extent address
1779 * and the start of the ag.
1783 * -ENOSPC - insufficient disk resources.
1786 static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1788 int iagno, ino, rc, rem, extno, sword;
1789 struct metapage *mp;
1792 /* check if there are iags on the ag's free inode list.
1794 if ((iagno = imap->im_agctl[agno].inofree) < 0)
1797 /* obtain read lock on imap inode */
1798 IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
1800 /* read the iag at the head of the list.
1802 if ((rc = diIAGRead(imap, iagno, &mp))) {
1803 IREAD_UNLOCK(imap->im_ipimap);
1806 iagp = (struct iag *) mp->data;
1808 /* better be free inodes in this iag if it is on the
1811 if (!iagp->nfreeinos) {
1812 IREAD_UNLOCK(imap->im_ipimap);
1813 release_metapage(mp);
1814 jfs_error(ip->i_sb, "nfreeinos = 0, but iag on freelist\n");
1818 /* scan the free inode summary map to find an extent
1821 for (sword = 0;; sword++) {
1822 if (sword >= SMAPSZ) {
1823 IREAD_UNLOCK(imap->im_ipimap);
1824 release_metapage(mp);
1826 "free inode not found in summary map\n");
1830 if (~iagp->inosmap[sword])
1834 /* found a extent with free inodes. determine
1835 * the extent number.
1837 rem = diFindFree(le32_to_cpu(iagp->inosmap[sword]), 0);
1838 if (rem >= EXTSPERSUM) {
1839 IREAD_UNLOCK(imap->im_ipimap);
1840 release_metapage(mp);
1841 jfs_error(ip->i_sb, "no free extent found\n");
1844 extno = (sword << L2EXTSPERSUM) + rem;
1846 /* find the first free inode in the extent.
1848 rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 0);
1849 if (rem >= INOSPEREXT) {
1850 IREAD_UNLOCK(imap->im_ipimap);
1851 release_metapage(mp);
1852 jfs_error(ip->i_sb, "free inode not found\n");
1856 /* compute the inode number within the iag.
1858 ino = (extno << L2INOSPEREXT) + rem;
1860 /* allocate the inode.
1862 rc = diAllocBit(imap, iagp, ino);
1863 IREAD_UNLOCK(imap->im_ipimap);
1865 release_metapage(mp);
1869 /* set the results of the allocation and write the iag.
1871 diInitInode(ip, iagno, ino, extno, iagp);
1879 * NAME: diAllocExt(imap,agno,ip)
1881 * FUNCTION: add a new extent of free inodes to an iag, allocating
1882 * an inode from this extent to satisfy the current allocation
1885 * this routine first tries to find an existing iag with free
1886 * extents through the ag free extent list. if list is not
1887 * empty, the head of the list will be selected as the home
1888 * of the new extent of free inodes. otherwise (the list is
1889 * empty), a new iag will be allocated for the ag to contain
1892 * once an iag has been selected, the free extent summary map
1893 * is used to locate a free extent within the iag and diNewExt()
1894 * is called to initialize the extent, with initialization
1895 * including the allocation of the first inode of the extent
1896 * for the purpose of satisfying this request.
1899 * imap - pointer to inode map control structure.
1900 * agno - allocation group number.
1901 * ip - pointer to new inode to be filled in on successful return
1902 * with the disk inode number allocated, its extent address
1903 * and the start of the ag.
1907 * -ENOSPC - insufficient disk resources.
1910 static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1912 int rem, iagno, sword, extno, rc;
1913 struct metapage *mp;
1916 /* check if the ag has any iags with free extents. if not,
1917 * allocate a new iag for the ag.
1919 if ((iagno = imap->im_agctl[agno].extfree) < 0) {
1920 /* If successful, diNewIAG will obtain the read lock on the
1923 if ((rc = diNewIAG(imap, &iagno, agno, &mp))) {
1926 iagp = (struct iag *) mp->data;
1928 /* set the ag number if this a brand new iag
1931 cpu_to_le64(AGTOBLK(agno, imap->im_ipimap));
1935 IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
1936 if ((rc = diIAGRead(imap, iagno, &mp))) {
1937 IREAD_UNLOCK(imap->im_ipimap);
1938 jfs_error(ip->i_sb, "error reading iag\n");
1941 iagp = (struct iag *) mp->data;
1944 /* using the free extent summary map, find a free extent.
1946 for (sword = 0;; sword++) {
1947 if (sword >= SMAPSZ) {
1948 release_metapage(mp);
1949 IREAD_UNLOCK(imap->im_ipimap);
1950 jfs_error(ip->i_sb, "free ext summary map not found\n");
1953 if (~iagp->extsmap[sword])
1957 /* determine the extent number of the free extent.
1959 rem = diFindFree(le32_to_cpu(iagp->extsmap[sword]), 0);
1960 if (rem >= EXTSPERSUM) {
1961 release_metapage(mp);
1962 IREAD_UNLOCK(imap->im_ipimap);
1963 jfs_error(ip->i_sb, "free extent not found\n");
1966 extno = (sword << L2EXTSPERSUM) + rem;
1968 /* initialize the new extent.
1970 rc = diNewExt(imap, iagp, extno);
1971 IREAD_UNLOCK(imap->im_ipimap);
1973 /* something bad happened. if a new iag was allocated,
1974 * place it back on the inode map's iag free list, and
1975 * clear the ag number information.
1977 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
1979 iagp->iagfree = cpu_to_le32(imap->im_freeiag);
1980 imap->im_freeiag = iagno;
1981 IAGFREE_UNLOCK(imap);
1987 /* set the results of the allocation and write the iag.
1989 diInitInode(ip, iagno, extno << L2INOSPEREXT, extno, iagp);
1998 * NAME: diAllocBit(imap,iagp,ino)
2000 * FUNCTION: allocate a backed inode from an iag.
2002 * this routine performs the mechanics of allocating a
2003 * specified inode from a backed extent.
2005 * if the inode to be allocated represents the last free
2006 * inode within the iag, the iag will be removed from the
2007 * ag free inode list.
2009 * a careful update approach is used to provide consistency
2010 * in the face of updates to multiple buffers. under this
2011 * approach, all required buffers are obtained before making
2012 * any updates and are held all are updates are complete.
2014 * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on
2015 * this AG. Must have read lock on imap inode.
2018 * imap - pointer to inode map control structure.
2019 * iagp - pointer to iag.
2020 * ino - inode number to be allocated within the iag.
2024 * -ENOSPC - insufficient disk resources.
2027 static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2029 int extno, bitno, agno, sword, rc;
2030 struct metapage *amp = NULL, *bmp = NULL;
2031 struct iag *aiagp = NULL, *biagp = NULL;
2034 /* check if this is the last free inode within the iag.
2035 * if so, it will have to be removed from the ag free
2036 * inode list, so get the iags preceding and following
2039 if (iagp->nfreeinos == cpu_to_le32(1)) {
2040 if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) {
2042 diIAGRead(imap, le32_to_cpu(iagp->inofreefwd),
2045 aiagp = (struct iag *) amp->data;
2048 if ((int) le32_to_cpu(iagp->inofreeback) >= 0) {
2051 le32_to_cpu(iagp->inofreeback),
2054 release_metapage(amp);
2057 biagp = (struct iag *) bmp->data;
2061 /* get the ag number, extent number, inode number within
2064 agno = BLKTOAG(le64_to_cpu(iagp->agstart), JFS_SBI(imap->im_ipimap->i_sb));
2065 extno = ino >> L2INOSPEREXT;
2066 bitno = ino & (INOSPEREXT - 1);
2068 /* compute the mask for setting the map.
2070 mask = HIGHORDER >> bitno;
2072 /* the inode should be free and backed.
2074 if (((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) ||
2075 ((le32_to_cpu(iagp->wmap[extno]) & mask) != 0) ||
2076 (addressPXD(&iagp->inoext[extno]) == 0)) {
2078 release_metapage(amp);
2080 release_metapage(bmp);
2082 jfs_error(imap->im_ipimap->i_sb, "iag inconsistent\n");
2086 /* mark the inode as allocated in the working map.
2088 iagp->wmap[extno] |= cpu_to_le32(mask);
2090 /* check if all inodes within the extent are now
2091 * allocated. if so, update the free inode summary
2092 * map to reflect this.
2094 if (iagp->wmap[extno] == cpu_to_le32(ONES)) {
2095 sword = extno >> L2EXTSPERSUM;
2096 bitno = extno & (EXTSPERSUM - 1);
2097 iagp->inosmap[sword] |= cpu_to_le32(HIGHORDER >> bitno);
2100 /* if this was the last free inode in the iag, remove the
2101 * iag from the ag free inode list.
2103 if (iagp->nfreeinos == cpu_to_le32(1)) {
2105 aiagp->inofreeback = iagp->inofreeback;
2106 write_metapage(amp);
2110 biagp->inofreefwd = iagp->inofreefwd;
2111 write_metapage(bmp);
2113 imap->im_agctl[agno].inofree =
2114 le32_to_cpu(iagp->inofreefwd);
2116 iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1);
2119 /* update the free inode count at the iag, ag, inode
2122 le32_add_cpu(&iagp->nfreeinos, -1);
2123 imap->im_agctl[agno].numfree -= 1;
2124 atomic_dec(&imap->im_numfree);
2131 * NAME: diNewExt(imap,iagp,extno)
2133 * FUNCTION: initialize a new extent of inodes for an iag, allocating
2134 * the first inode of the extent for use for the current
2135 * allocation request.
2137 * disk resources are allocated for the new extent of inodes
2138 * and the inodes themselves are initialized to reflect their
2139 * existence within the extent (i.e. their inode numbers and
2140 * inode extent addresses are set) and their initial state
2141 * (mode and link count are set to zero).
2143 * if the iag is new, it is not yet on an ag extent free list
2144 * but will now be placed on this list.
2146 * if the allocation of the new extent causes the iag to
2147 * have no free extent, the iag will be removed from the
2148 * ag extent free list.
2150 * if the iag has no free backed inodes, it will be placed
2151 * on the ag free inode list, since the addition of the new
2152 * extent will now cause it to have free inodes.
2154 * a careful update approach is used to provide consistency
2155 * (i.e. list consistency) in the face of updates to multiple
2156 * buffers. under this approach, all required buffers are
2157 * obtained before making any updates and are held until all
2158 * updates are complete.
2160 * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on
2161 * this AG. Must have read lock on imap inode.
2164 * imap - pointer to inode map control structure.
2165 * iagp - pointer to iag.
2166 * extno - extent number.
2170 * -ENOSPC - insufficient disk resources.
2173 static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2175 int agno, iagno, fwd, back, freei = 0, sword, rc;
2176 struct iag *aiagp = NULL, *biagp = NULL, *ciagp = NULL;
2177 struct metapage *amp, *bmp, *cmp, *dmp;
2178 struct inode *ipimap;
2184 struct jfs_sb_info *sbi;
2186 /* better have free extents.
2188 if (!iagp->nfreeexts) {
2189 jfs_error(imap->im_ipimap->i_sb, "no free extents\n");
2193 /* get the inode map inode.
2195 ipimap = imap->im_ipimap;
2196 sbi = JFS_SBI(ipimap->i_sb);
2198 amp = bmp = cmp = NULL;
2200 /* get the ag and iag numbers for this iag.
2202 agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
2203 iagno = le32_to_cpu(iagp->iagnum);
2205 /* check if this is the last free extent within the
2206 * iag. if so, the iag must be removed from the ag
2207 * free extent list, so get the iags preceding and
2208 * following the iag on this list.
2210 if (iagp->nfreeexts == cpu_to_le32(1)) {
2211 if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
2212 if ((rc = diIAGRead(imap, fwd, &)))
2214 aiagp = (struct iag *) amp->data;
2217 if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
2218 if ((rc = diIAGRead(imap, back, &bmp)))
2220 biagp = (struct iag *) bmp->data;
2223 /* the iag has free extents. if all extents are free
2224 * (as is the case for a newly allocated iag), the iag
2225 * must be added to the ag free extent list, so get
2226 * the iag at the head of the list in preparation for
2227 * adding this iag to this list.
2230 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
2231 if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
2232 if ((rc = diIAGRead(imap, fwd, &)))
2234 aiagp = (struct iag *) amp->data;
2239 /* check if the iag has no free inodes. if so, the iag
2240 * will have to be added to the ag free inode list, so get
2241 * the iag at the head of the list in preparation for
2242 * adding this iag to this list. in doing this, we must
2243 * check if we already have the iag at the head of
2246 if (iagp->nfreeinos == 0) {
2247 freei = imap->im_agctl[agno].inofree;
2252 } else if (freei == back) {
2255 if ((rc = diIAGRead(imap, freei, &cmp)))
2257 ciagp = (struct iag *) cmp->data;
2259 if (ciagp == NULL) {
2260 jfs_error(imap->im_ipimap->i_sb,
2268 /* allocate disk space for the inode extent.
2270 if ((extno == 0) || (addressPXD(&iagp->inoext[extno - 1]) == 0))
2271 hint = ((s64) agno << sbi->bmap->db_agl2size) - 1;
2273 hint = addressPXD(&iagp->inoext[extno - 1]) +
2274 lengthPXD(&iagp->inoext[extno - 1]) - 1;
2276 if ((rc = dbAlloc(ipimap, hint, (s64) imap->im_nbperiext, &blkno)))
2279 /* compute the inode number of the first inode within the
2282 ino = (iagno << L2INOSPERIAG) + (extno << L2INOSPEREXT);
2284 /* initialize the inodes within the newly allocated extent a
2287 for (i = 0; i < imap->im_nbperiext; i += sbi->nbperpage) {
2288 /* get a buffer for this page of disk inodes.
2290 dmp = get_metapage(ipimap, blkno + i, PSIZE, 1);
2295 dp = (struct dinode *) dmp->data;
2297 /* initialize the inode number, mode, link count and
2298 * inode extent address.
2300 for (j = 0; j < INOSPERPAGE; j++, dp++, ino++) {
2301 dp->di_inostamp = cpu_to_le32(sbi->inostamp);
2302 dp->di_number = cpu_to_le32(ino);
2303 dp->di_fileset = cpu_to_le32(FILESYSTEM_I);
2306 PXDaddress(&(dp->di_ixpxd), blkno);
2307 PXDlength(&(dp->di_ixpxd), imap->im_nbperiext);
2309 write_metapage(dmp);
2312 /* if this is the last free extent within the iag, remove the
2313 * iag from the ag free extent list.
2315 if (iagp->nfreeexts == cpu_to_le32(1)) {
2317 aiagp->extfreeback = iagp->extfreeback;
2320 biagp->extfreefwd = iagp->extfreefwd;
2322 imap->im_agctl[agno].extfree =
2323 le32_to_cpu(iagp->extfreefwd);
2325 iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1);
2327 /* if the iag has all free extents (newly allocated iag),
2328 * add the iag to the ag free extent list.
2330 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
2332 aiagp->extfreeback = cpu_to_le32(iagno);
2334 iagp->extfreefwd = cpu_to_le32(fwd);
2335 iagp->extfreeback = cpu_to_le32(-1);
2336 imap->im_agctl[agno].extfree = iagno;
2340 /* if the iag has no free inodes, add the iag to the
2341 * ag free inode list.
2343 if (iagp->nfreeinos == 0) {
2345 ciagp->inofreeback = cpu_to_le32(iagno);
2348 cpu_to_le32(imap->im_agctl[agno].inofree);
2349 iagp->inofreeback = cpu_to_le32(-1);
2350 imap->im_agctl[agno].inofree = iagno;
2353 /* initialize the extent descriptor of the extent. */
2354 PXDlength(&iagp->inoext[extno], imap->im_nbperiext);
2355 PXDaddress(&iagp->inoext[extno], blkno);
2357 /* initialize the working and persistent map of the extent.
2358 * the working map will be initialized such that
2359 * it indicates the first inode of the extent is allocated.
2361 iagp->wmap[extno] = cpu_to_le32(HIGHORDER);
2362 iagp->pmap[extno] = 0;
2364 /* update the free inode and free extent summary maps
2365 * for the extent to indicate the extent has free inodes
2366 * and no longer represents a free extent.
2368 sword = extno >> L2EXTSPERSUM;
2369 mask = HIGHORDER >> (extno & (EXTSPERSUM - 1));
2370 iagp->extsmap[sword] |= cpu_to_le32(mask);
2371 iagp->inosmap[sword] &= cpu_to_le32(~mask);
2373 /* update the free inode and free extent counts for the
2376 le32_add_cpu(&iagp->nfreeinos, (INOSPEREXT - 1));
2377 le32_add_cpu(&iagp->nfreeexts, -1);
2379 /* update the free and backed inode counts for the ag.
2381 imap->im_agctl[agno].numfree += (INOSPEREXT - 1);
2382 imap->im_agctl[agno].numinos += INOSPEREXT;
2384 /* update the free and backed inode counts for the inode map.
2386 atomic_add(INOSPEREXT - 1, &imap->im_numfree);
2387 atomic_add(INOSPEREXT, &imap->im_numinos);
2392 write_metapage(amp);
2394 write_metapage(bmp);
2396 write_metapage(cmp);
2402 /* release the iags.
2405 release_metapage(amp);
2407 release_metapage(bmp);
2409 release_metapage(cmp);
2416 * NAME: diNewIAG(imap,iagnop,agno)
2418 * FUNCTION: allocate a new iag for an allocation group.
2420 * first tries to allocate the iag from the inode map
2422 * if the list has free iags, the head of the list is removed
2423 * and returned to satisfy the request.
2424 * if the inode map's iag free list is empty, the inode map
2425 * is extended to hold a new iag. this new iag is initialized
2426 * and returned to satisfy the request.
2429 * imap - pointer to inode map control structure.
2430 * iagnop - pointer to an iag number set with the number of the
2431 * newly allocated iag upon successful return.
2432 * agno - allocation group number.
2433 * bpp - Buffer pointer to be filled in with new IAG's buffer
2437 * -ENOSPC - insufficient disk resources.
2441 * AG lock held on entry/exit;
2442 * write lock on the map is held inside;
2443 * read lock on the map is held on successful completion;
2445 * note: new iag transaction:
2446 * . synchronously write iag;
2447 * . write log of xtree and inode of imap;
2449 * . synchronous write of xtree (right to left, bottom to top);
2450 * . at start of logredo(): init in-memory imap with one additional iag page;
2451 * . at end of logredo(): re-read imap inode to determine
2455 diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2459 struct inode *ipimap;
2460 struct super_block *sb;
2461 struct jfs_sb_info *sbi;
2462 struct metapage *mp;
2467 struct inode *iplist[1];
2469 /* pick up pointers to the inode map and mount inodes */
2470 ipimap = imap->im_ipimap;
2474 /* acquire the free iag lock */
2477 /* if there are any iags on the inode map free iag list,
2478 * allocate the iag from the head of the list.
2480 if (imap->im_freeiag >= 0) {
2481 /* pick up the iag number at the head of the list */
2482 iagno = imap->im_freeiag;
2484 /* determine the logical block number of the iag */
2485 blkno = IAGTOLBLK(iagno, sbi->l2nbperpage);
2487 /* no free iags. the inode map will have to be extented
2488 * to include a new iag.
2491 /* acquire inode map lock */
2492 IWRITE_LOCK(ipimap, RDWRLOCK_IMAP);
2494 if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) {
2495 IWRITE_UNLOCK(ipimap);
2496 IAGFREE_UNLOCK(imap);
2497 jfs_error(imap->im_ipimap->i_sb,
2498 "ipimap->i_size is wrong\n");
2503 /* get the next available iag number */
2504 iagno = imap->im_nextiag;
2506 /* make sure that we have not exceeded the maximum inode
2509 if (iagno > (MAXIAGS - 1)) {
2510 /* release the inode map lock */
2511 IWRITE_UNLOCK(ipimap);
2518 * synchronously append new iag page.
2520 /* determine the logical address of iag page to append */
2521 blkno = IAGTOLBLK(iagno, sbi->l2nbperpage);
2523 /* Allocate extent for new iag page */
2524 xlen = sbi->nbperpage;
2525 if ((rc = dbAlloc(ipimap, 0, (s64) xlen, &xaddr))) {
2526 /* release the inode map lock */
2527 IWRITE_UNLOCK(ipimap);
2533 * start transaction of update of the inode map
2534 * addressing structure pointing to the new iag page;
2536 tid = txBegin(sb, COMMIT_FORCE);
2537 mutex_lock(&JFS_IP(ipimap)->commit_mutex);
2539 /* update the inode map addressing structure to point to it */
2541 xtInsert(tid, ipimap, 0, blkno, xlen, &xaddr, 0))) {
2543 mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
2544 /* Free the blocks allocated for the iag since it was
2545 * not successfully added to the inode map
2547 dbFree(ipimap, xaddr, (s64) xlen);
2549 /* release the inode map lock */
2550 IWRITE_UNLOCK(ipimap);
2555 /* update the inode map's inode to reflect the extension */
2556 ipimap->i_size += PSIZE;
2557 inode_add_bytes(ipimap, PSIZE);
2559 /* assign a buffer for the page */
2560 mp = get_metapage(ipimap, blkno, PSIZE, 0);
2563 * This is very unlikely since we just created the
2564 * extent, but let's try to handle it correctly
2566 xtTruncate(tid, ipimap, ipimap->i_size - PSIZE,
2571 mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
2573 /* release the inode map lock */
2574 IWRITE_UNLOCK(ipimap);
2579 iagp = (struct iag *) mp->data;
2582 memset(iagp, 0, sizeof(struct iag));
2583 iagp->iagnum = cpu_to_le32(iagno);
2584 iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1);
2585 iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1);
2586 iagp->iagfree = cpu_to_le32(-1);
2587 iagp->nfreeinos = 0;
2588 iagp->nfreeexts = cpu_to_le32(EXTSPERIAG);
2590 /* initialize the free inode summary map (free extent
2591 * summary map initialization handled by bzero).
2593 for (i = 0; i < SMAPSZ; i++)
2594 iagp->inosmap[i] = cpu_to_le32(ONES);
2597 * Write and sync the metapage
2602 * txCommit(COMMIT_FORCE) will synchronously write address
2603 * index pages and inode after commit in careful update order
2604 * of address index pages (right to left, bottom up);
2607 rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE);
2610 mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
2612 duplicateIXtree(sb, blkno, xlen, &xaddr);
2614 /* update the next available iag number */
2615 imap->im_nextiag += 1;
2617 /* Add the iag to the iag free list so we don't lose the iag
2618 * if a failure happens now.
2620 imap->im_freeiag = iagno;
2622 /* Until we have logredo working, we want the imap inode &
2623 * control page to be up to date.
2627 /* release the inode map lock */
2628 IWRITE_UNLOCK(ipimap);
2631 /* obtain read lock on map */
2632 IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
2635 if ((rc = diIAGRead(imap, iagno, &mp))) {
2636 IREAD_UNLOCK(ipimap);
2640 iagp = (struct iag *) mp->data;
2642 /* remove the iag from the iag free list */
2643 imap->im_freeiag = le32_to_cpu(iagp->iagfree);
2644 iagp->iagfree = cpu_to_le32(-1);
2646 /* set the return iag number and buffer pointer */
2651 /* release the iag free lock */
2652 IAGFREE_UNLOCK(imap);
2660 * FUNCTION: get the buffer for the specified iag within a fileset
2661 * or aggregate inode map.
2664 * imap - pointer to inode map control structure.
2665 * iagno - iag number.
2666 * bpp - point to buffer pointer to be filled in on successful
2670 * must have read lock on imap inode
2671 * (When called by diExtendFS, the filesystem is quiesced, therefore
2672 * the read lock is unnecessary.)
2678 static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp)
2680 struct inode *ipimap = imap->im_ipimap;
2683 /* compute the logical block number of the iag. */
2684 blkno = IAGTOLBLK(iagno, JFS_SBI(ipimap->i_sb)->l2nbperpage);
2687 *mpp = read_metapage(ipimap, blkno, PSIZE, 0);
2696 * NAME: diFindFree()
2698 * FUNCTION: find the first free bit in a word starting at
2699 * the specified bit position.
2702 * word - word to be examined.
2703 * start - starting bit position.
2706 * bit position of first free bit in the word or 32 if
2707 * no free bits were found.
2709 static int diFindFree(u32 word, int start)
2713 /* scan the word for the first free bit. */
2714 for (word <<= start, bitno = start; bitno < 32;
2715 bitno++, word <<= 1) {
2716 if ((word & HIGHORDER) == 0)
2723 * NAME: diUpdatePMap()
2725 * FUNCTION: Update the persistent map in an IAG for the allocation or
2726 * freeing of the specified inode.
2728 * PRE CONDITIONS: Working map has already been updated for allocate.
2731 * ipimap - Incore inode map inode
2732 * inum - Number of inode to mark in permanent map
2733 * is_free - If 'true' indicates inode should be marked freed, otherwise
2734 * indicates inode should be marked allocated.
2740 diUpdatePMap(struct inode *ipimap,
2741 unsigned long inum, bool is_free, struct tblock * tblk)
2745 struct metapage *mp;
2746 int iagno, ino, extno, bitno;
2747 struct inomap *imap;
2749 struct jfs_log *log;
2750 int lsn, difft, diffp;
2751 unsigned long flags;
2753 imap = JFS_IP(ipimap)->i_imap;
2754 /* get the iag number containing the inode */
2755 iagno = INOTOIAG(inum);
2756 /* make sure that the iag is contained within the map */
2757 if (iagno >= imap->im_nextiag) {
2758 jfs_error(ipimap->i_sb, "the iag is outside the map\n");
2762 IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
2763 rc = diIAGRead(imap, iagno, &mp);
2764 IREAD_UNLOCK(ipimap);
2767 metapage_wait_for_io(mp);
2768 iagp = (struct iag *) mp->data;
2769 /* get the inode number and extent number of the inode within
2770 * the iag and the inode number within the extent.
2772 ino = inum & (INOSPERIAG - 1);
2773 extno = ino >> L2INOSPEREXT;
2774 bitno = ino & (INOSPEREXT - 1);
2775 mask = HIGHORDER >> bitno;
2777 * mark the inode free in persistent map:
2780 /* The inode should have been allocated both in working
2781 * map and in persistent map;
2782 * the inode will be freed from working map at the release
2783 * of last reference release;
2785 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
2786 jfs_error(ipimap->i_sb,
2787 "inode %ld not marked as allocated in wmap!\n",
2790 if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) {
2791 jfs_error(ipimap->i_sb,
2792 "inode %ld not marked as allocated in pmap!\n",
2795 /* update the bitmap for the extent of the freed inode */
2796 iagp->pmap[extno] &= cpu_to_le32(~mask);
2799 * mark the inode allocated in persistent map:
2802 /* The inode should be already allocated in the working map
2803 * and should be free in persistent map;
2805 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
2806 release_metapage(mp);
2807 jfs_error(ipimap->i_sb,
2808 "the inode is not allocated in the working map\n");
2811 if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) {
2812 release_metapage(mp);
2813 jfs_error(ipimap->i_sb,
2814 "the inode is not free in the persistent map\n");
2817 /* update the bitmap for the extent of the allocated inode */
2818 iagp->pmap[extno] |= cpu_to_le32(mask);
2824 log = JFS_SBI(tblk->sb)->log;
2825 LOGSYNC_LOCK(log, flags);
2827 /* inherit older/smaller lsn */
2828 logdiff(difft, lsn, log);
2829 logdiff(diffp, mp->lsn, log);
2830 if (difft < diffp) {
2832 /* move mp after tblock in logsync list */
2833 list_move(&mp->synclist, &tblk->synclist);
2835 /* inherit younger/larger clsn */
2837 logdiff(difft, tblk->clsn, log);
2838 logdiff(diffp, mp->clsn, log);
2840 mp->clsn = tblk->clsn;
2844 /* insert mp after tblock in logsync list */
2846 list_add(&mp->synclist, &tblk->synclist);
2847 mp->clsn = tblk->clsn;
2849 LOGSYNC_UNLOCK(log, flags);
2857 * function: update imap for extendfs();
2859 * note: AG size has been increased s.t. each k old contiguous AGs are
2860 * coalesced into a new AG;
2862 int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2865 struct inomap *imap = JFS_IP(ipimap)->i_imap;
2866 struct iag *iagp = NULL, *hiagp = NULL;
2867 struct bmap *mp = JFS_SBI(ipbmap->i_sb)->bmap;
2868 struct metapage *bp, *hbp;
2870 int numinos, xnuminos = 0, xnumfree = 0;
2873 jfs_info("diExtendFS: nextiag:%d numinos:%d numfree:%d",
2874 imap->im_nextiag, atomic_read(&imap->im_numinos),
2875 atomic_read(&imap->im_numfree));
2880 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
2881 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
2882 * note: new AG size = old AG size * (2**x).
2885 /* init per AG control information im_agctl[] */
2886 for (i = 0; i < MAXAG; i++) {
2887 imap->im_agctl[i].inofree = -1;
2888 imap->im_agctl[i].extfree = -1;
2889 imap->im_agctl[i].numinos = 0; /* number of backed inodes */
2890 imap->im_agctl[i].numfree = 0; /* number of free backed inodes */
2894 * process each iag page of the map.
2896 * rebuild AG Free Inode List, AG Free Inode Extent List;
2898 for (i = 0; i < imap->im_nextiag; i++) {
2899 if ((rc = diIAGRead(imap, i, &bp))) {
2903 iagp = (struct iag *) bp->data;
2904 if (le32_to_cpu(iagp->iagnum) != i) {
2905 release_metapage(bp);
2906 jfs_error(ipimap->i_sb, "unexpected value of iagnum\n");
2910 /* leave free iag in the free iag list */
2911 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
2912 release_metapage(bp);
2916 agstart = le64_to_cpu(iagp->agstart);
2917 n = agstart >> mp->db_agl2size;
2918 iagp->agstart = cpu_to_le64((s64)n << mp->db_agl2size);
2920 /* compute backed inodes */
2921 numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts))
2924 /* merge AG backed inodes */
2925 imap->im_agctl[n].numinos += numinos;
2926 xnuminos += numinos;
2929 /* if any backed free inodes, insert at AG free inode list */
2930 if ((int) le32_to_cpu(iagp->nfreeinos) > 0) {
2931 if ((head = imap->im_agctl[n].inofree) == -1) {
2932 iagp->inofreefwd = cpu_to_le32(-1);
2933 iagp->inofreeback = cpu_to_le32(-1);
2935 if ((rc = diIAGRead(imap, head, &hbp))) {
2939 hiagp = (struct iag *) hbp->data;
2940 hiagp->inofreeback = iagp->iagnum;
2941 iagp->inofreefwd = cpu_to_le32(head);
2942 iagp->inofreeback = cpu_to_le32(-1);
2943 write_metapage(hbp);
2946 imap->im_agctl[n].inofree =
2947 le32_to_cpu(iagp->iagnum);
2949 /* merge AG backed free inodes */
2950 imap->im_agctl[n].numfree +=
2951 le32_to_cpu(iagp->nfreeinos);
2952 xnumfree += le32_to_cpu(iagp->nfreeinos);
2955 /* if any free extents, insert at AG free extent list */
2956 if (le32_to_cpu(iagp->nfreeexts) > 0) {
2957 if ((head = imap->im_agctl[n].extfree) == -1) {
2958 iagp->extfreefwd = cpu_to_le32(-1);
2959 iagp->extfreeback = cpu_to_le32(-1);
2961 if ((rc = diIAGRead(imap, head, &hbp))) {
2965 hiagp = (struct iag *) hbp->data;
2966 hiagp->extfreeback = iagp->iagnum;
2967 iagp->extfreefwd = cpu_to_le32(head);
2968 iagp->extfreeback = cpu_to_le32(-1);
2969 write_metapage(hbp);
2972 imap->im_agctl[n].extfree =
2973 le32_to_cpu(iagp->iagnum);
2980 if (xnuminos != atomic_read(&imap->im_numinos) ||
2981 xnumfree != atomic_read(&imap->im_numfree)) {
2982 jfs_error(ipimap->i_sb, "numinos or numfree incorrect\n");
2993 * serialization: IWRITE_LOCK held on entry/exit
2995 * note: shadow page with regular inode (rel.2);
2997 static void duplicateIXtree(struct super_block *sb, s64 blkno,
2998 int xlen, s64 *xaddr)
3000 struct jfs_superblock *j_sb;
3001 struct buffer_head *bh;
3005 /* if AIT2 ipmap2 is bad, do not try to update it */
3006 if (JFS_SBI(sb)->mntflag & JFS_BAD_SAIT) /* s_flag */
3008 ip = diReadSpecial(sb, FILESYSTEM_I, 1);
3010 JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT;
3011 if (readSuper(sb, &bh))
3013 j_sb = (struct jfs_superblock *)bh->b_data;
3014 j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT);
3016 mark_buffer_dirty(bh);
3017 sync_dirty_buffer(bh);
3022 /* start transaction */
3023 tid = txBegin(sb, COMMIT_FORCE);
3024 /* update the inode map addressing structure to point to it */
3025 if (xtInsert(tid, ip, 0, blkno, xlen, xaddr, 0)) {
3026 JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT;
3031 /* update the inode map's inode to reflect the extension */
3032 ip->i_size += PSIZE;
3033 inode_add_bytes(ip, PSIZE);
3034 txCommit(tid, 1, &ip, COMMIT_FORCE);
3041 * NAME: copy_from_dinode()
3043 * FUNCTION: Copies inode info from disk inode to in-memory inode
3047 * -ENOMEM - insufficient memory
3049 static int copy_from_dinode(struct dinode * dip, struct inode *ip)
3051 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
3052 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
3054 jfs_ip->fileset = le32_to_cpu(dip->di_fileset);
3055 jfs_ip->mode2 = le32_to_cpu(dip->di_mode);
3056 jfs_set_inode_flags(ip);
3058 ip->i_mode = le32_to_cpu(dip->di_mode) & 0xffff;
3059 if (sbi->umask != -1) {
3060 ip->i_mode = (ip->i_mode & ~0777) | (0777 & ~sbi->umask);
3061 /* For directories, add x permission if r is allowed by umask */
3062 if (S_ISDIR(ip->i_mode)) {
3063 if (ip->i_mode & 0400)
3065 if (ip->i_mode & 0040)
3067 if (ip->i_mode & 0004)
3071 set_nlink(ip, le32_to_cpu(dip->di_nlink));
3073 jfs_ip->saved_uid = make_kuid(&init_user_ns, le32_to_cpu(dip->di_uid));
3074 if (!uid_valid(sbi->uid))
3075 ip->i_uid = jfs_ip->saved_uid;
3077 ip->i_uid = sbi->uid;
3080 jfs_ip->saved_gid = make_kgid(&init_user_ns, le32_to_cpu(dip->di_gid));
3081 if (!gid_valid(sbi->gid))
3082 ip->i_gid = jfs_ip->saved_gid;
3084 ip->i_gid = sbi->gid;
3087 ip->i_size = le64_to_cpu(dip->di_size);
3088 ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec);
3089 ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec);
3090 ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec);
3091 ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec);
3092 ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec);
3093 ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec);
3094 ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks));
3095 ip->i_generation = le32_to_cpu(dip->di_gen);
3097 jfs_ip->ixpxd = dip->di_ixpxd; /* in-memory pxd's are little-endian */
3098 jfs_ip->acl = dip->di_acl; /* as are dxd's */
3099 jfs_ip->ea = dip->di_ea;
3100 jfs_ip->next_index = le32_to_cpu(dip->di_next_index);
3101 jfs_ip->otime = le32_to_cpu(dip->di_otime.tv_sec);
3102 jfs_ip->acltype = le32_to_cpu(dip->di_acltype);
3104 if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) {
3105 jfs_ip->dev = le32_to_cpu(dip->di_rdev);
3106 ip->i_rdev = new_decode_dev(jfs_ip->dev);
3109 if (S_ISDIR(ip->i_mode)) {
3110 memcpy(&jfs_ip->i_dirtable, &dip->di_dirtable, 384);
3111 } else if (S_ISREG(ip->i_mode) || S_ISLNK(ip->i_mode)) {
3112 memcpy(&jfs_ip->i_xtroot, &dip->di_xtroot, 288);
3114 memcpy(&jfs_ip->i_inline_ea, &dip->di_inlineea, 128);
3116 /* Zero the in-memory-only stuff */
3118 jfs_ip->btindex = 0;
3119 jfs_ip->btorder = 0;
3122 jfs_ip->atlhead = 0;
3123 jfs_ip->atltail = 0;
3129 * NAME: copy_to_dinode()
3131 * FUNCTION: Copies inode info from in-memory inode to disk inode
3133 static void copy_to_dinode(struct dinode * dip, struct inode *ip)
3135 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
3136 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
3138 dip->di_fileset = cpu_to_le32(jfs_ip->fileset);
3139 dip->di_inostamp = cpu_to_le32(sbi->inostamp);
3140 dip->di_number = cpu_to_le32(ip->i_ino);
3141 dip->di_gen = cpu_to_le32(ip->i_generation);
3142 dip->di_size = cpu_to_le64(ip->i_size);
3143 dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks));
3144 dip->di_nlink = cpu_to_le32(ip->i_nlink);
3145 if (!uid_valid(sbi->uid))
3146 dip->di_uid = cpu_to_le32(i_uid_read(ip));
3148 dip->di_uid =cpu_to_le32(from_kuid(&init_user_ns,
3149 jfs_ip->saved_uid));
3150 if (!gid_valid(sbi->gid))
3151 dip->di_gid = cpu_to_le32(i_gid_read(ip));
3153 dip->di_gid = cpu_to_le32(from_kgid(&init_user_ns,
3154 jfs_ip->saved_gid));
3156 * mode2 is only needed for storing the higher order bits.
3157 * Trust i_mode for the lower order ones
3159 if (sbi->umask == -1)
3160 dip->di_mode = cpu_to_le32((jfs_ip->mode2 & 0xffff0000) |
3162 else /* Leave the original permissions alone */
3163 dip->di_mode = cpu_to_le32(jfs_ip->mode2);
3165 dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec);
3166 dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec);
3167 dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec);
3168 dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec);
3169 dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec);
3170 dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec);
3171 dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */
3172 dip->di_acl = jfs_ip->acl; /* as are dxd's */
3173 dip->di_ea = jfs_ip->ea;
3174 dip->di_next_index = cpu_to_le32(jfs_ip->next_index);
3175 dip->di_otime.tv_sec = cpu_to_le32(jfs_ip->otime);
3176 dip->di_otime.tv_nsec = 0;
3177 dip->di_acltype = cpu_to_le32(jfs_ip->acltype);
3178 if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode))
3179 dip->di_rdev = cpu_to_le32(jfs_ip->dev);