diff options
Diffstat (limited to 'fs')
171 files changed, 6252 insertions, 3020 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 9e670d527646..ef5905f7c8a3 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1789,9 +1789,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, kfree(st); } else { /* Caching disabled. No need to get upto date stat info. - * This dentry will be released immediately. So, just i_count++ + * This dentry will be released immediately. So, just hold the + * inode */ - atomic_inc(&old_dentry->d_inode->i_count); + ihold(old_dentry->d_inode); } dentry->d_op = old_dentry->d_op; diff --git a/fs/Kconfig b/fs/Kconfig index 65781de44fc0..97673c955484 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -47,10 +47,12 @@ source "fs/nilfs2/Kconfig" endif # BLOCK +config EXPORTFS + tristate + config FILE_LOCKING bool "Enable POSIX file locking API" if EMBEDDED default y - select BKL # while lockd still uses it. help This option enables standard file locking support, required for filesystems like NFS and for the flock() system @@ -222,9 +224,6 @@ config LOCKD_V4 depends on FILE_LOCKING default y -config EXPORTFS - tristate - config NFS_ACL_SUPPORT tristate select FS_POSIX_ACL diff --git a/fs/Makefile b/fs/Makefile index e6ec1d309b1d..26956fcec917 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -29,10 +29,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o - -nfsd-$(CONFIG_NFSD) := nfsctl.o -obj-y += $(nfsd-y) $(nfsd-m) - +obj-$(CONFIG_NFSD_DEPRECATED) += nfsctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o diff --git a/fs/affs/file.c b/fs/affs/file.c index c4a9875bd1a6..0a90dcd46de2 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -894,9 +894,9 @@ affs_truncate(struct inode *inode) if (AFFS_SB(sb)->s_flags & SF_OFS) { struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); u32 tmp; - if (IS_ERR(ext_bh)) { + if (IS_ERR(bh)) { affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)", - ext, PTR_ERR(ext_bh)); + ext, PTR_ERR(bh)); return; } tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 3a0fdec175ba..5d828903ac69 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -388,7 +388,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); mark_buffer_dirty_inode(inode_bh, inode); inode->i_nlink = 2; - atomic_inc(&inode->i_count); + ihold(inode); } affs_fix_checksum(sb, bh); mark_buffer_dirty_inode(bh, inode); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 0d38c09bd55e..5439e1bc9a86 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1045,7 +1045,7 @@ static int afs_link(struct dentry *from, struct inode *dir, if (ret < 0) goto link_error; - atomic_inc(&vnode->vfs_inode.i_count); + ihold(&vnode->vfs_inode); d_instantiate(dentry, &vnode->vfs_inode); key_put(key); _leave(" = 0"); diff --git a/fs/afs/write.c b/fs/afs/write.c index 722743b152d8..15690bb1d3b5 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -438,7 +438,6 @@ no_more: */ int afs_writepage(struct page *page, struct writeback_control *wbc) { - struct backing_dev_info *bdi = page->mapping->backing_dev_info; struct afs_writeback *wb; int ret; @@ -455,8 +454,6 @@ int afs_writepage(struct page *page, struct writeback_control *wbc) } wbc->nr_to_write -= ret; - if (wbc->nonblocking && bdi_write_congested(bdi)) - wbc->encountered_congestion = 1; _leave(" = 0"); return 0; @@ -469,7 +466,6 @@ static int afs_writepages_region(struct address_space *mapping, struct writeback_control *wbc, pgoff_t index, pgoff_t end, pgoff_t *_next) { - struct backing_dev_info *bdi = mapping->backing_dev_info; struct afs_writeback *wb; struct page *page; int ret, n; @@ -529,11 +525,6 @@ static int afs_writepages_region(struct address_space *mapping, wbc->nr_to_write -= ret; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - break; - } - cond_resched(); } while (index < end && wbc->nr_to_write > 0); @@ -548,24 +539,16 @@ static int afs_writepages_region(struct address_space *mapping, int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct backing_dev_info *bdi = mapping->backing_dev_info; pgoff_t start, end, next; int ret; _enter(""); - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - _leave(" = 0 [congest]"); - return 0; - } - if (wbc->range_cyclic) { start = mapping->writeback_index; end = -1; ret = afs_writepages_region(mapping, wbc, start, end, &next); - if (start > 0 && wbc->nr_to_write > 0 && ret == 0 && - !(wbc->nonblocking && wbc->encountered_congestion)) + if (start > 0 && wbc->nr_to_write > 0 && ret == 0) ret = afs_writepages_region(mapping, wbc, 0, start, &next); mapping->writeback_index = next; @@ -1543,7 +1543,19 @@ static void aio_batch_add(struct address_space *mapping, } abe = mempool_alloc(abe_pool, GFP_KERNEL); - BUG_ON(!igrab(mapping->host)); + + /* + * we should be using igrab here, but + * we don't want to hammer on the global + * inode spinlock just to take an extra + * reference on a file that we must already + * have a reference to. + * + * When we're called, we always have a reference + * on the file, so we must always have a reference + * on the inode, so ihold() is safe here. + */ + ihold(mapping->host); abe->mapping = mapping; hlist_add_head(&abe->list, &batch_hash[bucket]); return; diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index e4b75d6eda83..5365527ca43f 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -111,10 +111,9 @@ struct file *anon_inode_getfile(const char *name, path.mnt = mntget(anon_inode_mnt); /* * We know the anon_inode inode count is always greater than zero, - * so we can avoid doing an igrab() and we can use an open-coded - * atomic_inc(). + * so ihold() is safe. */ - atomic_inc(&anon_inode_inode->i_count); + ihold(anon_inode_inode); path.dentry->d_op = &anon_inodefs_dentry_operations; d_instantiate(path.dentry, anon_inode_inode); @@ -194,6 +193,7 @@ static struct inode *anon_inode_mkinode(void) if (!inode) return ERR_PTR(-ENOMEM); + inode->i_ino = get_next_ino(); inode->i_fop = &anon_inode_fops; inode->i_mapping->a_ops = &anon_aops; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 821b2b955dac..ac87e49fa706 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -398,6 +398,7 @@ struct inode *autofs4_get_inode(struct super_block *sb, inode->i_gid = sb->s_root->d_inode->i_gid; } inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_ino = get_next_ino(); if (S_ISDIR(inf->mode)) { inode->i_nlink = 2; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index d967e052b779..685ecff3ab31 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -176,7 +176,7 @@ static int bfs_link(struct dentry *old, struct inode *dir, inc_nlink(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - atomic_inc(&inode->i_count); + ihold(inode); d_instantiate(new, inode); mutex_unlock(&info->bfs_lock); return 0; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 139fc8083f53..29990f0eee0c 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -495,6 +495,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) struct inode * inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); diff --git a/fs/block_dev.c b/fs/block_dev.c index b737451e2e9d..dea3b628a6ce 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -48,6 +48,21 @@ inline struct block_device *I_BDEV(struct inode *inode) EXPORT_SYMBOL(I_BDEV); +/* + * move the inode from it's current bdi to the a new bdi. if the inode is dirty + * we need to move it onto the dirty list of @dst so that the inode is always + * on the right list. + */ +static void bdev_inode_switch_bdi(struct inode *inode, + struct backing_dev_info *dst) +{ + spin_lock(&inode_lock); + inode->i_data.backing_dev_info = dst; + if (inode->i_state & I_DIRTY) + list_move(&inode->i_wb_list, &dst->wb.b_dirty); + spin_unlock(&inode_lock); +} + static sector_t max_block(struct block_device *bdev) { sector_t retval = ~((sector_t)0); @@ -550,7 +565,7 @@ EXPORT_SYMBOL(bdget); */ struct block_device *bdgrab(struct block_device *bdev) { - atomic_inc(&bdev->bd_inode->i_count); + ihold(bdev->bd_inode); return bdev; } @@ -580,7 +595,7 @@ static struct block_device *bd_acquire(struct inode *inode) spin_lock(&bdev_lock); bdev = inode->i_bdev; if (bdev) { - atomic_inc(&bdev->bd_inode->i_count); + ihold(bdev->bd_inode); spin_unlock(&bdev_lock); return bdev; } @@ -591,12 +606,12 @@ static struct block_device *bd_acquire(struct inode *inode) spin_lock(&bdev_lock); if (!inode->i_bdev) { /* - * We take an additional bd_inode->i_count for inode, + * We take an additional reference to bd_inode, * and it's released in clear_inode() of inode. * So, we can access it via ->i_mapping always * without igrab(). */ - atomic_inc(&bdev->bd_inode->i_count); + ihold(bdev->bd_inode); inode->i_bdev = bdev; inode->i_mapping = bdev->bd_inode->i_mapping; list_add(&inode->i_devices, &bdev->bd_inodes); @@ -1390,7 +1405,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdi = blk_get_backing_dev_info(bdev); if (bdi == NULL) bdi = &default_backing_dev_info; - bdev->bd_inode->i_data.backing_dev_info = bdi; + bdev_inode_switch_bdi(bdev->bd_inode, bdi); } if (bdev->bd_invalidated) rescan_partitions(disk, bdev); @@ -1405,8 +1420,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (ret) goto out_clear; bdev->bd_contains = whole; - bdev->bd_inode->i_data.backing_dev_info = - whole->bd_inode->i_data.backing_dev_info; + bdev_inode_switch_bdi(bdev->bd_inode, + whole->bd_inode->i_data.backing_dev_info); bdev->bd_part = disk_get_part(disk, partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { @@ -1439,7 +1454,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_disk = NULL; bdev->bd_part = NULL; - bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; + bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; @@ -1533,7 +1548,8 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; - bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; + bdev_inode_switch_bdi(bdev->bd_inode, + &default_backing_dev_info); if (bdev != bdev->bd_contains) victim = bdev->bd_contains; bdev->bd_contains = NULL; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c03864406af3..64f99cf69ce0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3849,7 +3849,7 @@ again: p = &root->inode_tree.rb_node; parent = NULL; - if (hlist_unhashed(&inode->i_hash)) + if (inode_unhashed(inode)) return; spin_lock(&root->inode_lock); @@ -4758,7 +4758,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } btrfs_set_trans_block_group(trans, dir); - atomic_inc(&inode->i_count); + ihold(inode); err = btrfs_add_nondir(trans, dentry, inode, 1, index); diff --git a/fs/buffer.c b/fs/buffer.c index 7f0b9b083f77..5930e382959b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -905,7 +905,6 @@ try_again: bh->b_state = 0; atomic_set(&bh->b_count, 0); - bh->b_private = NULL; bh->b_size = size; /* Link the buffer to its page */ @@ -1706,7 +1705,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, * and kswapd activity, but those code paths have their own * higher-level throttling. */ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else if (!trylock_buffer(bh)) { redirty_page_for_writepage(wbc, page); @@ -1834,9 +1833,11 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) } EXPORT_SYMBOL(page_zero_new_buffers); -int block_prepare_write(struct page *page, unsigned from, unsigned to, +int __block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block) { + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; struct inode *inode = page->mapping->host; unsigned block_start, block_end; sector_t block; @@ -1916,7 +1917,7 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to, } return err; } -EXPORT_SYMBOL(block_prepare_write); +EXPORT_SYMBOL(__block_write_begin); static int __block_commit_write(struct inode *inode, struct page *page, unsigned from, unsigned to) @@ -1953,15 +1954,6 @@ static int __block_commit_write(struct inode *inode, struct page *page, return 0; } -int __block_write_begin(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block) -{ - unsigned start = pos & (PAGE_CACHE_SIZE - 1); - - return block_prepare_write(page, start, start + len, get_block); -} -EXPORT_SYMBOL(__block_write_begin); - /* * block_write_begin takes care of the basic task of block allocation and * bringing partial write blocks uptodate first. @@ -2379,7 +2371,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, else end = PAGE_CACHE_SIZE; - ret = block_prepare_write(page, 0, end, get_block); + ret = __block_write_begin(page, 0, end, get_block); if (!ret) ret = block_commit_write(page, 0, end); @@ -2466,11 +2458,10 @@ int nobh_write_begin(struct address_space *mapping, *fsdata = NULL; if (page_has_buffers(page)) { - unlock_page(page); - page_cache_release(page); - *pagep = NULL; - return block_write_begin(mapping, pos, len, flags, pagep, - get_block); + ret = __block_write_begin(page, pos, len, get_block); + if (unlikely(ret)) + goto out_release; + return ret; } if (PageMappedToDisk(page)) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 51bcc5ce3230..e9c874abc9e1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -591,7 +591,6 @@ static int ceph_writepages_start(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - struct backing_dev_info *bdi = mapping->backing_dev_info; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc; pgoff_t index, start, end; @@ -633,13 +632,6 @@ static int ceph_writepages_start(struct address_space *mapping, pagevec_init(&pvec, 0); - /* ?? */ - if (wbc->nonblocking && bdi_write_congested(bdi)) { - dout(" writepages congested\n"); - wbc->encountered_congestion = 1; - goto out_final; - } - /* where to start/end? */ if (wbc->range_cyclic) { start = mapping->writeback_index; /* Start from prev offset */ @@ -885,7 +877,6 @@ out: rc = 0; /* vfs expects us to return 0 */ ceph_put_snap_context(snapc); dout("writepages done, rc = %d\n", rc); -out_final: return rc; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8c81e7b14d53..45af003865d2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1303,7 +1303,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned int bytes_to_write; unsigned int bytes_written; struct cifs_sb_info *cifs_sb; @@ -1326,15 +1325,6 @@ static int cifs_writepages(struct address_space *mapping, int scanned = 0; int xid, long_op; - /* - * BB: Is this meaningful for a non-block-device file system? - * If it is, we should test it again after we do I/O - */ - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - cifs_sb = CIFS_SB(mapping->host->i_sb); /* diff --git a/fs/coda/cache.c b/fs/coda/cache.c index a5bf5771a22a..9060f08e70cf 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -17,6 +17,7 @@ #include <linux/string.h> #include <linux/list.h> #include <linux/sched.h> +#include <linux/spinlock.h> #include <linux/coda.h> #include <linux/coda_linux.h> @@ -31,19 +32,23 @@ void coda_cache_enter(struct inode *inode, int mask) { struct coda_inode_info *cii = ITOC(inode); + spin_lock(&cii->c_lock); cii->c_cached_epoch = atomic_read(&permission_epoch); if (cii->c_uid != current_fsuid()) { cii->c_uid = current_fsuid(); cii->c_cached_perm = mask; } else cii->c_cached_perm |= mask; + spin_unlock(&cii->c_lock); } /* remove cached acl from an inode */ void coda_cache_clear_inode(struct inode *inode) { struct coda_inode_info *cii = ITOC(inode); + spin_lock(&cii->c_lock); cii->c_cached_epoch = atomic_read(&permission_epoch) - 1; + spin_unlock(&cii->c_lock); } /* remove all acl caches */ @@ -57,13 +62,15 @@ void coda_cache_clear_all(struct super_block *sb) int coda_cache_check(struct inode *inode, int mask) { struct coda_inode_info *cii = ITOC(inode); - int hit; + int hit; - hit = (mask & cii->c_cached_perm) == mask && - cii->c_uid == current_fsuid() && - cii->c_cached_epoch == atomic_read(&permission_epoch); + spin_lock(&cii->c_lock); + hit = (mask & cii->c_cached_perm) == mask && + cii->c_uid == current_fsuid() && + cii->c_cached_epoch == atomic_read(&permission_epoch); + spin_unlock(&cii->c_lock); - return hit; + return hit; } diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index a7a780929eec..602240569c89 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -45,13 +45,15 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr) static int coda_test_inode(struct inode *inode, void *data) { struct CodaFid *fid = (struct CodaFid *)data; - return coda_fideq(&(ITOC(inode)->c_fid), fid); + struct coda_inode_info *cii = ITOC(inode); + return coda_fideq(&cii->c_fid, fid); } static int coda_set_inode(struct inode *inode, void *data) { struct CodaFid *fid = (struct CodaFid *)data; - ITOC(inode)->c_fid = *fid; + struct coda_inode_info *cii = ITOC(inode); + cii->c_fid = *fid; return 0; } @@ -71,6 +73,7 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid, cii = ITOC(inode); /* we still need to set i_ino for things like stat(2) */ inode->i_ino = hash; + /* inode is locked and unique, no need to grab cii->c_lock */ cii->c_mapcount = 0; unlock_new_inode(inode); } @@ -107,14 +110,20 @@ int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_bloc } +/* Although we treat Coda file identifiers as immutable, there is one + * special case for files created during a disconnection where they may + * not be globally unique. When an identifier collision is detected we + * first try to flush the cached inode from the kernel and finally + * resort to renaming/rehashing in-place. Userspace remembers both old + * and new values of the identifier to handle any in-flight upcalls. + * The real solution is to use globally unique UUIDs as identifiers, but + * retrofitting the existing userspace code for this is non-trivial. */ void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid, struct CodaFid *newfid) { - struct coda_inode_info *cii; + struct coda_inode_info *cii = ITOC(inode); unsigned long hash = coda_f2i(newfid); - cii = ITOC(inode); - BUG_ON(!coda_fideq(&cii->c_fid, oldfid)); /* replace fid and rehash inode */ diff --git a/fs/coda/dir.c b/fs/coda/dir.c index ccd98b0f2b0b..5d8b35539601 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -17,7 +17,7 @@ #include <linux/stat.h> #include <linux/errno.h> #include <linux/string.h> -#include <linux/smp_lock.h> +#include <linux/spinlock.h> #include <asm/uaccess.h> @@ -116,15 +116,11 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc goto exit; } - lock_kernel(); - error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length, &type, &resfid); if (!error) error = coda_cnode_make(&inode, &resfid, dir->i_sb); - unlock_kernel(); - if (error && error != -ENOENT) return ERR_PTR(error); @@ -140,28 +136,24 @@ exit: int coda_permission(struct inode *inode, int mask) { - int error = 0; + int error; mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (!mask) - return 0; + return 0; if ((mask & MAY_EXEC) && !execute_ok(inode)) return -EACCES; - lock_kernel(); - if (coda_cache_check(inode, mask)) - goto out; + return 0; - error = venus_access(inode->i_sb, coda_i2f(inode), mask); + error = venus_access(inode->i_sb, coda_i2f(inode), mask); if (!error) coda_cache_enter(inode, mask); - out: - unlock_kernel(); return error; } @@ -200,41 +192,34 @@ static inline void coda_dir_drop_nlink(struct inode *dir) /* creation routines: create, mknod, mkdir, link, symlink */ static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd) { - int error=0; + int error; const char *name=de->d_name.name; int length=de->d_name.len; struct inode *inode; struct CodaFid newfid; struct coda_vattr attrs; - lock_kernel(); - - if (coda_isroot(dir) && coda_iscontrol(name, length)) { - unlock_kernel(); + if (coda_isroot(dir) && coda_iscontrol(name, length)) return -EPERM; - } error = venus_create(dir->i_sb, coda_i2f(dir), name, length, 0, mode, &newfid, &attrs); - - if ( error ) { - unlock_kernel(); - d_drop(de); - return error; - } + if (error) + goto err_out; inode = coda_iget(dir->i_sb, &newfid, &attrs); - if ( IS_ERR(inode) ) { - unlock_kernel(); - d_drop(de); - return PTR_ERR(inode); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto err_out; } /* invalidate the directory cnode's attributes */ coda_dir_update_mtime(dir); - unlock_kernel(); d_instantiate(de, inode); return 0; +err_out: + d_drop(de); + return error; } static int coda_mkdir(struct inode *dir, struct dentry *de, int mode) @@ -246,36 +231,29 @@ static int coda_mkdir(struct inode *dir, struct dentry *de, int mode) int error; struct CodaFid newfid; - lock_kernel(); - - if (coda_isroot(dir) && coda_iscontrol(name, len)) { - unlock_kernel(); + if (coda_isroot(dir) && coda_iscontrol(name, len)) return -EPERM; - } attrs.va_mode = mode; error = venus_mkdir(dir->i_sb, coda_i2f(dir), name, len, &newfid, &attrs); - - if ( error ) { - unlock_kernel(); - d_drop(de); - return error; - } + if (error) + goto err_out; inode = coda_iget(dir->i_sb, &newfid, &attrs); - if ( IS_ERR(inode) ) { - unlock_kernel(); - d_drop(de); - return PTR_ERR(inode); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto err_out; } /* invalidate the directory cnode's attributes */ coda_dir_inc_nlink(dir); coda_dir_update_mtime(dir); - unlock_kernel(); d_instantiate(de, inode); return 0; +err_out: + d_drop(de); + return error; } /* try to make de an entry in dir_inodde linked to source_de */ @@ -287,52 +265,38 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode, int len = de->d_name.len; int error; - lock_kernel(); - - if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) { - unlock_kernel(); + if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) return -EPERM; - } error = venus_link(dir_inode->i_sb, coda_i2f(inode), coda_i2f(dir_inode), (const char *)name, len); - if (error) { d_drop(de); - goto out; + return error; } coda_dir_update_mtime(dir_inode); - atomic_inc(&inode->i_count); + ihold(inode); d_instantiate(de, inode); inc_nlink(inode); - -out: - unlock_kernel(); - return(error); + return 0; } static int coda_symlink(struct inode *dir_inode, struct dentry *de, const char *symname) { - const char *name = de->d_name.name; + const char *name = de->d_name.name; int len = de->d_name.len; int symlen; - int error = 0; - - lock_kernel(); + int error; - if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) { - unlock_kernel(); + if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) return -EPERM; - } symlen = strlen(symname); - if ( symlen > CODA_MAXPATHLEN ) { - unlock_kernel(); + if (symlen > CODA_MAXPATHLEN) return -ENAMETOOLONG; - } /* * This entry is now negative. Since we do not create @@ -343,10 +307,9 @@ static int coda_symlink(struct inode *dir_inode, struct dentry *de, symname, symlen); /* mtime is no good anymore */ - if ( !error ) + if (!error) coda_dir_update_mtime(dir_inode); - unlock_kernel(); return error; } @@ -357,17 +320,12 @@ static int coda_unlink(struct inode *dir, struct dentry *de) const char *name = de->d_name.name; int len = de->d_name.len; - lock_kernel(); - error = venus_remove(dir->i_sb, coda_i2f(dir), name, len); - if ( error ) { - unlock_kernel(); + if (error) return error; - } coda_dir_update_mtime(dir); drop_nlink(de->d_inode); - unlock_kernel(); return 0; } @@ -377,8 +335,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) int len = de->d_name.len; int error; - lock_kernel(); - error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); if (!error) { /* VFS may delete the child */ @@ -389,7 +345,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) coda_dir_drop_nlink(dir); coda_dir_update_mtime(dir); } - unlock_kernel(); return error; } @@ -403,15 +358,12 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, int new_length = new_dentry->d_name.len; int error; - lock_kernel(); - error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), coda_i2f(new_dir), old_length, new_length, (const char *) old_name, (const char *)new_name); - - if ( !error ) { - if ( new_dentry->d_inode ) { - if ( S_ISDIR(new_dentry->d_inode->i_mode) ) { + if (!error) { + if (new_dentry->d_inode) { + if (S_ISDIR(new_dentry->d_inode->i_mode)) { coda_dir_drop_nlink(old_dir); coda_dir_inc_nlink(new_dir); } @@ -423,8 +375,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, coda_flag_inode(new_dir, C_VATTR); } } - unlock_kernel(); - return error; } @@ -594,10 +544,7 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd) struct inode *inode = de->d_inode; struct coda_inode_info *cii; - if (!inode) - return 1; - lock_kernel(); - if (coda_isroot(inode)) + if (!inode || coda_isroot(inode)) goto out; if (is_bad_inode(inode)) goto bad; @@ -617,13 +564,12 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd) goto out; /* clear the flags. */ + spin_lock(&cii->c_lock); cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); - + spin_unlock(&cii->c_lock); bad: - unlock_kernel(); return 0; out: - unlock_kernel(); return 1; } @@ -656,20 +602,19 @@ static int coda_dentry_delete(struct dentry * dentry) int coda_revalidate_inode(struct dentry *dentry) { struct coda_vattr attr; - int error = 0; + int error; int old_mode; ino_t old_ino; struct inode *inode = dentry->d_inode; struct coda_inode_info *cii = ITOC(inode); - lock_kernel(); - if ( !cii->c_flags ) - goto ok; + if (!cii->c_flags) + return 0; if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) { error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr); - if ( error ) - goto return_bad; + if (error) + return -EIO; /* this inode may be lost if: - it's ino changed @@ -688,17 +633,13 @@ int coda_revalidate_inode(struct dentry *dentry) /* the following can happen when a local fid is replaced with a global one, here we lose and declare the inode bad */ if (inode->i_ino != old_ino) - goto return_bad; + return -EIO; coda_flag_inode_children(inode, C_FLUSH); + + spin_lock(&cii->c_lock); cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); + spin_unlock(&cii->c_lock); } - -ok: - unlock_kernel(); return 0; - -return_bad: - unlock_kernel(); - return -EIO; } diff --git a/fs/coda/file.c b/fs/coda/file.c index ad3cd2abeeb4..c8b50ba4366a 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -15,7 +15,7 @@ #include <linux/stat.h> #include <linux/cred.h> #include <linux/errno.h> -#include <linux/smp_lock.h> +#include <linux/spinlock.h> #include <linux/string.h> #include <linux/slab.h> #include <asm/uaccess.h> @@ -109,19 +109,24 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) coda_inode = coda_file->f_path.dentry->d_inode; host_inode = host_file->f_path.dentry->d_inode; + + cii = ITOC(coda_inode); + spin_lock(&cii->c_lock); coda_file->f_mapping = host_file->f_mapping; if (coda_inode->i_mapping == &coda_inode->i_data) coda_inode->i_mapping = host_inode->i_mapping; /* only allow additional mmaps as long as userspace isn't changing * the container file on us! */ - else if (coda_inode->i_mapping != host_inode->i_mapping) + else if (coda_inode->i_mapping != host_inode->i_mapping) { + spin_unlock(&cii->c_lock); return -EBUSY; + } /* keep track of how often the coda_inode/host_file has been mmapped */ - cii = ITOC(coda_inode); cii->c_mapcount++; cfi->cfi_mapcount++; + spin_unlock(&cii->c_lock); return host_file->f_op->mmap(host_file, vma); } @@ -138,8 +143,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file) if (!cfi) return -ENOMEM; - lock_kernel(); - error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags, &host_file); if (!host_file) @@ -147,7 +150,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file) if (error) { kfree(cfi); - unlock_kernel(); return error; } @@ -159,8 +161,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file) BUG_ON(coda_file->private_data != NULL); coda_file->private_data = cfi; - - unlock_kernel(); return 0; } @@ -171,9 +171,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file) struct coda_file_info *cfi; struct coda_inode_info *cii; struct inode *host_inode; - int err = 0; - - lock_kernel(); + int err; cfi = CODA_FTOC(coda_file); BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); @@ -185,18 +183,18 @@ int coda_release(struct inode *coda_inode, struct file *coda_file) cii = ITOC(coda_inode); /* did we mmap this file? */ + spin_lock(&cii->c_lock); if (coda_inode->i_mapping == &host_inode->i_data) { cii->c_mapcount -= cfi->cfi_mapcount; if (!cii->c_mapcount) coda_inode->i_mapping = &coda_inode->i_data; } + spin_unlock(&cii->c_lock); fput(cfi->cfi_container); kfree(coda_file->private_data); coda_file->private_data = NULL; - unlock_kernel(); - /* VFS fput ignores the return value from file_operations->release, so * there is no use returning an error here */ return 0; @@ -207,7 +205,7 @@ int coda_fsync(struct file *coda_file, int datasync) struct file *host_file; struct inode *coda_inode = coda_file->f_path.dentry->d_inode; struct coda_file_info *cfi; - int err = 0; + int err; if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) || S_ISLNK(coda_inode->i_mode))) @@ -218,11 +216,8 @@ int coda_fsync(struct file *coda_file, int datasync) host_file = cfi->cfi_container; err = vfs_fsync(host_file, datasync); - if ( !err && !datasync ) { - lock_kernel(); + if (!err && !datasync) err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); - unlock_kernel(); - } return err; } diff --git a/fs/coda/inode.c b/fs/coda/inode.c index bfe8179b1295..7993b96ca348 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -15,7 +15,8 @@ #include <linux/stat.h> #include <linux/errno.h> #include <linux/unistd.h> -#include <linux/smp_lock.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> #include <linux/file.h> #include <linux/vfs.h> #include <linux/slab.h> @@ -51,6 +52,7 @@ static struct inode *coda_alloc_inode(struct super_block *sb) ei->c_flags = 0; ei->c_uid = 0; ei->c_cached_perm = 0; + spin_lock_init(&ei->c_lock); return &ei->vfs_inode; } @@ -143,13 +145,11 @@ static int get_device_index(struct coda_mount_data *data) static int coda_fill_super(struct super_block *sb, void *data, int silent) { struct inode *root = NULL; - struct venus_comm *vc = NULL; + struct venus_comm *vc; struct CodaFid fid; int error; int idx; - lock_kernel(); - idx = get_device_index((struct coda_mount_data *) data); /* Ignore errors in data, for backward compatibility */ @@ -159,23 +159,26 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) printk(KERN_INFO "coda_read_super: device index: %i\n", idx); vc = &coda_comms[idx]; + mutex_lock(&vc->vc_mutex); + if (!vc->vc_inuse) { printk("coda_read_super: No pseudo device\n"); - unlock_kernel(); - return -EINVAL; + error = -EINVAL; + goto unlock_out; } - if ( vc->vc_sb ) { + if (vc->vc_sb) { printk("coda_read_super: Device already mounted\n"); - unlock_kernel(); - return -EBUSY; + error = -EBUSY; + goto unlock_out; } error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); if (error) - goto bdi_err; + goto unlock_out; vc->vc_sb = sb; + mutex_unlock(&vc->vc_mutex); sb->s_fs_info = vc; sb->s_flags |= MS_NOATIME; @@ -204,28 +207,33 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) printk("coda_read_super: rootinode is %ld dev %s\n", root->i_ino, root->i_sb->s_id); sb->s_root = d_alloc_root(root); - if (!sb->s_root) + if (!sb->s_root) { + error = -EINVAL; goto error; - unlock_kernel(); + } return 0; - error: - bdi_destroy(&vc->bdi); - bdi_err: +error: if (root) iput(root); - if (vc) - vc->vc_sb = NULL; - unlock_kernel(); - return -EINVAL; + mutex_lock(&vc->vc_mutex); + bdi_destroy(&vc->bdi); + vc->vc_sb = NULL; + sb->s_fs_info = NULL; +unlock_out: + mutex_unlock(&vc->vc_mutex); + return error; } static void coda_put_super(struct super_block *sb) { - bdi_destroy(&coda_vcp(sb)->bdi); - coda_vcp(sb)->vc_sb = NULL; + struct venus_comm *vcp = coda_vcp(sb); + mutex_lock(&vcp->vc_mutex); + bdi_destroy(&vcp->bdi); + vcp->vc_sb = NULL; sb->s_fs_info = NULL; + mutex_unlock(&vcp->vc_mutex); printk("Coda: Bye bye.\n"); } @@ -251,8 +259,6 @@ int coda_setattr(struct dentry *de, struct iattr *iattr) struct coda_vattr vattr; int error; - lock_kernel(); - memset(&vattr, 0, sizeof(vattr)); inode->i_ctime = CURRENT_TIME_SEC; @@ -262,13 +268,10 @@ int coda_setattr(struct dentry *de, struct iattr *iattr) /* Venus is responsible for truncating the container-file!!! */ error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr); - if ( !error ) { + if (!error) { coda_vattr_to_iattr(inode, &vattr); coda_cache_clear_inode(inode); } - - unlock_kernel(); - return error; } @@ -282,12 +285,8 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf) { int error; - lock_kernel(); - error = venus_statfs(dentry, buf); - unlock_kernel(); - if (error) { /* fake something like AFS does */ buf->f_blocks = 9000000; diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 028a9a0f588b..2fd89b5c5c7b 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -23,8 +23,6 @@ #include <linux/coda_fs_i.h> #include <linux/coda_psdev.h> -#include <linux/smp_lock.h> - /* pioctl ops */ static int coda_ioctl_permission(struct inode *inode, int mask); static long coda_pioctl(struct file *filp, unsigned int cmd, @@ -58,13 +56,9 @@ static long coda_pioctl(struct file *filp, unsigned int cmd, struct inode *target_inode = NULL; struct coda_inode_info *cnp; - lock_kernel(); - /* get the Pioctl data arguments from user space */ - if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { - error = -EINVAL; - goto out; - } + if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) + return -EINVAL; /* * Look up the pathname. Note that the pathname is in @@ -76,13 +70,12 @@ static long coda_pioctl(struct file *filp, unsigned int cmd, error = user_lpath(data.path, &path); if (error) - goto out; - else - target_inode = path.dentry->d_inode; + return error; + + target_inode = path.dentry->d_inode; /* return if it is not a Coda inode */ if (target_inode->i_sb != inode->i_sb) { - path_put(&path); error = -EINVAL; goto out; } @@ -91,10 +84,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd, cnp = ITOC(target_inode); error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); - - path_put(&path); - out: - unlock_kernel(); + path_put(&path); return error; } diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index fdc2f3ef7ecd..62647a8595e4 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -35,7 +35,7 @@ #include <linux/poll.h> #include <linux/init.h> #include <linux/list.h> -#include <linux/smp_lock.h> +#include <linux/mutex.h> #include <linux/device.h> #include <asm/io.h> #include <asm/system.h> @@ -67,8 +67,10 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait) unsigned int mask = POLLOUT | POLLWRNORM; poll_wait(file, &vcp->vc_waitq, wait); + mutex_lock(&vcp->vc_mutex); if (!list_empty(&vcp->vc_pending)) mask |= POLLIN | POLLRDNORM; + mutex_unlock(&vcp->vc_mutex); return mask; } @@ -108,16 +110,9 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, return -EFAULT; if (DOWNCALL(hdr.opcode)) { - struct super_block *sb = NULL; - union outputArgs *dcbuf; + union outputArgs *dcbuf; int size = sizeof(*dcbuf); - sb = vcp->vc_sb; - if ( !sb ) { - count = nbytes; - goto out; - } - if ( nbytes < sizeof(struct coda_out_hdr) ) { printk("coda_downcall opc %d uniq %d, not enough!\n", hdr.opcode, hdr.unique); @@ -137,9 +132,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, } /* what downcall errors does Venus handle ? */ - lock_kernel(); - error = coda_downcall(hdr.opcode, dcbuf, sb); - unlock_kernel(); + error = coda_downcall(vcp, hdr.opcode, dcbuf); CODA_FREE(dcbuf, nbytes); if (error) { @@ -152,7 +145,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, } /* Look for the message on the processing queue. */ - lock_kernel(); + mutex_lock(&vcp->vc_mutex); list_for_each(lh, &vcp->vc_processing) { tmp = list_entry(lh, struct upc_req , uc_chain); if (tmp->uc_unique == hdr.unique) { @@ -161,7 +154,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, break; } } - unlock_kernel(); + mutex_unlock(&vcp->vc_mutex); if (!req) { printk("psdev_write: msg (%d, %d) not found\n", @@ -216,7 +209,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf, if (nbytes == 0) return 0; - lock_kernel(); + mutex_lock(&vcp->vc_mutex); add_wait_queue(&vcp->vc_waitq, &wait); set_current_state(TASK_INTERRUPTIBLE); @@ -230,7 +223,9 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf, retval = -ERESTARTSYS; break; } + mutex_unlock(&vcp->vc_mutex); schedule(); + mutex_lock(&vcp->vc_mutex); } set_current_state(TASK_RUNNING); @@ -263,7 +258,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf, CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr)); kfree(req); out: - unlock_kernel(); + mutex_unlock(&vcp->vc_mutex); return (count ? count : retval); } @@ -276,10 +271,10 @@ static int coda_psdev_open(struct inode * inode, struct file * file) if (idx < 0 || idx >= MAX_CODADEVS) return -ENODEV; - lock_kernel(); - err = -EBUSY; vcp = &coda_comms[idx]; + mutex_lock(&vcp->vc_mutex); + if (!vcp->vc_inuse) { vcp->vc_inuse++; @@ -293,7 +288,7 @@ static int coda_psdev_open(struct inode * inode, struct file * file) err = 0; } - unlock_kernel(); + mutex_unlock(&vcp->vc_mutex); return err; } @@ -308,7 +303,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file) return -1; } - lock_kernel(); + mutex_lock(&vcp->vc_mutex); /* Wakeup clients so they can return. */ list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) { @@ -333,7 +328,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file) file->private_data = NULL; vcp->vc_inuse--; - unlock_kernel(); + mutex_unlock(&vcp->vc_mutex); return 0; } @@ -362,9 +357,11 @@ static int init_coda_psdev(void) err = PTR_ERR(coda_psdev_class); goto out_chrdev; } - for (i = 0; i < MAX_CODADEVS; i++) + for (i = 0; i < MAX_CODADEVS; i++) { + mutex_init(&(&coda_comms[i])->vc_mutex); device_create(coda_psdev_class, NULL, MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i); + } coda_sysctl_init(); goto out; diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c index 4513b7258458..af78f007a2b0 100644 --- a/fs/coda/symlink.c +++ b/fs/coda/symlink.c @@ -14,7 +14,6 @@ #include <linux/stat.h> #include <linux/errno.h> #include <linux/pagemap.h> -#include <linux/smp_lock.h> #include <linux/coda.h> #include <linux/coda_linux.h> @@ -29,11 +28,9 @@ static int coda_symlink_filler(struct file *file, struct page *page) unsigned int len = PAGE_SIZE; char *p = kmap(page); - lock_kernel(); cii = ITOC(inode); error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len); - unlock_kernel(); if (error) goto fail; SetPageUptodate(page); diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index b8893ab6f9e6..c3563cab9758 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -27,6 +27,7 @@ #include <linux/errno.h> #include <linux/string.h> #include <linux/slab.h> +#include <linux/mutex.h> #include <asm/uaccess.h> #include <linux/vmalloc.h> #include <linux/vfs.h> @@ -606,7 +607,8 @@ static void coda_unblock_signals(sigset_t *old) (r)->uc_opcode != CODA_RELEASE) || \ (r)->uc_flags & CODA_REQ_READ)) -static inline void coda_waitfor_upcall(struct upc_req *req) +static inline void coda_waitfor_upcall(struct venus_comm *vcp, + struct upc_req *req) { DECLARE_WAITQUEUE(wait, current); unsigned long timeout = jiffies + coda_timeout * HZ; @@ -639,10 +641,12 @@ static inline void coda_waitfor_upcall(struct upc_req *req) break; } + mutex_unlock(&vcp->vc_mutex); if (blocked) schedule_timeout(HZ); else schedule(); + mutex_lock(&vcp->vc_mutex); } if (blocked) coda_unblock_signals(&old); @@ -667,18 +671,23 @@ static int coda_upcall(struct venus_comm *vcp, { union outputArgs *out; union inputArgs *sig_inputArgs; - struct upc_req *req, *sig_req; - int error = 0; + struct upc_req *req = NULL, *sig_req; + int error; + + mutex_lock(&vcp->vc_mutex); if (!vcp->vc_inuse) { printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n"); - return -ENXIO; + error = -ENXIO; + goto exit; } /* Format the request message. */ req = kmalloc(sizeof(struct upc_req), GFP_KERNEL); - if (!req) - return -ENOMEM; + if (!req) { + error = -ENOMEM; + goto exit; + } req->uc_data = (void *)buffer; req->uc_flags = 0; @@ -705,7 +714,7 @@ static int coda_upcall(struct venus_comm *vcp, * ENODEV. */ /* Go to sleep. Wake up on signals only after the timeout. */ - coda_waitfor_upcall(req); + coda_waitfor_upcall(vcp, req); /* Op went through, interrupt or not... */ if (req->uc_flags & CODA_REQ_WRITE) { @@ -759,6 +768,7 @@ static int coda_upcall(struct venus_comm *vcp, exit: kfree(req); + mutex_unlock(&vcp->vc_mutex); return error; } @@ -796,21 +806,24 @@ exit: * * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */ -int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb) +int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out) { struct inode *inode = NULL; - struct CodaFid *fid, *newfid; + struct CodaFid *fid = NULL, *newfid; + struct super_block *sb; /* Handle invalidation requests. */ - if ( !sb || !sb->s_root) - return 0; + mutex_lock(&vcp->vc_mutex); + sb = vcp->vc_sb; + if (!sb || !sb->s_root) + goto unlock_out; switch (opcode) { case CODA_FLUSH: coda_cache_clear_all(sb); shrink_dcache_sb(sb); if (sb->s_root->d_inode) - coda_flag_inode(sb->s_root->d_inode, C_FLUSH); + coda_flag_inode(sb->s_root->d_inode, C_FLUSH); break; case CODA_PURGEUSER: @@ -819,45 +832,53 @@ int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb) case CODA_ZAPDIR: fid = &out->coda_zapdir.CodaFid; - inode = coda_fid_to_inode(fid, sb); - if (inode) { - coda_flag_inode_children(inode, C_PURGE); - coda_flag_inode(inode, C_VATTR); - } break; case CODA_ZAPFILE: fid = &out->coda_zapfile.CodaFid; - inode = coda_fid_to_inode(fid, sb); - if (inode) - coda_flag_inode(inode, C_VATTR); break; case CODA_PURGEFID: fid = &out->coda_purgefid.CodaFid; + break; + + case CODA_REPLACE: + fid = &out->coda_replace.OldFid; + break; + } + if (fid) inode = coda_fid_to_inode(fid, sb); - if (inode) { - coda_flag_inode_children(inode, C_PURGE); - /* catch the dentries later if some are still busy */ - coda_flag_inode(inode, C_PURGE); - d_prune_aliases(inode); +unlock_out: + mutex_unlock(&vcp->vc_mutex); - } + if (!inode) + return 0; + + switch (opcode) { + case CODA_ZAPDIR: + coda_flag_inode_children(inode, C_PURGE); + coda_flag_inode(inode, C_VATTR); + break; + + case CODA_ZAPFILE: + coda_flag_inode(inode, C_VATTR); + break; + + case CODA_PURGEFID: + coda_flag_inode_children(inode, C_PURGE); + + /* catch the dentries later if some are still busy */ + coda_flag_inode(inode, C_PURGE); + d_prune_aliases(inode); break; case CODA_REPLACE: - fid = &out->coda_replace.OldFid; newfid = &out->coda_replace.NewFid; - inode = coda_fid_to_inode(fid, sb); - if (inode) - coda_replace_fid(inode, fid, newfid); + coda_replace_fid(inode, fid, newfid); break; } - - if (inode) - iput(inode); - + iput(inode); return 0; } diff --git a/fs/compat.c b/fs/compat.c index 0644a154672b..f03abdadc401 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1963,7 +1963,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, } #endif /* HAVE_SET_RESTORE_SIGMASK */ -#if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE) +#if (defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED) /* Stuff for NFS server syscalls... */ struct compat_nfsctl_svc { u16 svc32_port; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index cf78d44a8d6a..253476d78ed8 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -135,6 +135,7 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd) { struct inode * inode = new_inode(configfs_sb); if (inode) { + inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &configfs_aops; inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; inode->i_op = &configfs_inode_operations; diff --git a/fs/dcache.c b/fs/dcache.c index 83293be48149..23702a9d4e6d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -67,33 +67,43 @@ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; -static void __d_free(struct dentry *dentry) +static struct percpu_counter nr_dentry __cacheline_aligned_in_smp; +static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp; + +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry); + dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused); + return proc_dointvec(table, write, buffer, lenp, ppos); +} +#endif + +static void __d_free(struct rcu_head *head) { + struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); + WARN_ON(!list_empty(&dentry->d_alias)); if (dname_external(dentry)) kfree(dentry->d_name.name); kmem_cache_free(dentry_cache, dentry); } -static void d_callback(struct rcu_head *head) -{ - struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu); - __d_free(dentry); -} - /* - * no dcache_lock, please. The caller must decrement dentry_stat.nr_dentry - * inside dcache_lock. + * no dcache_lock, please. */ static void d_free(struct dentry *dentry) { + percpu_counter_dec(&nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); + /* if dentry was never inserted into hash, immediate free is OK */ if (hlist_unhashed(&dentry->d_hash)) - __d_free(dentry); + __d_free(&dentry->d_u.d_rcu); else - call_rcu(&dentry->d_u.d_rcu, d_callback); + call_rcu(&dentry->d_u.d_rcu, __d_free); } /* @@ -123,37 +133,34 @@ static void dentry_iput(struct dentry * dentry) } /* - * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held. + * dentry_lru_(add|del|move_tail) must be called with dcache_lock held. */ static void dentry_lru_add(struct dentry *dentry) { - list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); - dentry->d_sb->s_nr_dentry_unused++; - dentry_stat.nr_unused++; -} - -static void dentry_lru_add_tail(struct dentry *dentry) -{ - list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); - dentry->d_sb->s_nr_dentry_unused++; - dentry_stat.nr_unused++; + if (list_empty(&dentry->d_lru)) { + list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + dentry->d_sb->s_nr_dentry_unused++; + percpu_counter_inc(&nr_dentry_unused); + } } static void dentry_lru_del(struct dentry *dentry) { if (!list_empty(&dentry->d_lru)) { - list_del(&dentry->d_lru); + list_del_init(&dentry->d_lru); dentry->d_sb->s_nr_dentry_unused--; - dentry_stat.nr_unused--; + percpu_counter_dec(&nr_dentry_unused); } } -static void dentry_lru_del_init(struct dentry *dentry) +static void dentry_lru_move_tail(struct dentry *dentry) { - if (likely(!list_empty(&dentry->d_lru))) { - list_del_init(&dentry->d_lru); - dentry->d_sb->s_nr_dentry_unused--; - dentry_stat.nr_unused--; + if (list_empty(&dentry->d_lru)) { + list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + dentry->d_sb->s_nr_dentry_unused++; + percpu_counter_inc(&nr_dentry_unused); + } else { + list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); } } @@ -172,7 +179,6 @@ static struct dentry *d_kill(struct dentry *dentry) struct dentry *parent; list_del(&dentry->d_u.d_child); - dentry_stat.nr_dentry--; /* For d_free, below */ /*drops the locks, at that point nobody can reach this dentry */ dentry_iput(dentry); if (IS_ROOT(dentry)) @@ -237,13 +243,15 @@ repeat: if (dentry->d_op->d_delete(dentry)) goto unhash_it; } + /* Unreachable? Get rid of it */ if (d_unhashed(dentry)) goto kill_it; - if (list_empty(&dentry->d_lru)) { - dentry->d_flags |= DCACHE_REFERENCED; - dentry_lru_add(dentry); - } + + /* Otherwise leave it cached and ensure it's on the LRU */ + dentry->d_flags |= DCACHE_REFERENCED; + dentry_lru_add(dentry); + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); return; @@ -318,11 +326,10 @@ int d_invalidate(struct dentry * dentry) EXPORT_SYMBOL(d_invalidate); /* This should be called _only_ with dcache_lock held */ - static inline struct dentry * __dget_locked(struct dentry *dentry) { atomic_inc(&dentry->d_count); - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); return dentry; } @@ -441,73 +448,27 @@ static void prune_one_dentry(struct dentry * dentry) if (dentry->d_op && dentry->d_op->d_delete) dentry->d_op->d_delete(dentry); - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); __d_drop(dentry); dentry = d_kill(dentry); spin_lock(&dcache_lock); } } -/* - * Shrink the dentry LRU on a given superblock. - * @sb : superblock to shrink dentry LRU. - * @count: If count is NULL, we prune all dentries on superblock. - * @flags: If flags is non-zero, we need to do special processing based on - * which flags are set. This means we don't need to maintain multiple - * similar copies of this loop. - */ -static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) +static void shrink_dentry_list(struct list_head *list) { - LIST_HEAD(referenced); - LIST_HEAD(tmp); struct dentry *dentry; - int cnt = 0; - BUG_ON(!sb); - BUG_ON((flags & DCACHE_REFERENCED) && count == NULL); - spin_lock(&dcache_lock); - if (count != NULL) - /* called from prune_dcache() and shrink_dcache_parent() */ - cnt = *count; -restart: - if (count == NULL) - list_splice_init(&sb->s_dentry_lru, &tmp); - else { - while (!list_empty(&sb->s_dentry_lru)) { - dentry = list_entry(sb->s_dentry_lru.prev, - struct dentry, d_lru); - BUG_ON(dentry->d_sb != sb); + while (!list_empty(list)) { + dentry = list_entry(list->prev, struct dentry, d_lru); + dentry_lru_del(dentry); - spin_lock(&dentry->d_lock); - /* - * If we are honouring the DCACHE_REFERENCED flag and - * the dentry has this flag set, don't free it. Clear - * the flag and put it back on the LRU. - */ - if ((flags & DCACHE_REFERENCED) - && (dentry->d_flags & DCACHE_REFERENCED)) { - dentry->d_flags &= ~DCACHE_REFERENCED; - list_move(&dentry->d_lru, &referenced); - spin_unlock(&dentry->d_lock); - } else { - list_move_tail(&dentry->d_lru, &tmp); - spin_unlock(&dentry->d_lock); - cnt--; - if (!cnt) - break; - } - cond_resched_lock(&dcache_lock); - } - } - while (!list_empty(&tmp)) { - dentry = list_entry(tmp.prev, struct dentry, d_lru); - dentry_lru_del_init(dentry); - spin_lock(&dentry->d_lock); /* * We found an inuse dentry which was not removed from * the LRU because of laziness during lookup. Do not free * it - just keep it off the LRU list. */ + spin_lock(&dentry->d_lock); if (atomic_read(&dentry->d_count)) { spin_unlock(&dentry->d_lock); continue; @@ -516,13 +477,60 @@ restart: /* dentry->d_lock was dropped in prune_one_dentry() */ cond_resched_lock(&dcache_lock); } - if (count == NULL && !list_empty(&sb->s_dentry_lru)) - goto restart; - if (count != NULL) - *count = cnt; +} + +/** + * __shrink_dcache_sb - shrink the dentry LRU on a given superblock + * @sb: superblock to shrink dentry LRU. + * @count: number of entries to prune + * @flags: flags to control the dentry processing + * + * If flags contains DCACHE_REFERENCED reference dentries will not be pruned. + */ +static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) +{ + /* called from prune_dcache() and shrink_dcache_parent() */ + struct dentry *dentry; + LIST_HEAD(referenced); + LIST_HEAD(tmp); + int cnt = *count; + + spin_lock(&dcache_lock); + while (!list_empty(&sb->s_dentry_lru)) { + dentry = list_entry(sb->s_dentry_lru.prev, + struct dentry, d_lru); + BUG_ON(dentry->d_sb != sb); + + /* + * If we are honouring the DCACHE_REFERENCED flag and the + * dentry has this flag set, don't free it. Clear the flag + * and put it back on the LRU. + */ + if (flags & DCACHE_REFERENCED) { + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_REFERENCED) { + dentry->d_flags &= ~DCACHE_REFERENCED; + list_move(&dentry->d_lru, &referenced); + spin_unlock(&dentry->d_lock); + cond_resched_lock(&dcache_lock); + continue; + } + spin_unlock(&dentry->d_lock); + } + + list_move_tail(&dentry->d_lru, &tmp); + if (!--cnt) + break; + cond_resched_lock(&dcache_lock); + } + + *count = cnt; + shrink_dentry_list(&tmp); + if (!list_empty(&referenced)) list_splice(&referenced, &sb->s_dentry_lru); spin_unlock(&dcache_lock); + } /** @@ -538,7 +546,7 @@ static void prune_dcache(int count) { struct super_block *sb, *p = NULL; int w_count; - int unused = dentry_stat.nr_unused; + int unused = percpu_counter_sum_positive(&nr_dentry_unused); int prune_ratio; int pruned; @@ -608,13 +616,19 @@ static void prune_dcache(int count) * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock * - * Shrink the dcache for the specified super block. This - * is used to free the dcache before unmounting a file - * system + * Shrink the dcache for the specified super block. This is used to free + * the dcache before unmounting a file system. */ -void shrink_dcache_sb(struct super_block * sb) +void shrink_dcache_sb(struct super_block *sb) { - __shrink_dcache_sb(sb, NULL, 0); + LIST_HEAD(tmp); + + spin_lock(&dcache_lock); + while (!list_empty(&sb->s_dentry_lru)) { + list_splice_init(&sb->s_dentry_lru, &tmp); + shrink_dentry_list(&tmp); + } + spin_unlock(&dcache_lock); } EXPORT_SYMBOL(shrink_dcache_sb); @@ -632,7 +646,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) /* detach this root from the system */ spin_lock(&dcache_lock); - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); __d_drop(dentry); spin_unlock(&dcache_lock); @@ -646,7 +660,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) spin_lock(&dcache_lock); list_for_each_entry(loop, &dentry->d_subdirs, d_u.d_child) { - dentry_lru_del_init(loop); + dentry_lru_del(loop); __d_drop(loop); cond_resched_lock(&dcache_lock); } @@ -703,20 +717,13 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) * otherwise we ascend to the parent and move to the * next sibling if there is one */ if (!parent) - goto out; - + return; dentry = parent; - } while (list_empty(&dentry->d_subdirs)); dentry = list_entry(dentry->d_subdirs.next, struct dentry, d_u.d_child); } -out: - /* several dentries were freed, need to correct nr_dentry */ - spin_lock(&dcache_lock); - dentry_stat.nr_dentry -= detached; - spin_unlock(&dcache_lock); } /* @@ -830,14 +837,15 @@ resume: struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; - dentry_lru_del_init(dentry); /* * move only zero ref count dentries to the end * of the unused list for prune_dcache */ if (!atomic_read(&dentry->d_count)) { - dentry_lru_add_tail(dentry); + dentry_lru_move_tail(dentry); found++; + } else { + dentry_lru_del(dentry); } /* @@ -900,12 +908,16 @@ EXPORT_SYMBOL(shrink_dcache_parent); */ static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { + int nr_unused; + if (nr) { if (!(gfp_mask & __GFP_FS)) return -1; prune_dcache(nr); } - return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; + + nr_unused = percpu_counter_sum_positive(&nr_dentry_unused); + return (nr_unused / 100) * sysctl_vfs_cache_pressure; } static struct shrinker dcache_shrinker = { @@ -972,9 +984,10 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) spin_lock(&dcache_lock); if (parent) list_add(&dentry->d_u.d_child, &parent->d_subdirs); - dentry_stat.nr_dentry++; spin_unlock(&dcache_lock); + percpu_counter_inc(&nr_dentry); + return dentry; } EXPORT_SYMBOL(d_alloc); @@ -1478,33 +1491,26 @@ out: * This is used by ncpfs in its readdir implementation. * Zero is returned in the dentry is invalid. */ - -int d_validate(struct dentry *dentry, struct dentry *dparent) +int d_validate(struct dentry *dentry, struct dentry *parent) { - struct hlist_head *base; - struct hlist_node *lhp; + struct hlist_head *head = d_hash(parent, dentry->d_name.hash); + struct hlist_node *node; + struct dentry *d; /* Check whether the ptr might be valid at all.. */ if (!kmem_ptr_validate(dentry_cache, dentry)) - goto out; - - if (dentry->d_parent != dparent) - goto out; + return 0; + if (dentry->d_parent != parent) + return 0; - spin_lock(&dcache_lock); - base = d_hash(dparent, dentry->d_name.hash); - hlist_for_each(lhp,base) { - /* hlist_for_each_entry_rcu() not required for d_hash list - * as it is parsed under dcache_lock - */ - if (dentry == hlist_entry(lhp, struct dentry, d_hash)) { - __dget_locked(dentry); - spin_unlock(&dcache_lock); + rcu_read_lock(); + hlist_for_each_entry_rcu(d, node, head, d_hash) { + if (d == dentry) { + dget(dentry); return 1; } } - spin_unlock(&dcache_lock); -out: + rcu_read_unlock(); return 0; } EXPORT_SYMBOL(d_validate); @@ -1994,7 +2000,7 @@ global_root: * Returns a pointer into the buffer or an error code if the * path was too long. * - * "buflen" should be positive. Caller holds the dcache_lock. + * "buflen" should be positive. * * If path is not reachable from the supplied root, then the value of * root is changed (without modifying refcounts). @@ -2006,10 +2012,12 @@ char *__d_path(const struct path *path, struct path *root, int error; prepend(&res, &buflen, "\0", 1); + spin_lock(&dcache_lock); error = prepend_path(path, root, &res, &buflen); + spin_unlock(&dcache_lock); + if (error) return ERR_PTR(error); - return res; } @@ -2419,6 +2427,9 @@ static void __init dcache_init(void) { int loop; + percpu_counter_init(&nr_dentry, 0); + percpu_counter_init(&nr_dentry_unused, 0); + /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 30a87b3dbcac..a4ed8380e98a 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -40,6 +40,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d struct inode *inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch (mode & S_IFMT) { diff --git a/fs/direct-io.c b/fs/direct-io.c index 48d74c7391d1..85882f6ba5f7 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio) * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) +static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async) { ssize_t transferred = 0; diff --git a/fs/exec.c b/fs/exec.c index 6d2b6f936858..3aa75b8888a1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -54,6 +54,7 @@ #include <linux/fsnotify.h> #include <linux/fs_struct.h> #include <linux/pipe_fs_i.h> +#include <linux/oom.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -759,6 +760,10 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); + if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + atomic_dec(&old_mm->oom_disable_count); + atomic_inc(&tsk->mm->oom_disable_count); + } task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index d91e9d829bc1..dcc941d82d67 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -420,7 +420,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de, err = exofs_write_begin(NULL, page->mapping, pos, len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); if (err) - EXOFS_ERR("exofs_set_link: exofs_write_begin FAILD => %d\n", + EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n", err); de->inode_no = cpu_to_le64(inode->i_ino); @@ -556,7 +556,7 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page) err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0, &page, NULL); if (err) - EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILD => %d\n", + EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n", err); if (pde) pde->rec_len = cpu_to_le16(to - from); diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 68cb23e3bb98..b905c79b4f0a 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -46,10 +46,6 @@ static int exofs_file_fsync(struct file *filp, int datasync) { int ret; struct inode *inode = filp->f_mapping->host; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* metadata-only; caller takes care of data */ - }; struct super_block *sb; if (!(inode->i_state & I_DIRTY)) @@ -57,7 +53,7 @@ static int exofs_file_fsync(struct file *filp, int datasync) if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) return 0; - ret = sync_inode(inode, &wbc); + ret = sync_inode_metadata(inode, 1); /* This is a good place to write the sb */ /* TODO: Sechedule an sb-sync on create */ diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 44602754f758..42685424817b 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -185,7 +185,7 @@ static void update_write_page(struct page *page, int ret) /* Called at the end of reads, to optionally unlock pages and update their * status. */ -static int __readpages_done(struct page_collect *pcol, bool do_unlock) +static int __readpages_done(struct page_collect *pcol) { int i; u64 resid; @@ -221,7 +221,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock) page_stat ? "bad_bytes" : "good_bytes"); ret = update_read_page(page, page_stat); - if (do_unlock) + if (!pcol->read_4_write) unlock_page(page); length += PAGE_SIZE; } @@ -236,7 +236,7 @@ static void readpages_done(struct exofs_io_state *ios, void *p) { struct page_collect *pcol = p; - __readpages_done(pcol, true); + __readpages_done(pcol); atomic_dec(&pcol->sbi->s_curr_pending); kfree(pcol); } @@ -257,7 +257,7 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) } } -static int read_exec(struct page_collect *pcol, bool is_sync) +static int read_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); struct exofs_io_state *ios = pcol->ios; @@ -267,17 +267,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync) if (!pcol->pages) return 0; - /* see comment in _readpage() about sync reads */ - WARN_ON(is_sync && (pcol->nr_pages != 1)); - ios->pages = pcol->pages; ios->nr_pages = pcol->nr_pages; ios->length = pcol->length; ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; - if (is_sync) { + if (pcol->read_4_write) { exofs_oi_read(oi, pcol->ios); - return __readpages_done(pcol, false); + return __readpages_done(pcol); } pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); @@ -303,7 +300,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync) return 0; err: - if (!is_sync) + if (!pcol->read_4_write) _unlock_pcol_pages(pcol, ret, READ); pcol_free(pcol); @@ -356,7 +353,7 @@ static int readpage_strip(void *data, struct page *page) EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," " splitting\n", inode->i_ino, page->index); - return read_exec(pcol, false); + return read_exec(pcol); } try_again: @@ -366,7 +363,7 @@ try_again: } else if (unlikely((pcol->pg_first + pcol->nr_pages) != page->index)) { /* Discontinuity detected, split the request */ - ret = read_exec(pcol, false); + ret = read_exec(pcol); if (unlikely(ret)) goto fail; goto try_again; @@ -391,7 +388,7 @@ try_again: page, len, pcol->nr_pages, pcol->length); /* split the request, and start again with current page */ - ret = read_exec(pcol, false); + ret = read_exec(pcol); if (unlikely(ret)) goto fail; @@ -420,27 +417,24 @@ static int exofs_readpages(struct file *file, struct address_space *mapping, return ret; } - return read_exec(&pcol, false); + return read_exec(&pcol); } -static int _readpage(struct page *page, bool is_sync) +static int _readpage(struct page *page, bool read_4_write) { struct page_collect pcol; int ret; _pcol_init(&pcol, 1, page->mapping->host); - /* readpage_strip might call read_exec(,is_sync==false) at several - * places but not if we have a single page. - */ - pcol.read_4_write = is_sync; + pcol.read_4_write = read_4_write; ret = readpage_strip(&pcol, page); if (ret) { EXOFS_ERR("_readpage => %d\n", ret); return ret; } - return read_exec(&pcol, is_sync); + return read_exec(&pcol); } /* @@ -1036,6 +1030,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); } + inode->i_mapping->backing_dev_info = sb->s_bdi; if (S_ISREG(inode->i_mode)) { inode->i_op = &exofs_file_inode_operations; inode->i_fop = &exofs_file_operations; @@ -1072,8 +1067,10 @@ bad_inode: int __exofs_wait_obj_created(struct exofs_i_info *oi) { if (!obj_created(oi)) { + EXOFS_DBGMSG("!obj_created\n"); BUG_ON(!obj_2bcreated(oi)); wait_event(oi->i_wq, obj_created(oi)); + EXOFS_DBGMSG("wait_event done\n"); } return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; } @@ -1107,7 +1104,6 @@ static void create_done(struct exofs_io_state *ios, void *p) set_obj_created(oi); - atomic_dec(&inode->i_count); wake_up(&oi->i_wq); } @@ -1135,6 +1131,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) sbi = sb->s_fs_info; + inode->i_mapping->backing_dev_info = sb->s_bdi; sb->s_dirt = 1; inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; @@ -1157,17 +1154,11 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) ios->obj.id = exofs_oi_objno(oi); exofs_make_credential(oi->i_cred, &ios->obj); - /* increment the refcount so that the inode will still be around when we - * reach the callback - */ - atomic_inc(&inode->i_count); - ios->done = create_done; ios->private = inode; ios->cred = oi->i_cred; ret = exofs_sbi_create(ios); if (ret) { - atomic_dec(&inode->i_count); exofs_put_io_state(ios); return ERR_PTR(ret); } @@ -1257,12 +1248,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) ios->out_attr_len = 1; ios->out_attr = &attr; - if (!obj_created(oi)) { - EXOFS_DBGMSG("!obj_created\n"); - BUG_ON(!obj_2bcreated(oi)); - wait_event(oi->i_wq, obj_created(oi)); - EXOFS_DBGMSG("wait_event done\n"); - } + wait_obj_created(oi); if (!do_sync) { args->sbi = sbi; @@ -1325,12 +1311,12 @@ void exofs_evict_inode(struct inode *inode) inode->i_size = 0; end_writeback(inode); - /* if we are deleting an obj that hasn't been created yet, wait */ - if (!obj_created(oi)) { - BUG_ON(!obj_2bcreated(oi)); - wait_event(oi->i_wq, obj_created(oi)); - /* ignore the error attempt a remove anyway */ - } + /* if we are deleting an obj that hasn't been created yet, wait. + * This also makes sure that create_done cannot be called with an + * already evicted inode. + */ + wait_obj_created(oi); + /* ignore the error, attempt a remove anyway */ /* Now Remove the OSD objects */ ret = exofs_get_io_state(&sbi->layout, &ios); diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index b7dd0c236863..264e95d02830 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir, inode->i_ctime = CURRENT_TIME; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + ihold(inode); return exofs_add_nondir(dentry, inode); } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index e9e175949a63..51b304056f10 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -74,21 +74,20 @@ static struct dentry * find_disconnected_root(struct dentry *dentry) { dget(dentry); - spin_lock(&dentry->d_lock); - while (!IS_ROOT(dentry) && - (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) { - struct dentry *parent = dentry->d_parent; - dget(parent); - spin_unlock(&dentry->d_lock); + while (!IS_ROOT(dentry)) { + struct dentry *parent = dget_parent(dentry); + + if (!(parent->d_flags & DCACHE_DISCONNECTED)) { + dput(parent); + break; + } + dput(dentry); dentry = parent; - spin_lock(&dentry->d_lock); } - spin_unlock(&dentry->d_lock); return dentry; } - /* * Make sure target_dir is fully connected to the dentry tree. * diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 764109886ec0..2709b34206ab 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -98,7 +98,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) if (IS_DIRSYNC(dir)) { err = write_one_page(page, 1); if (!err) - err = ext2_sync_inode(dir); + err = sync_inode_metadata(dir, 1); } else { unlock_page(page); } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 416daa62242c..6346a2acf326 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -120,7 +120,6 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned); extern struct inode *ext2_iget (struct super_block *, unsigned long); extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_evict_inode(struct inode *); -extern int ext2_sync_inode (struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); extern int ext2_setattr (struct dentry *, struct iattr *); extern void ext2_set_inode_flags(struct inode *inode); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 533699c16040..40ad210a5049 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1203,7 +1203,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; if (inode_needs_sync(inode)) { sync_mapping_buffers(inode->i_mapping); - ext2_sync_inode (inode); + sync_inode_metadata(inode, 1); } else { mark_inode_dirty(inode); } @@ -1523,15 +1523,6 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } -int ext2_sync_inode(struct inode *inode) -{ - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* sys_fsync did this */ - }; - return sync_inode(inode, &wbc); -} - int ext2_setattr(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = dentry->d_inode; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 71efb0e9a3f2..f8aecd2e3297 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -206,7 +206,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + ihold(inode); err = ext2_add_link(dentry, inode); if (!err) { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 85df87d0f7b7..0901320671da 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1221,9 +1221,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) } es = sbi->s_es; - if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != - (old_mount_opt & EXT2_MOUNT_XIP)) && - invalidate_inodes(sb)) { + if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) { ext2_msg(sb, KERN_WARNING, "warning: refusing change of " "xip flag with busy inodes while remounting"); sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 8c29ae15129e..f84700be3274 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -699,7 +699,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; inode->i_ctime = CURRENT_TIME_SEC; if (IS_SYNC(inode)) { - error = ext2_sync_inode (inode); + error = sync_inode_metadata(inode, 1); /* In case sync failed due to ENOSPC the inode was actually * written (only some dirty data were not) so we just proceed * as if nothing happened and cleanup the unused block */ diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 5e0faf4cda79..ad05353040a1 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1696,8 +1696,8 @@ static int ext3_journalled_writepage(struct page *page, * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext3_get_block); + ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE, + ext3_get_block); if (ret != 0) { ext3_journal_stop(handle); goto out_unlock; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 2b35ddb70d65..bce9dce639b8 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2260,7 +2260,7 @@ retry: inode->i_ctime = CURRENT_TIME_SEC; inc_nlink(inode); - atomic_inc(&inode->i_count); + ihold(inode); err = ext3_add_entry(handle, dentry, inode); if (!err) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4b8debeb3965..49635ef236f8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1538,10 +1538,10 @@ static int do_journal_get_write_access(handle_t *handle, if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; /* - * __block_prepare_write() could have dirtied some buffers. Clean + * __block_write_begin() could have dirtied some buffers. Clean * the dirty bit as jbd2_journal_get_write_access() could complain * otherwise about fs integrity issues. Setting of the dirty bit - * by __block_prepare_write() isn't a real problem here as we clear + * by __block_write_begin() isn't a real problem here as we clear * the bit before releasing a page lock and thus writeback cannot * ever write the buffer. */ @@ -2550,8 +2550,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, if (buffer_delay(bh)) return 0; /* Not sure this could or should happen */ /* - * XXX: __block_prepare_write() unmaps passed block, - * is it OK? + * XXX: __block_write_begin() unmaps passed block, is it OK? */ ret = ext4_da_reserve_space(inode, iblock); if (ret) @@ -2583,7 +2582,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, /* * This function is used as a standard get_block_t calback function * when there is no desire to allocate any blocks. It is used as a - * callback function for block_prepare_write() and block_write_full_page(). + * callback function for block_write_begin() and block_write_full_page(). * These functions should only try to map a single block at a time. * * Since this function doesn't do block allocations even if the caller @@ -2743,7 +2742,7 @@ static int ext4_writepage(struct page *page, * all are mapped and non delay. We don't want to * do block allocation here. */ - ret = block_prepare_write(page, 0, len, + ret = __block_write_begin(page, 0, len, noalloc_get_block_write); if (!ret) { page_bufs = page_buffers(page); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 19aa0d44d822..42f77b1dc72d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2373,6 +2373,7 @@ static int ext4_mb_init_backend(struct super_block *sb) printk(KERN_ERR "EXT4-fs: can't get new inode\n"); goto err_freesgi; } + sbi->s_buddy_cache->i_ino = get_next_ino(); EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 314c0d3b3fa9..bd39885b5998 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2312,7 +2312,7 @@ retry: inode->i_ctime = ext4_current_time(inode); ext4_inc_count(handle, inode); - atomic_inc(&inode->i_count); + ihold(inode); err = ext4_add_entry(handle, dentry, inode); if (!err) { diff --git a/fs/fcntl.c b/fs/fcntl.c index f8cc34f542c3..ecc8b3954ed6 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -640,7 +640,7 @@ static void fasync_free_rcu(struct rcu_head *head) * match the state "is the filp on a fasync list". * */ -static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) +int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) { struct fasync_struct *fa, **fp; int result = 0; @@ -666,21 +666,31 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) return result; } +struct fasync_struct *fasync_alloc(void) +{ + return kmem_cache_alloc(fasync_cache, GFP_KERNEL); +} + /* - * Add a fasync entry. Return negative on error, positive if - * added, and zero if did nothing but change an existing one. + * NOTE! This can be used only for unused fasync entries: + * entries that actually got inserted on the fasync list + * need to be released by rcu - see fasync_remove_entry. + */ +void fasync_free(struct fasync_struct *new) +{ + kmem_cache_free(fasync_cache, new); +} + +/* + * Insert a new entry into the fasync list. Return the pointer to the + * old one if we didn't use the new one. * * NOTE! It is very important that the FASYNC flag always * match the state "is the filp on a fasync list". */ -static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) +struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new) { - struct fasync_struct *new, *fa, **fp; - int result = 0; - - new = kmem_cache_alloc(fasync_cache, GFP_KERNEL); - if (!new) - return -ENOMEM; + struct fasync_struct *fa, **fp; spin_lock(&filp->f_lock); spin_lock(&fasync_lock); @@ -691,8 +701,6 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa spin_lock_irq(&fa->fa_lock); fa->fa_fd = fd; spin_unlock_irq(&fa->fa_lock); - - kmem_cache_free(fasync_cache, new); goto out; } @@ -702,13 +710,39 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa new->fa_fd = fd; new->fa_next = *fapp; rcu_assign_pointer(*fapp, new); - result = 1; filp->f_flags |= FASYNC; out: spin_unlock(&fasync_lock); spin_unlock(&filp->f_lock); - return result; + return fa; +} + +/* + * Add a fasync entry. Return negative on error, positive if + * added, and zero if did nothing but change an existing one. + */ +static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) +{ + struct fasync_struct *new; + + new = fasync_alloc(); + if (!new) + return -ENOMEM; + + /* + * fasync_insert_entry() returns the old (update) entry if + * it existed. + * + * So free the (unused) new entry and return 0 to let the + * caller know that we didn't add any new fasync entries. + */ + if (fasync_insert_entry(fd, filp, fapp, new)) { + fasync_free(new); + return 0; + } + + return 1; } /* diff --git a/fs/file_table.c b/fs/file_table.c index a04bdd81c11c..c3dee381f1b4 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -60,7 +60,7 @@ static inline void file_free(struct file *f) /* * Return the total number of open files in the system */ -static int get_nr_files(void) +static long get_nr_files(void) { return percpu_counter_read_positive(&nr_files); } @@ -68,7 +68,7 @@ static int get_nr_files(void) /* * Return the maximum number of open files in the system */ -int get_max_files(void) +unsigned long get_max_files(void) { return files_stat.max_files; } @@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else int proc_nr_files(ctl_table *table, int write, @@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write, struct file *get_empty_filp(void) { const struct cred *cred = current_cred(); - static int old_max; + static long old_max; struct file * f; /* @@ -140,8 +140,7 @@ struct file *get_empty_filp(void) over: /* Ran out of filps - report that */ if (get_nr_files() > old_max) { - printk(KERN_INFO "VFS: file-max limit %d reached\n", - get_max_files()); + pr_info("VFS: file-max limit %lu reached\n", get_max_files()); old_max = get_nr_files(); } goto fail; @@ -487,7 +486,7 @@ retry: void __init files_init(unsigned long mempages) { - int n; + unsigned long n; filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); @@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages) */ n = (mempages * (PAGE_SIZE / 1024)) / 10; - files_stat.max_files = n; - if (files_stat.max_files < NR_FILE) - files_stat.max_files = NR_FILE; + files_stat.max_files = max_t(unsigned long, n, NR_FILE); files_defer_init(); lg_lock_init(files_lglock); percpu_counter_init(&nr_files, 0); diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 79d1b4ea13e7..8c04eac5079d 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -260,6 +260,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip) struct inode *ip = NULL; if ((ip = new_inode(sbp))) { + ip->i_ino = get_next_ino(); vxfs_iinit(ip, vip); ip->i_mapping->a_ops = &vxfs_aops; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ab38fef1c9a1..aed881a76b22 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -79,6 +79,11 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) return sb->s_bdi; } +static inline struct inode *wb_inode(struct list_head *head) +{ + return list_entry(head, struct inode, i_wb_list); +} + static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { @@ -172,11 +177,11 @@ static void redirty_tail(struct inode *inode) if (!list_empty(&wb->b_dirty)) { struct inode *tail; - tail = list_entry(wb->b_dirty.next, struct inode, i_list); + tail = wb_inode(wb->b_dirty.next); if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - list_move(&inode->i_list, &wb->b_dirty); + list_move(&inode->i_wb_list, &wb->b_dirty); } /* @@ -186,7 +191,7 @@ static void requeue_io(struct inode *inode) { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - list_move(&inode->i_list, &wb->b_more_io); + list_move(&inode->i_wb_list, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -227,14 +232,14 @@ static void move_expired_inodes(struct list_head *delaying_queue, int do_sb_sort = 0; while (!list_empty(delaying_queue)) { - inode = list_entry(delaying_queue->prev, struct inode, i_list); + inode = wb_inode(delaying_queue->prev); if (older_than_this && inode_dirtied_after(inode, *older_than_this)) break; if (sb && sb != inode->i_sb) do_sb_sort = 1; sb = inode->i_sb; - list_move(&inode->i_list, &tmp); + list_move(&inode->i_wb_list, &tmp); } /* just one sb in list, splice to dispatch_queue and we're done */ @@ -245,12 +250,11 @@ static void move_expired_inodes(struct list_head *delaying_queue, /* Move inodes from one superblock together */ while (!list_empty(&tmp)) { - inode = list_entry(tmp.prev, struct inode, i_list); - sb = inode->i_sb; + sb = wb_inode(tmp.prev)->i_sb; list_for_each_prev_safe(pos, node, &tmp) { - inode = list_entry(pos, struct inode, i_list); + inode = wb_inode(pos); if (inode->i_sb == sb) - list_move(&inode->i_list, dispatch_queue); + list_move(&inode->i_wb_list, dispatch_queue); } } } @@ -408,16 +412,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * completion. */ redirty_tail(inode); - } else if (atomic_read(&inode->i_count)) { - /* - * The inode is clean, inuse - */ - list_move(&inode->i_list, &inode_in_use); } else { /* - * The inode is clean, unused + * The inode is clean. At this point we either have + * a reference to the inode or it's on it's way out. + * No need to add it back to the LRU. */ - list_move(&inode->i_list, &inode_unused); + list_del_init(&inode->i_wb_list); } } inode_sync_complete(inode); @@ -465,8 +466,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, { while (!list_empty(&wb->b_io)) { long pages_skipped; - struct inode *inode = list_entry(wb->b_io.prev, - struct inode, i_list); + struct inode *inode = wb_inode(wb->b_io.prev); if (inode->i_sb != sb) { if (only_this_sb) { @@ -487,10 +487,16 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, return 0; } - if (inode->i_state & (I_NEW | I_WILL_FREE)) { + /* + * Don't bother with new inodes or inodes beeing freed, first + * kind does not need peridic writeout yet, and for the latter + * kind writeout is handled by the freer. + */ + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { requeue_io(inode); continue; } + /* * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. @@ -498,7 +504,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, if (inode_dirtied_after(inode, wbc->wb_start)) return 1; - BUG_ON(inode->i_state & I_FREEING); __iget(inode); pages_skipped = wbc->pages_skipped; writeback_single_inode(inode, wbc); @@ -536,8 +541,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, queue_io(wb, wbc->older_than_this); while (!list_empty(&wb->b_io)) { - struct inode *inode = list_entry(wb->b_io.prev, - struct inode, i_list); + struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; if (!pin_sb_for_writeback(sb)) { @@ -582,7 +586,7 @@ static inline bool over_bground_thresh(void) global_dirty_limits(&background_thresh, &dirty_thresh); return (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) >= background_thresh); + global_page_state(NR_UNSTABLE_NFS) > background_thresh); } /* @@ -675,8 +679,7 @@ static long wb_writeback(struct bdi_writeback *wb, */ spin_lock(&inode_lock); if (!list_empty(&wb->b_more_io)) { - inode = list_entry(wb->b_more_io.prev, - struct inode, i_list); + inode = wb_inode(wb->b_more_io.prev); trace_wbc_writeback_wait(&wbc, wb->bdi); inode_wait_for_writeback(inode); } @@ -721,9 +724,13 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) return 0; wb->last_old_flush = jiffies; + /* + * Add in the number of potentially dirty inodes, because each inode + * write can dirty pagecache in the underlying blockdev. + */ nr_pages = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); + get_nr_dirty_inodes(); if (nr_pages) { struct wb_writeback_work work = { @@ -790,7 +797,7 @@ int bdi_writeback_thread(void *data) struct backing_dev_info *bdi = wb->bdi; long pages_written; - current->flags |= PF_FLUSHER | PF_SWAPWRITE; + current->flags |= PF_SWAPWRITE; set_freezable(); wb->last_active = jiffies; @@ -962,7 +969,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) * dirty list. Add blockdev inodes as well. */ if (!S_ISBLK(inode->i_mode)) { - if (hlist_unhashed(&inode->i_hash)) + if (inode_unhashed(inode)) goto out; } if (inode->i_state & I_FREEING) @@ -990,7 +997,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) } inode->dirtied_when = jiffies; - list_move(&inode->i_list, &bdi->wb.b_dirty); + list_move(&inode->i_wb_list, &bdi->wb.b_dirty); } } out: @@ -1090,8 +1097,7 @@ void writeback_inodes_sb(struct super_block *sb) WARN_ON(!rwsem_is_locked(&sb->s_umount)); - work.nr_pages = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); + work.nr_pages = nr_dirty + nr_unstable + get_nr_dirty_inodes(); bdi_queue_work(sb->s_bdi, &work); wait_for_completion(&done); @@ -1198,3 +1204,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) return ret; } EXPORT_SYMBOL(sync_inode); + +/** + * sync_inode - write an inode to disk + * @inode: the inode to sync + * @wait: wait for I/O to complete. + * + * Write an inode to disk and adjust it's dirty state after completion. + * + * Note: only writes the actual inode, no associated data or other metadata. + */ +int sync_inode_metadata(struct inode *inode, int wait) +{ + struct writeback_control wbc = { + .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, + .nr_to_write = 0, /* metadata-only */ + }; + + return sync_inode(inode, &wbc); +} +EXPORT_SYMBOL(sync_inode_metadata); diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 7367e177186f..4eba07661e5c 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -222,6 +222,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, if (!inode) return NULL; + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = fc->user_id; inode->i_gid = fc->group_id; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index cde755cca564..b98664275f02 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -809,11 +809,9 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, int err; struct page *page = *pagep; - if (page && zeroing && count < PAGE_SIZE) { - void *mapaddr = kmap_atomic(page, KM_USER1); - memset(mapaddr, 0, PAGE_SIZE); - kunmap_atomic(mapaddr, KM_USER1); - } + if (page && zeroing && count < PAGE_SIZE) + clear_highpage(page); + while (count) { if (cs->write && cs->pipebufs && page) { return fuse_ref_page(cs, page, offset, count); @@ -830,10 +828,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, } } if (page) { - void *mapaddr = kmap_atomic(page, KM_USER1); + void *mapaddr = kmap_atomic(page, KM_USER0); void *buf = mapaddr + offset; offset += fuse_copy_do(cs, &buf, &count); - kunmap_atomic(mapaddr, KM_USER1); + kunmap_atomic(mapaddr, KM_USER0); } else offset += fuse_copy_do(cs, NULL, &count); } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 6b24afb96aae..4f36f8832b9b 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -618,7 +618,6 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, struct gfs2_alloc *al = NULL; pgoff_t index = pos >> PAGE_CACHE_SHIFT; unsigned from = pos & (PAGE_CACHE_SIZE - 1); - unsigned to = from + len; struct page *page; gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); @@ -691,7 +690,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, } prepare_write: - error = block_prepare_write(page, from, to, gfs2_block_map); + error = __block_write_begin(page, from, len, gfs2_block_map); out: if (error == 0) return 0; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index f3b071f921aa..939739c7b3f9 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -55,7 +55,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb * activity, but those code paths have their own higher-level * throttling. */ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else if (!trylock_buffer(bh)) { redirty_page_for_writepage(wbc, page); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index aeafc233dc89..cade1acbcea9 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1219,7 +1219,6 @@ fail_sb: fail_locking: init_locking(sdp, &mount_gh, UNDO); fail_lm: - invalidate_inodes(sb); gfs2_gl_hash_clear(sdp); gfs2_lm_unmount(sdp); fail_sys: diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 0534510200d5..12cbea7502c2 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -255,7 +255,7 @@ out_parent: gfs2_holder_uninit(ghs); gfs2_holder_uninit(ghs + 1); if (!error) { - atomic_inc(&inode->i_count); + ihold(inode); d_instantiate(dentry, inode); mark_inode_dirty(inode); } @@ -1294,7 +1294,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to) int error; if (!page_has_buffers(page)) { - error = block_prepare_write(page, from, to, gfs2_block_map); + error = __block_write_begin(page, from, to - from, gfs2_block_map); if (unlikely(error)) return error; @@ -1313,7 +1313,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to) next += bh->b_size; if (buffer_mapped(bh)) { if (end) { - error = block_prepare_write(page, start, end, + error = __block_write_begin(page, start, end - start, gfs2_block_map); if (unlikely(error)) return error; @@ -1328,7 +1328,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to) } while (next < to); if (end) { - error = block_prepare_write(page, start, end, gfs2_block_map); + error = __block_write_begin(page, start, end - start, gfs2_block_map); if (unlikely(error)) return error; empty_write_end(page, start, end); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 047d1176096c..2b2c4997430b 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -857,7 +857,6 @@ restart: gfs2_clear_rgrpd(sdp); gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ - invalidate_inodes(sdp->sd_vfs); gfs2_gl_hash_clear(sdp); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 4f55651aaa51..c8cffb81e849 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -147,8 +147,6 @@ struct hfs_sb_info { u16 blockoffset; int fs_div; - - struct hlist_head rsrc_inodes; }; #define HFS_FLG_BITMAP_DIRTY 0 @@ -254,17 +252,6 @@ static inline void hfs_bitmap_dirty(struct super_block *sb) sb->s_dirt = 1; } -static inline void hfs_buffer_sync(struct buffer_head *bh) -{ - while (buffer_locked(bh)) { - wait_on_buffer(bh); - } - if (buffer_dirty(bh)) { - ll_rw_block(WRITE, 1, &bh); - wait_on_buffer(bh); - } -} - #define sb_bread512(sb, sec, data) ({ \ struct buffer_head *__bh; \ sector_t __block; \ diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 397b7adc7ce6..dffb4e996643 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -524,7 +524,7 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, HFS_I(inode)->rsrc_inode = dir; HFS_I(dir)->rsrc_inode = inode; igrab(dir); - hlist_add_head(&inode->i_hash, &HFS_SB(dir->i_sb)->rsrc_inodes); + hlist_add_fake(&inode->i_hash); mark_inode_dirty(inode); out: d_add(dentry, inode); diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 86428f5ac991..1563d5ce5764 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -220,7 +220,7 @@ int hfs_mdb_get(struct super_block *sb) mdb->drLsMod = hfs_mtime(); mark_buffer_dirty(HFS_SB(sb)->mdb_bh); - hfs_buffer_sync(HFS_SB(sb)->mdb_bh); + sync_dirty_buffer(HFS_SB(sb)->mdb_bh); } return 0; @@ -287,7 +287,7 @@ void hfs_mdb_commit(struct super_block *sb) HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh); - hfs_buffer_sync(HFS_SB(sb)->alt_mdb_bh); + sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh); } if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) { diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 33254160f650..6ee1586f2334 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -382,7 +382,6 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; sb->s_fs_info = sbi; - INIT_HLIST_HEAD(&sbi->rsrc_inodes); res = -EINVAL; if (!parse_options((char *)data, sbi)) { diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index d236d85ec9d7..e318bbc0daf6 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -286,7 +286,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, inc_nlink(inode); hfsplus_instantiate(dst_dentry, inode, cnid); - atomic_inc(&inode->i_count); + ihold(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); sbi->file_count++; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 78449280dae0..8afd7e84f98d 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -211,7 +211,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent * appear hashed, but do not put on any lists. hlist_del() * will work fine and require no locking. */ - inode->i_hash.pprev = &inode->i_hash.next; + hlist_add_fake(&inode->i_hash); mark_inode_dirty(inode); out: diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 7c232c1487ee..bf15a43016b9 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -91,7 +91,6 @@ extern int rename_file(char *from, char *to); extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, - void *fsid_out, int fsid_size, long *namelen_out, - long *spare_out); + void *fsid_out, int fsid_size, long *namelen_out); #endif diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index f7dc9b5f9ef8..cd7c93917cc7 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -217,7 +217,7 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) err = do_statfs(dentry->d_sb->s_fs_info, &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), - &sf->f_namelen, sf->f_spare); + &sf->f_namelen); if (err) return err; sf->f_blocks = f_blocks; diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 6777aa06ce2c..d51a98384bc0 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -94,8 +94,7 @@ void *open_dir(char *path, int *err_out) dir = opendir(path); *err_out = errno; - if (dir == NULL) - return NULL; + return dir; } @@ -205,7 +204,7 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd) if (attrs->ia_valid & HOSTFS_ATTR_MODE) { if (fd >= 0) { if (fchmod(fd, attrs->ia_mode) != 0) - return (-errno); + return -errno; } else if (chmod(file, attrs->ia_mode) != 0) { return -errno; } @@ -364,8 +363,7 @@ int rename_file(char *from, char *to) int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, - void *fsid_out, int fsid_size, long *namelen_out, - long *spare_out) + void *fsid_out, int fsid_size, long *namelen_out) { struct statfs64 buf; int err; @@ -384,10 +382,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out, sizeof(buf.f_fsid) > fsid_size ? fsid_size : sizeof(buf.f_fsid)); *namelen_out = buf.f_namelen; - spare_out[0] = buf.f_spare[0]; - spare_out[1] = buf.f_spare[1]; - spare_out[2] = buf.f_spare[2]; - spare_out[3] = buf.f_spare[3]; - spare_out[4] = buf.f_spare[4]; + return 0; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 113eba3d3c38..b14be3f781c7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -31,6 +31,7 @@ #include <linux/statfs.h> #include <linux/security.h> #include <linux/magic.h> +#include <linux/migrate.h> #include <asm/uaccess.h> @@ -455,6 +456,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, inode = new_inode(sb); if (inode) { struct hugetlbfs_inode_info *info; + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = uid; inode->i_gid = gid; @@ -573,6 +575,19 @@ static int hugetlbfs_set_page_dirty(struct page *page) return 0; } +static int hugetlbfs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + int rc; + + rc = migrate_huge_page_move_mapping(mapping, newpage, page); + if (rc) + return rc; + migrate_page_copy(newpage, page); + + return 0; +} + static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); @@ -659,6 +674,7 @@ static const struct address_space_operations hugetlbfs_aops = { .write_begin = hugetlbfs_write_begin, .write_end = hugetlbfs_write_end, .set_page_dirty = hugetlbfs_set_page_dirty, + .migratepage = hugetlbfs_migrate_page, }; diff --git a/fs/inode.c b/fs/inode.c index 86464332e590..ae2727ab0c3a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -24,11 +24,11 @@ #include <linux/mount.h> #include <linux/async.h> #include <linux/posix_acl.h> +#include <linux/ima.h> /* * This is needed for the following functions: * - inode_has_buffers - * - invalidate_inode_buffers * - invalidate_bdev * * FIXME: remove all knowledge of the buffer layer from this file @@ -72,8 +72,7 @@ static unsigned int i_hash_shift __read_mostly; * allowing for low-overhead inode sync() operations. */ -LIST_HEAD(inode_in_use); -LIST_HEAD(inode_unused); +static LIST_HEAD(inode_lru); static struct hlist_head *inode_hashtable __read_mostly; /* @@ -103,8 +102,41 @@ static DECLARE_RWSEM(iprune_sem); */ struct inodes_stat_t inodes_stat; +static struct percpu_counter nr_inodes __cacheline_aligned_in_smp; +static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp; + static struct kmem_cache *inode_cachep __read_mostly; +static inline int get_nr_inodes(void) +{ + return percpu_counter_sum_positive(&nr_inodes); +} + +static inline int get_nr_inodes_unused(void) +{ + return percpu_counter_sum_positive(&nr_inodes_unused); +} + +int get_nr_dirty_inodes(void) +{ + int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); + return nr_dirty > 0 ? nr_dirty : 0; + +} + +/* + * Handle nr_inode sysctl + */ +#ifdef CONFIG_SYSCTL +int proc_nr_inodes(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + inodes_stat.nr_inodes = get_nr_inodes(); + inodes_stat.nr_unused = get_nr_inodes_unused(); + return proc_dointvec(table, write, buffer, lenp, ppos); +} +#endif + static void wake_up_inode(struct inode *inode) { /* @@ -192,6 +224,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fsnotify_mask = 0; #endif + percpu_counter_inc(&nr_inodes); + return 0; out: return -ENOMEM; @@ -232,11 +266,13 @@ void __destroy_inode(struct inode *inode) if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) posix_acl_release(inode->i_default_acl); #endif + percpu_counter_dec(&nr_inodes); } EXPORT_SYMBOL(__destroy_inode); -void destroy_inode(struct inode *inode) +static void destroy_inode(struct inode *inode) { + BUG_ON(!list_empty(&inode->i_lru)); __destroy_inode(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); @@ -255,6 +291,8 @@ void inode_init_once(struct inode *inode) INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); + INIT_LIST_HEAD(&inode->i_wb_list); + INIT_LIST_HEAD(&inode->i_lru); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); spin_lock_init(&inode->i_data.tree_lock); spin_lock_init(&inode->i_data.i_mmap_lock); @@ -281,14 +319,109 @@ static void init_once(void *foo) */ void __iget(struct inode *inode) { - if (atomic_inc_return(&inode->i_count) != 1) - return; + atomic_inc(&inode->i_count); +} + +/* + * get additional reference to inode; caller must already hold one. + */ +void ihold(struct inode *inode) +{ + WARN_ON(atomic_inc_return(&inode->i_count) < 2); +} +EXPORT_SYMBOL(ihold); + +static void inode_lru_list_add(struct inode *inode) +{ + if (list_empty(&inode->i_lru)) { + list_add(&inode->i_lru, &inode_lru); + percpu_counter_inc(&nr_inodes_unused); + } +} - if (!(inode->i_state & (I_DIRTY|I_SYNC))) - list_move(&inode->i_list, &inode_in_use); - inodes_stat.nr_unused--; +static void inode_lru_list_del(struct inode *inode) +{ + if (!list_empty(&inode->i_lru)) { + list_del_init(&inode->i_lru); + percpu_counter_dec(&nr_inodes_unused); + } +} + +static inline void __inode_sb_list_add(struct inode *inode) +{ + list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); } +/** + * inode_sb_list_add - add inode to the superblock list of inodes + * @inode: inode to add + */ +void inode_sb_list_add(struct inode *inode) +{ + spin_lock(&inode_lock); + __inode_sb_list_add(inode); + spin_unlock(&inode_lock); +} +EXPORT_SYMBOL_GPL(inode_sb_list_add); + +static inline void __inode_sb_list_del(struct inode *inode) +{ + list_del_init(&inode->i_sb_list); +} + +static unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp; + + tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / + L1_CACHE_BYTES; + tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS); + return tmp & I_HASHMASK; +} + +/** + * __insert_inode_hash - hash an inode + * @inode: unhashed inode + * @hashval: unsigned long value used to locate this object in the + * inode_hashtable. + * + * Add an inode to the inode hash for this superblock. + */ +void __insert_inode_hash(struct inode *inode, unsigned long hashval) +{ + struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); + + spin_lock(&inode_lock); + hlist_add_head(&inode->i_hash, b); + spin_unlock(&inode_lock); +} +EXPORT_SYMBOL(__insert_inode_hash); + +/** + * __remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the superblock. + */ +static void __remove_inode_hash(struct inode *inode) +{ + hlist_del_init(&inode->i_hash); +} + +/** + * remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the superblock. + */ +void remove_inode_hash(struct inode *inode) +{ + spin_lock(&inode_lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&inode_lock); +} +EXPORT_SYMBOL(remove_inode_hash); + void end_writeback(struct inode *inode) { might_sleep(); @@ -327,101 +460,113 @@ static void evict(struct inode *inode) */ static void dispose_list(struct list_head *head) { - int nr_disposed = 0; - while (!list_empty(head)) { struct inode *inode; - inode = list_first_entry(head, struct inode, i_list); - list_del(&inode->i_list); + inode = list_first_entry(head, struct inode, i_lru); + list_del_init(&inode->i_lru); evict(inode); spin_lock(&inode_lock); - hlist_del_init(&inode->i_hash); - list_del_init(&inode->i_sb_list); + __remove_inode_hash(inode); + __inode_sb_list_del(inode); spin_unlock(&inode_lock); wake_up_inode(inode); destroy_inode(inode); - nr_disposed++; } - spin_lock(&inode_lock); - inodes_stat.nr_inodes -= nr_disposed; - spin_unlock(&inode_lock); } -/* - * Invalidate all inodes for a device. +/** + * evict_inodes - evict all evictable inodes for a superblock + * @sb: superblock to operate on + * + * Make sure that no inodes with zero refcount are retained. This is + * called by superblock shutdown after having MS_ACTIVE flag removed, + * so any inode reaching zero refcount during or after that call will + * be immediately evicted. */ -static int invalidate_list(struct list_head *head, struct list_head *dispose) +void evict_inodes(struct super_block *sb) { - struct list_head *next; - int busy = 0, count = 0; - - next = head->next; - for (;;) { - struct list_head *tmp = next; - struct inode *inode; + struct inode *inode, *next; + LIST_HEAD(dispose); - /* - * We can reschedule here without worrying about the list's - * consistency because the per-sb list of inodes must not - * change during umount anymore, and because iprune_sem keeps - * shrink_icache_memory() away. - */ - cond_resched_lock(&inode_lock); + down_write(&iprune_sem); - next = next->next; - if (tmp == head) - break; - inode = list_entry(tmp, struct inode, i_sb_list); - if (inode->i_state & I_NEW) + spin_lock(&inode_lock); + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { + if (atomic_read(&inode->i_count)) continue; - invalidate_inode_buffers(inode); - if (!atomic_read(&inode->i_count)) { - list_move(&inode->i_list, dispose); - WARN_ON(inode->i_state & I_NEW); - inode->i_state |= I_FREEING; - count++; + + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + WARN_ON(1); continue; } - busy = 1; + + inode->i_state |= I_FREEING; + + /* + * Move the inode off the IO lists and LRU once I_FREEING is + * set so that it won't get moved back on there if it is dirty. + */ + list_move(&inode->i_lru, &dispose); + list_del_init(&inode->i_wb_list); + if (!(inode->i_state & (I_DIRTY | I_SYNC))) + percpu_counter_dec(&nr_inodes_unused); } - /* only unused inodes may be cached with i_count zero */ - inodes_stat.nr_unused -= count; - return busy; + spin_unlock(&inode_lock); + + dispose_list(&dispose); + up_write(&iprune_sem); } /** - * invalidate_inodes - discard the inodes on a device - * @sb: superblock + * invalidate_inodes - attempt to free all inodes on a superblock + * @sb: superblock to operate on * - * Discard all of the inodes for a given superblock. If the discard - * fails because there are busy inodes then a non zero value is returned. - * If the discard is successful all the inodes have been discarded. + * Attempts to free all inodes for a given superblock. If there were any + * busy inodes return a non-zero value, else zero. */ int invalidate_inodes(struct super_block *sb) { - int busy; - LIST_HEAD(throw_away); + int busy = 0; + struct inode *inode, *next; + LIST_HEAD(dispose); down_write(&iprune_sem); + spin_lock(&inode_lock); - fsnotify_unmount_inodes(&sb->s_inodes); - busy = invalidate_list(&sb->s_inodes, &throw_away); + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) + continue; + if (atomic_read(&inode->i_count)) { + busy = 1; + continue; + } + + inode->i_state |= I_FREEING; + + /* + * Move the inode off the IO lists and LRU once I_FREEING is + * set so that it won't get moved back on there if it is dirty. + */ + list_move(&inode->i_lru, &dispose); + list_del_init(&inode->i_wb_list); + if (!(inode->i_state & (I_DIRTY | I_SYNC))) + percpu_counter_dec(&nr_inodes_unused); + } spin_unlock(&inode_lock); - dispose_list(&throw_away); + dispose_list(&dispose); up_write(&iprune_sem); return busy; } -EXPORT_SYMBOL(invalidate_inodes); static int can_unuse(struct inode *inode) { - if (inode->i_state) + if (inode->i_state & ~I_REFERENCED) return 0; if (inode_has_buffers(inode)) return 0; @@ -433,22 +578,24 @@ static int can_unuse(struct inode *inode) } /* - * Scan `goal' inodes on the unused list for freeable ones. They are moved to - * a temporary list and then are freed outside inode_lock by dispose_list(). + * Scan `goal' inodes on the unused list for freeable ones. They are moved to a + * temporary list and then are freed outside inode_lock by dispose_list(). * * Any inodes which are pinned purely because of attached pagecache have their - * pagecache removed. We expect the final iput() on that inode to add it to - * the front of the inode_unused list. So look for it there and if the - * inode is still freeable, proceed. The right inode is found 99.9% of the - * time in testing on a 4-way. + * pagecache removed. If the inode has metadata buffers attached to + * mapping->private_list then try to remove them. * - * If the inode has metadata buffers attached to mapping->private_list then - * try to remove them. + * If the inode has the I_REFERENCED flag set, then it means that it has been + * used recently - the flag is set in iput_final(). When we encounter such an + * inode, clear the flag and move it to the back of the LRU so it gets another + * pass through the LRU before it gets reclaimed. This is necessary because of + * the fact we are doing lazy LRU updates to minimise lock contention so the + * LRU does not have strict ordering. Hence we don't want to reclaim inodes + * with this flag set because they are the inodes that are out of order. */ static void prune_icache(int nr_to_scan) { LIST_HEAD(freeable); - int nr_pruned = 0; int nr_scanned; unsigned long reap = 0; @@ -457,13 +604,26 @@ static void prune_icache(int nr_to_scan) for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { struct inode *inode; - if (list_empty(&inode_unused)) + if (list_empty(&inode_lru)) break; - inode = list_entry(inode_unused.prev, struct inode, i_list); + inode = list_entry(inode_lru.prev, struct inode, i_lru); - if (inode->i_state || atomic_read(&inode->i_count)) { - list_move(&inode->i_list, &inode_unused); + /* + * Referenced or dirty inodes are still in use. Give them + * another pass through the LRU as we canot reclaim them now. + */ + if (atomic_read(&inode->i_count) || + (inode->i_state & ~I_REFERENCED)) { + list_del_init(&inode->i_lru); + percpu_counter_dec(&nr_inodes_unused); + continue; + } + + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { + list_move(&inode->i_lru, &inode_lru); + inode->i_state &= ~I_REFERENCED; continue; } if (inode_has_buffers(inode) || inode->i_data.nrpages) { @@ -475,18 +635,23 @@ static void prune_icache(int nr_to_scan) iput(inode); spin_lock(&inode_lock); - if (inode != list_entry(inode_unused.next, - struct inode, i_list)) + if (inode != list_entry(inode_lru.next, + struct inode, i_lru)) continue; /* wrong inode or list_empty */ if (!can_unuse(inode)) continue; } - list_move(&inode->i_list, &freeable); WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; - nr_pruned++; + + /* + * Move the inode off the IO lists and LRU once I_FREEING is + * set so that it won't get moved back on there if it is dirty. + */ + list_move(&inode->i_lru, &freeable); + list_del_init(&inode->i_wb_list); + percpu_counter_dec(&nr_inodes_unused); } - inodes_stat.nr_unused -= nr_pruned; if (current_is_kswapd()) __count_vm_events(KSWAPD_INODESTEAL, reap); else @@ -518,7 +683,7 @@ static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) return -1; prune_icache(nr); } - return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; + return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure; } static struct shrinker icache_shrinker = { @@ -529,9 +694,6 @@ static struct shrinker icache_shrinker = { static void __wait_on_freeing_inode(struct inode *inode); /* * Called with the inode lock held. - * NOTE: we are not increasing the inode-refcount, you must call __iget() - * by hand after calling find_inode now! This simplifies iunique and won't - * add any additional branch in the common code. */ static struct inode *find_inode(struct super_block *sb, struct hlist_head *head, @@ -551,9 +713,10 @@ repeat: __wait_on_freeing_inode(inode); goto repeat; } - break; + __iget(inode); + return inode; } - return node ? inode : NULL; + return NULL; } /* @@ -576,53 +739,49 @@ repeat: __wait_on_freeing_inode(inode); goto repeat; } - break; + __iget(inode); + return inode; } - return node ? inode : NULL; -} - -static unsigned long hash(struct super_block *sb, unsigned long hashval) -{ - unsigned long tmp; - - tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / - L1_CACHE_BYTES; - tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS); - return tmp & I_HASHMASK; -} - -static inline void -__inode_add_to_lists(struct super_block *sb, struct hlist_head *head, - struct inode *inode) -{ - inodes_stat.nr_inodes++; - list_add(&inode->i_list, &inode_in_use); - list_add(&inode->i_sb_list, &sb->s_inodes); - if (head) - hlist_add_head(&inode->i_hash, head); + return NULL; } -/** - * inode_add_to_lists - add a new inode to relevant lists - * @sb: superblock inode belongs to - * @inode: inode to mark in use +/* + * Each cpu owns a range of LAST_INO_BATCH numbers. + * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations, + * to renew the exhausted range. * - * When an inode is allocated it needs to be accounted for, added to the in use - * list, the owning superblock and the inode hash. This needs to be done under - * the inode_lock, so export a function to do this rather than the inode lock - * itself. We calculate the hash list to add to here so it is all internal - * which requires the caller to have already set up the inode number in the - * inode to add. + * This does not significantly increase overflow rate because every CPU can + * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is + * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the + * 2^32 range, and is a worst-case. Even a 50% wastage would only increase + * overflow rate by 2x, which does not seem too significant. + * + * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW + * error if st_ino won't fit in target struct field. Use 32bit counter + * here to attempt to avoid that. */ -void inode_add_to_lists(struct super_block *sb, struct inode *inode) +#define LAST_INO_BATCH 1024 +static DEFINE_PER_CPU(unsigned int, last_ino); + +unsigned int get_next_ino(void) { - struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino); + unsigned int *p = &get_cpu_var(last_ino); + unsigned int res = *p; - spin_lock(&inode_lock); - __inode_add_to_lists(sb, head, inode); - spin_unlock(&inode_lock); +#ifdef CONFIG_SMP + if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { + static atomic_t shared_last_ino; + int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino); + + res = next - LAST_INO_BATCH; + } +#endif + + *p = ++res; + put_cpu_var(last_ino); + return res; } -EXPORT_SYMBOL_GPL(inode_add_to_lists); +EXPORT_SYMBOL(get_next_ino); /** * new_inode - obtain an inode @@ -638,12 +797,6 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists); */ struct inode *new_inode(struct super_block *sb) { - /* - * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW - * error if st_ino won't fit in target struct field. Use 32bit counter - * here to attempt to avoid that. - */ - static unsigned int last_ino; struct inode *inode; spin_lock_prefetch(&inode_lock); @@ -651,8 +804,7 @@ struct inode *new_inode(struct super_block *sb) inode = alloc_inode(sb); if (inode) { spin_lock(&inode_lock); - __inode_add_to_lists(sb, NULL, inode); - inode->i_ino = ++last_ino; + __inode_sb_list_add(inode); inode->i_state = 0; spin_unlock(&inode_lock); } @@ -663,7 +815,7 @@ EXPORT_SYMBOL(new_inode); void unlock_new_inode(struct inode *inode) { #ifdef CONFIG_DEBUG_LOCK_ALLOC - if (inode->i_mode & S_IFDIR) { + if (S_ISDIR(inode->i_mode)) { struct file_system_type *type = inode->i_sb->s_type; /* Set new key only if filesystem hasn't already changed it */ @@ -720,7 +872,8 @@ static struct inode *get_new_inode(struct super_block *sb, if (set(inode, data)) goto set_failed; - __inode_add_to_lists(sb, head, inode); + hlist_add_head(&inode->i_hash, head); + __inode_sb_list_add(inode); inode->i_state = I_NEW; spin_unlock(&inode_lock); @@ -735,7 +888,6 @@ static struct inode *get_new_inode(struct super_block *sb, * us. Use the old inode instead of the one we just * allocated. */ - __iget(old); spin_unlock(&inode_lock); destroy_inode(inode); inode = old; @@ -767,7 +919,8 @@ static struct inode *get_new_inode_fast(struct super_block *sb, old = find_inode_fast(sb, head, ino); if (!old) { inode->i_ino = ino; - __inode_add_to_lists(sb, head, inode); + hlist_add_head(&inode->i_hash, head); + __inode_sb_list_add(inode); inode->i_state = I_NEW; spin_unlock(&inode_lock); @@ -782,7 +935,6 @@ static struct inode *get_new_inode_fast(struct super_block *sb, * us. Use the old inode instead of the one we just * allocated. */ - __iget(old); spin_unlock(&inode_lock); destroy_inode(inode); inode = old; @@ -791,6 +943,27 @@ static struct inode *get_new_inode_fast(struct super_block *sb, return inode; } +/* + * search the inode cache for a matching inode number. + * If we find one, then the inode number we are trying to + * allocate is not unique and so we should not use it. + * + * Returns 1 if the inode number is unique, 0 if it is not. + */ +static int test_inode_iunique(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *b = inode_hashtable + hash(sb, ino); + struct hlist_node *node; + struct inode *inode; + + hlist_for_each_entry(inode, node, b, i_hash) { + if (inode->i_ino == ino && inode->i_sb == sb) + return 0; + } + + return 1; +} + /** * iunique - get a unique inode number * @sb: superblock @@ -812,19 +985,18 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) * error if st_ino won't fit in target struct field. Use 32bit counter * here to attempt to avoid that. */ + static DEFINE_SPINLOCK(iunique_lock); static unsigned int counter; - struct inode *inode; - struct hlist_head *head; ino_t res; spin_lock(&inode_lock); + spin_lock(&iunique_lock); do { if (counter <= max_reserved) counter = max_reserved + 1; res = counter++; - head = inode_hashtable + hash(sb, res); - inode = find_inode_fast(sb, head, res); - } while (inode != NULL); + } while (!test_inode_iunique(sb, res)); + spin_unlock(&iunique_lock); spin_unlock(&inode_lock); return res; @@ -876,7 +1048,6 @@ static struct inode *ifind(struct super_block *sb, spin_lock(&inode_lock); inode = find_inode(sb, head, test, data); if (inode) { - __iget(inode); spin_unlock(&inode_lock); if (likely(wait)) wait_on_inode(inode); @@ -909,7 +1080,6 @@ static struct inode *ifind_fast(struct super_block *sb, spin_lock(&inode_lock); inode = find_inode_fast(sb, head, ino); if (inode) { - __iget(inode); spin_unlock(&inode_lock); wait_on_inode(inode); return inode; @@ -1095,7 +1265,7 @@ int insert_inode_locked(struct inode *inode) __iget(old); spin_unlock(&inode_lock); wait_on_inode(old); - if (unlikely(!hlist_unhashed(&old->i_hash))) { + if (unlikely(!inode_unhashed(old))) { iput(old); return -EBUSY; } @@ -1134,7 +1304,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, __iget(old); spin_unlock(&inode_lock); wait_on_inode(old); - if (unlikely(!hlist_unhashed(&old->i_hash))) { + if (unlikely(!inode_unhashed(old))) { iput(old); return -EBUSY; } @@ -1143,36 +1313,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, } EXPORT_SYMBOL(insert_inode_locked4); -/** - * __insert_inode_hash - hash an inode - * @inode: unhashed inode - * @hashval: unsigned long value used to locate this object in the - * inode_hashtable. - * - * Add an inode to the inode hash for this superblock. - */ -void __insert_inode_hash(struct inode *inode, unsigned long hashval) -{ - struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); - spin_lock(&inode_lock); - hlist_add_head(&inode->i_hash, head); - spin_unlock(&inode_lock); -} -EXPORT_SYMBOL(__insert_inode_hash); - -/** - * remove_inode_hash - remove an inode from the hash - * @inode: inode to unhash - * - * Remove an inode from the superblock. - */ -void remove_inode_hash(struct inode *inode) -{ - spin_lock(&inode_lock); - hlist_del_init(&inode->i_hash); - spin_unlock(&inode_lock); -} -EXPORT_SYMBOL(remove_inode_hash); int generic_delete_inode(struct inode *inode) { @@ -1187,7 +1327,7 @@ EXPORT_SYMBOL(generic_delete_inode); */ int generic_drop_inode(struct inode *inode) { - return !inode->i_nlink || hlist_unhashed(&inode->i_hash); + return !inode->i_nlink || inode_unhashed(inode); } EXPORT_SYMBOL_GPL(generic_drop_inode); @@ -1213,10 +1353,11 @@ static void iput_final(struct inode *inode) drop = generic_drop_inode(inode); if (!drop) { - if (!(inode->i_state & (I_DIRTY|I_SYNC))) - list_move(&inode->i_list, &inode_unused); - inodes_stat.nr_unused++; if (sb->s_flags & MS_ACTIVE) { + inode->i_state |= I_REFERENCED; + if (!(inode->i_state & (I_DIRTY|I_SYNC))) { + inode_lru_list_add(inode); + } spin_unlock(&inode_lock); return; } @@ -1227,19 +1368,23 @@ static void iput_final(struct inode *inode) spin_lock(&inode_lock); WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; - inodes_stat.nr_unused--; - hlist_del_init(&inode->i_hash); + __remove_inode_hash(inode); } - list_del_init(&inode->i_list); - list_del_init(&inode->i_sb_list); + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; - inodes_stat.nr_inodes--; + + /* + * Move the inode off the IO lists and LRU once I_FREEING is + * set so that it won't get moved back on there if it is dirty. + */ + inode_lru_list_del(inode); + list_del_init(&inode->i_wb_list); + + __inode_sb_list_del(inode); spin_unlock(&inode_lock); evict(inode); - spin_lock(&inode_lock); - hlist_del_init(&inode->i_hash); - spin_unlock(&inode_lock); + remove_inode_hash(inode); wake_up_inode(inode); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); destroy_inode(inode); @@ -1503,6 +1648,8 @@ void __init inode_init(void) SLAB_MEM_SPREAD), init_once); register_shrinker(&icache_shrinker); + percpu_counter_init(&nr_inodes, 0); + percpu_counter_init(&nr_inodes_unused, 0); /* Hash may have been set up in inode_init_early */ if (!hashdist) diff --git a/fs/internal.h b/fs/internal.h index a6910e91cee8..ebad3b90752d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -101,3 +101,10 @@ extern void put_super(struct super_block *sb); struct nameidata; extern struct file *nameidata_to_filp(struct nameidata *); extern void release_open_intent(struct nameidata *); + +/* + * inode.c + */ +extern int get_nr_dirty_inodes(void); +extern int evict_inodes(struct super_block *); +extern int invalidate_inodes(struct super_block *); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 09ff41a752a0..60c2b944d762 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -962,25 +962,23 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) * or getblk() if they are not. Returns the number of blocks inserted * (-ve == error.) */ -int isofs_get_blocks(struct inode *inode, sector_t iblock_s, +int isofs_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head **bh, unsigned long nblocks) { - unsigned long b_off; + unsigned long b_off = iblock; unsigned offset, sect_size; unsigned int firstext; unsigned long nextblk, nextoff; - long iblock = (long)iblock_s; int section, rv, error; struct iso_inode_info *ei = ISOFS_I(inode); error = -EIO; rv = 0; - if (iblock < 0 || iblock != iblock_s) { + if (iblock != b_off) { printk(KERN_DEBUG "%s: block number too large\n", __func__); goto abort; } - b_off = iblock; offset = 0; firstext = ei->i_first_extent; @@ -998,8 +996,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s, * I/O errors. */ if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { - printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n", - __func__, iblock, (unsigned long) inode->i_size); + printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n", + __func__, b_off, + (unsigned long long)inode->i_size); goto abort; } @@ -1025,9 +1024,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s, if (++section > 100) { printk(KERN_DEBUG "%s: More than 100 file sections ?!?" " aborting...\n", __func__); - printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u " + printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u " "nextblk=%lu nextoff=%lu\n", __func__, - iblock, firstext, (unsigned) sect_size, + b_off, firstext, (unsigned) sect_size, nextblk, nextoff); goto abort; } diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index ed78a3cf3cb0..79121aa5858b 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -289,7 +289,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de mutex_unlock(&f->sem); d_instantiate(dentry, old_dentry->d_inode); dir_i->i_mtime = dir_i->i_ctime = ITIME(now); - atomic_inc(&old_dentry->d_inode->i_count); + ihold(old_dentry->d_inode); } return ret; } @@ -864,7 +864,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret); /* Might as well let the VFS know */ d_instantiate(new_dentry, old_dentry->d_inode); - atomic_inc(&old_dentry->d_inode->i_count); + ihold(old_dentry->d_inode); new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); return ret; } diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index f8332dc8eeb2..3a09423b6c22 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -497,7 +497,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) * appear hashed, but do not put on any lists. hlist_del() * will work fine and require no locking. */ - ip->i_hash.pprev = &ip->i_hash.next; + hlist_add_fake(&ip->i_hash); return (ip); } diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index d945ea76b445..9466957ec841 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1279,7 +1279,7 @@ int txCommit(tid_t tid, /* transaction identifier */ * lazy commit thread finishes processing */ if (tblk->xflag & COMMIT_DELETE) { - atomic_inc(&tblk->u.ip->i_count); + ihold(tblk->u.ip); /* * Avoid a rare deadlock * diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index a9cf8e8675be..231ca4af9bce 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -839,7 +839,7 @@ static int jfs_link(struct dentry *old_dentry, ip->i_ctime = CURRENT_TIME; dir->i_ctime = dir->i_mtime = CURRENT_TIME; mark_inode_dirty(dir); - atomic_inc(&ip->i_count); + ihold(ip); iplist[0] = ip; iplist[1] = dir; diff --git a/fs/libfs.c b/fs/libfs.c index 62baa0387d6e..304a5132ca27 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -255,7 +255,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; inc_nlink(inode); - atomic_inc(&inode->i_count); + ihold(inode); dget(dentry); d_instantiate(dentry, inode); return 0; @@ -892,10 +892,6 @@ EXPORT_SYMBOL_GPL(generic_fh_to_parent); */ int generic_file_fsync(struct file *file, int datasync) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* metadata-only; caller takes care of data */ - }; struct inode *inode = file->f_mapping->host; int err; int ret; @@ -906,7 +902,7 @@ int generic_file_fsync(struct file *file, int datasync) if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) return ret; - err = sync_inode(inode, &wbc); + err = sync_inode_metadata(inode, 1); if (ret == 0) ret = err; return ret; diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 64fd427c993c..d5bb86866e6c 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -42,6 +42,7 @@ struct nlm_wait { }; static LIST_HEAD(nlm_blocked); +static DEFINE_SPINLOCK(nlm_blocked_lock); /** * nlmclnt_init - Set up per-NFS mount point lockd data structures @@ -97,7 +98,10 @@ struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock * block->b_lock = fl; init_waitqueue_head(&block->b_wait); block->b_status = nlm_lck_blocked; + + spin_lock(&nlm_blocked_lock); list_add(&block->b_list, &nlm_blocked); + spin_unlock(&nlm_blocked_lock); } return block; } @@ -106,7 +110,9 @@ void nlmclnt_finish_block(struct nlm_wait *block) { if (block == NULL) return; + spin_lock(&nlm_blocked_lock); list_del(&block->b_list); + spin_unlock(&nlm_blocked_lock); kfree(block); } @@ -154,6 +160,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) * Look up blocked request based on arguments. * Warning: must not use cookie to match it! */ + spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { struct file_lock *fl_blocked = block->b_lock; @@ -178,6 +185,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) wake_up(&block->b_wait); res = nlm_granted; } + spin_unlock(&nlm_blocked_lock); return res; } @@ -216,10 +224,6 @@ reclaimer(void *ptr) allow_signal(SIGKILL); down_write(&host->h_rwsem); - - /* This one ensures that our parent doesn't terminate while the - * reclaim is in progress */ - lock_kernel(); lockd_up(); /* note: this cannot fail as lockd is already running */ dprintk("lockd: reclaiming locks for host %s\n", host->h_name); @@ -260,16 +264,17 @@ restart: dprintk("NLM: done reclaiming locks for host %s\n", host->h_name); /* Now, wake up all processes that sleep on a blocked lock */ + spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { if (block->b_host == host) { block->b_status = nlm_lck_denied_grace_period; wake_up(&block->b_wait); } } + spin_unlock(&nlm_blocked_lock); /* Release host handle after use */ nlm_release_host(host); lockd_down(); - unlock_kernel(); return 0; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 7932c399fab4..47ea1e1925b8 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -166,7 +166,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) /* Set up the argument struct */ nlmclnt_setlockargs(call, fl); - lock_kernel(); if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { if (fl->fl_type != F_UNLCK) { call->a_args.block = IS_SETLKW(cmd) ? 1 : 0; @@ -177,10 +176,8 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) status = nlmclnt_test(call, fl); else status = -EINVAL; - fl->fl_ops->fl_release_private(fl); fl->fl_ops = NULL; - unlock_kernel(); dprintk("lockd: clnt proc returns %d\n", status); return status; @@ -226,9 +223,7 @@ void nlm_release_call(struct nlm_rqst *call) static void nlmclnt_rpc_release(void *data) { - lock_kernel(); nlm_release_call(data); - unlock_kernel(); } static int nlm_wait_on_grace(wait_queue_head_t *queue) @@ -448,14 +443,18 @@ out: static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl) { + spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock); new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state; new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner); list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted); + spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock); } static void nlmclnt_locks_release_private(struct file_lock *fl) { + spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock); list_del(&fl->fl_u.nfs_fl.list); + spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock); nlm_put_lockowner(fl->fl_u.nfs_fl.owner); } @@ -721,9 +720,7 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) die: return; retry_rebind: - lock_kernel(); nlm_rebind_host(req->a_host); - unlock_kernel(); retry_unlock: rpc_restart_call(task); } @@ -801,9 +798,7 @@ retry_cancel: /* Don't ever retry more than 3 times */ if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) goto die; - lock_kernel(); nlm_rebind_host(req->a_host); - unlock_kernel(); rpc_restart_call(task); rpc_delay(task, 30 * HZ); } diff --git a/fs/lockd/host.c b/fs/lockd/host.c index bb464d12104c..25e21e4023b2 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -353,6 +353,7 @@ nlm_bind_host(struct nlm_host *host) .to_retries = 5U, }; struct rpc_create_args args = { + .net = &init_net, .protocol = host->h_proto, .address = nlm_addr(host), .addrsize = host->h_addrlen, diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index e3015464fbab..e0c918949644 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -69,6 +69,7 @@ static struct rpc_clnt *nsm_create(void) .sin_addr.s_addr = htonl(INADDR_LOOPBACK), }; struct rpc_create_args args = { + .net = &init_net, .protocol = XPRT_TRANSPORT_UDP, .address = (struct sockaddr *)&sin, .addrsize = sizeof(sin), diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index f1bacf1a0391..abfff9d7979d 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -22,7 +22,6 @@ #include <linux/in.h> #include <linux/uio.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/mutex.h> #include <linux/kthread.h> #include <linux/freezer.h> @@ -130,15 +129,6 @@ lockd(void *vrqstp) dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); - /* - * FIXME: it would be nice if lockd didn't spend its entire life - * running under the BKL. At the very least, it would be good to - * have someone clarify what it's intended to protect here. I've - * seen some handwavy posts about posix locking needing to be - * done under the BKL, but it's far from clear. - */ - lock_kernel(); - if (!nlm_timeout) nlm_timeout = LOCKD_DFLT_TIMEO; nlmsvc_timeout = nlm_timeout * HZ; @@ -195,7 +185,6 @@ lockd(void *vrqstp) if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); - unlock_kernel(); return 0; } @@ -206,7 +195,7 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name, xprt = svc_find_xprt(serv, name, family, 0); if (xprt == NULL) - return svc_create_xprt(serv, name, family, port, + return svc_create_xprt(serv, name, &init_net, family, port, SVC_SOCK_DEFAULTS); svc_xprt_put(xprt); return 0; diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 031c6569a134..a336e832475d 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -230,9 +230,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data) static void nlm4svc_callback_release(void *data) { - lock_kernel(); nlm_release_call(data); - unlock_kernel(); } static const struct rpc_call_ops nlm4svc_callback_ops = { diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 84055d31bfc5..c462d346acbd 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -52,12 +52,13 @@ static const struct rpc_call_ops nlmsvc_grant_ops; * The list of blocked locks to retry */ static LIST_HEAD(nlm_blocked); +static DEFINE_SPINLOCK(nlm_blocked_lock); /* * Insert a blocked lock into the global list */ static void -nlmsvc_insert_block(struct nlm_block *block, unsigned long when) +nlmsvc_insert_block_locked(struct nlm_block *block, unsigned long when) { struct nlm_block *b; struct list_head *pos; @@ -87,6 +88,13 @@ nlmsvc_insert_block(struct nlm_block *block, unsigned long when) block->b_when = when; } +static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when) +{ + spin_lock(&nlm_blocked_lock); + nlmsvc_insert_block_locked(block, when); + spin_unlock(&nlm_blocked_lock); +} + /* * Remove a block from the global list */ @@ -94,7 +102,9 @@ static inline void nlmsvc_remove_block(struct nlm_block *block) { if (!list_empty(&block->b_list)) { + spin_lock(&nlm_blocked_lock); list_del_init(&block->b_list); + spin_unlock(&nlm_blocked_lock); nlmsvc_release_block(block); } } @@ -651,7 +661,7 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf, struct nlm_block *block; int rc = -ENOENT; - lock_kernel(); + spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n", @@ -665,13 +675,13 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf, } else if (result == 0) block->b_granted = 1; - nlmsvc_insert_block(block, 0); + nlmsvc_insert_block_locked(block, 0); svc_wake_up(block->b_daemon); rc = 0; break; } } - unlock_kernel(); + spin_unlock(&nlm_blocked_lock); if (rc == -ENOENT) printk(KERN_WARNING "lockd: grant for unknown block\n"); return rc; @@ -690,14 +700,16 @@ nlmsvc_notify_blocked(struct file_lock *fl) struct nlm_block *block; dprintk("lockd: VFS unblock notification for block %p\n", fl); + spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { - nlmsvc_insert_block(block, 0); + nlmsvc_insert_block_locked(block, 0); + spin_unlock(&nlm_blocked_lock); svc_wake_up(block->b_daemon); return; } } - + spin_unlock(&nlm_blocked_lock); printk(KERN_WARNING "lockd: notification for unknown block!\n"); } @@ -803,7 +815,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) dprintk("lockd: GRANT_MSG RPC callback\n"); - lock_kernel(); + spin_lock(&nlm_blocked_lock); /* if the block is not on a list at this point then it has * been invalidated. Don't try to requeue it. * @@ -825,19 +837,20 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) /* Call was successful, now wait for client callback */ timeout = 60 * HZ; } - nlmsvc_insert_block(block, timeout); + nlmsvc_insert_block_locked(block, timeout); svc_wake_up(block->b_daemon); out: - unlock_kernel(); + spin_unlock(&nlm_blocked_lock); } +/* + * FIXME: nlmsvc_release_block() grabs a mutex. This is not allowed for an + * .rpc_release rpc_call_op + */ static void nlmsvc_grant_release(void *data) { struct nlm_rqst *call = data; - - lock_kernel(); nlmsvc_release_block(call->a_block); - unlock_kernel(); } static const struct rpc_call_ops nlmsvc_grant_ops = { diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 0f2ab741ae7c..c3069f38d602 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -260,9 +260,7 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data) static void nlmsvc_callback_release(void *data) { - lock_kernel(); nlm_release_call(data); - unlock_kernel(); } static const struct rpc_call_ops nlmsvc_callback_ops = { diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index d0ef94cfb3da..1ca0679c80bf 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -170,6 +170,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, again: file->f_locks = 0; + lock_flocks(); /* protects i_flock list */ for (fl = inode->i_flock; fl; fl = fl->fl_next) { if (fl->fl_lmops != &nlmsvc_lock_operations) continue; @@ -181,6 +182,7 @@ again: if (match(lockhost, host)) { struct file_lock lock = *fl; + unlock_flocks(); lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; @@ -192,6 +194,7 @@ again: goto again; } } + unlock_flocks(); return 0; } @@ -226,10 +229,14 @@ nlm_file_inuse(struct nlm_file *file) if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) return 1; + lock_flocks(); for (fl = inode->i_flock; fl; fl = fl->fl_next) { - if (fl->fl_lmops == &nlmsvc_lock_operations) + if (fl->fl_lmops == &nlmsvc_lock_operations) { + unlock_flocks(); return 1; + } } + unlock_flocks(); file->f_locks = 0; return 0; } diff --git a/fs/locks.c b/fs/locks.c index 8b2b6ad56a09..50ec15927aab 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -142,6 +142,7 @@ int lease_break_time = 45; static LIST_HEAD(file_lock_list); static LIST_HEAD(blocked_list); +static DEFINE_SPINLOCK(file_lock_lock); /* * Protects the two list heads above, plus the inode->i_flock list @@ -149,23 +150,24 @@ static LIST_HEAD(blocked_list); */ void lock_flocks(void) { - lock_kernel(); + spin_lock(&file_lock_lock); } EXPORT_SYMBOL_GPL(lock_flocks); void unlock_flocks(void) { - unlock_kernel(); + spin_unlock(&file_lock_lock); } EXPORT_SYMBOL_GPL(unlock_flocks); static struct kmem_cache *filelock_cache __read_mostly; /* Allocate an empty lock structure. */ -static struct file_lock *locks_alloc_lock(void) +struct file_lock *locks_alloc_lock(void) { return kmem_cache_alloc(filelock_cache, GFP_KERNEL); } +EXPORT_SYMBOL_GPL(locks_alloc_lock); void locks_release_private(struct file_lock *fl) { @@ -1365,7 +1367,6 @@ int fcntl_getlease(struct file *filp) int generic_setlease(struct file *filp, long arg, struct file_lock **flp) { struct file_lock *fl, **before, **my_before = NULL, *lease; - struct file_lock *new_fl = NULL; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; int error, rdlease_count = 0, wrlease_count = 0; @@ -1385,11 +1386,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) lease = *flp; if (arg != F_UNLCK) { - error = -ENOMEM; - new_fl = locks_alloc_lock(); - if (new_fl == NULL) - goto out; - error = -EAGAIN; if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) goto out; @@ -1434,7 +1430,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) goto out; } - error = 0; if (arg == F_UNLCK) goto out; @@ -1442,15 +1437,11 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) if (!leases_enable) goto out; - locks_copy_lock(new_fl, lease); - locks_insert_lock(before, new_fl); - - *flp = new_fl; + locks_insert_lock(before, lease); return 0; out: - if (new_fl != NULL) - locks_free_lock(new_fl); + locks_free_lock(lease); return error; } EXPORT_SYMBOL(generic_setlease); @@ -1514,26 +1505,38 @@ EXPORT_SYMBOL_GPL(vfs_setlease); */ int fcntl_setlease(unsigned int fd, struct file *filp, long arg) { - struct file_lock fl, *flp = &fl; + struct file_lock *fl; + struct fasync_struct *new; struct inode *inode = filp->f_path.dentry->d_inode; int error; - locks_init_lock(&fl); - error = lease_init(filp, arg, &fl); - if (error) - return error; + fl = lease_alloc(filp, arg); + if (IS_ERR(fl)) + return PTR_ERR(fl); + new = fasync_alloc(); + if (!new) { + locks_free_lock(fl); + return -ENOMEM; + } lock_flocks(); - - error = __vfs_setlease(filp, arg, &flp); + error = __vfs_setlease(filp, arg, &fl); if (error || arg == F_UNLCK) goto out_unlock; - error = fasync_helper(fd, filp, 1, &flp->fl_fasync); + /* + * fasync_insert_entry() returns the old entry if any. + * If there was no old entry, then it used 'new' and + * inserted it into the fasync list. Clear new so that + * we don't release it here. + */ + if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new)) + new = NULL; + if (error < 0) { /* remove lease just inserted by setlease */ - flp->fl_type = F_UNLCK | F_INPROGRESS; - flp->fl_break_time = jiffies - 10; + fl->fl_type = F_UNLCK | F_INPROGRESS; + fl->fl_break_time = jiffies - 10; time_out_leases(inode); goto out_unlock; } @@ -1541,6 +1544,8 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg) error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); out_unlock: unlock_flocks(); + if (new) + fasync_free(new); return error; } @@ -2109,7 +2114,7 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock); #include <linux/seq_file.h> static void lock_get_status(struct seq_file *f, struct file_lock *fl, - int id, char *pfx) + loff_t id, char *pfx) { struct inode *inode = NULL; unsigned int fl_pid; @@ -2122,7 +2127,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, if (fl->fl_file != NULL) inode = fl->fl_file->f_path.dentry->d_inode; - seq_printf(f, "%d:%s ", id, pfx); + seq_printf(f, "%lld:%s ", id, pfx); if (IS_POSIX(fl)) { seq_printf(f, "%6s %s ", (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ", @@ -2185,24 +2190,27 @@ static int locks_show(struct seq_file *f, void *v) fl = list_entry(v, struct file_lock, fl_link); - lock_get_status(f, fl, (long)f->private, ""); + lock_get_status(f, fl, *((loff_t *)f->private), ""); list_for_each_entry(bfl, &fl->fl_block, fl_block) - lock_get_status(f, bfl, (long)f->private, " ->"); + lock_get_status(f, bfl, *((loff_t *)f->private), " ->"); - f->private++; return 0; } static void *locks_start(struct seq_file *f, loff_t *pos) { + loff_t *p = f->private; + lock_flocks(); - f->private = (void *)1; + *p = (*pos + 1); return seq_list_start(&file_lock_list, *pos); } static void *locks_next(struct seq_file *f, void *v, loff_t *pos) { + loff_t *p = f->private; + ++*p; return seq_list_next(v, &file_lock_list, pos); } @@ -2220,14 +2228,14 @@ static const struct seq_operations locks_seq_operations = { static int locks_open(struct inode *inode, struct file *filp) { - return seq_open(filp, &locks_seq_operations); + return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t)); } static const struct file_operations proc_locks_operations = { .open = locks_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; static int __init proc_locks_init(void) diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 1eb4e89e045b..409dfd65e9a1 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -569,7 +569,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir, return -EMLINK; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; - atomic_inc(&inode->i_count); + ihold(inode); inode->i_nlink++; mark_inode_dirty_sync(inode); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index f3f3578393a4..c0d35a3accef 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -101,7 +101,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir, inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + ihold(inode); return add_nondir(dentry, inode); } diff --git a/fs/namei.c b/fs/namei.c index 24896e833565..f7dbc06857ab 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1121,11 +1121,13 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, static struct dentry *__lookup_hash(struct qstr *name, struct dentry *base, struct nameidata *nd) { + struct inode *inode = base->d_inode; struct dentry *dentry; - struct inode *inode; int err; - inode = base->d_inode; + err = exec_permission(inode); + if (err) + return ERR_PTR(err); /* * See if the low-level filesystem might want @@ -1161,11 +1163,6 @@ out: */ static struct dentry *lookup_hash(struct nameidata *nd) { - int err; - - err = exec_permission(nd->path.dentry->d_inode); - if (err) - return ERR_PTR(err); return __lookup_hash(&nd->last, nd->path.dentry, nd); } @@ -1213,9 +1210,6 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) if (err) return ERR_PTR(err); - err = exec_permission(base->d_inode); - if (err) - return ERR_PTR(err); return __lookup_hash(&this, base, NULL); } @@ -2291,7 +2285,7 @@ static long do_unlinkat(int dfd, const char __user *pathname) goto slashes; inode = dentry->d_inode; if (inode) - atomic_inc(&inode->i_count); + ihold(inode); error = mnt_want_write(nd.path.mnt); if (error) goto exit2; diff --git a/fs/namespace.c b/fs/namespace.c index 7ca5182c0bed..8a415c9c5e55 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -595,7 +595,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, goto out_free; } - mnt->mnt_flags = old->mnt_flags; + mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD; atomic_inc(&sb->s_active); mnt->mnt_sb = sb; mnt->mnt_root = dget(root); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index b950415d7c43..ba306658a6db 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -1,7 +1,6 @@ config NFS_FS tristate "NFS client support" depends on INET && FILE_LOCKING - depends on BKL # fix as soon as lockd is done select LOCKD select SUNRPC select NFS_ACL_SUPPORT if NFS_V3_ACL @@ -77,13 +76,17 @@ config NFS_V4 config NFS_V4_1 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" - depends on NFS_V4 && EXPERIMENTAL + depends on NFS_FS && NFS_V4 && EXPERIMENTAL + select PNFS_FILE_LAYOUT help This option enables support for minor version 1 of the NFSv4 protocol - (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. + (RFC 5661) in the kernel's NFS client. If unsure, say N. +config PNFS_FILE_LAYOUT + tristate + config ROOT_NFS bool "Root file system on NFS" depends on NFS_FS=y && IP_PNP @@ -118,3 +121,14 @@ config NFS_USE_KERNEL_DNS select DNS_RESOLVER select KEYS default y + +config NFS_USE_NEW_IDMAPPER + bool "Use the new idmapper upcall routine" + depends on NFS_V4 && KEYS + help + Say Y here if you want NFS to use the new idmapper upcall functions. + You will need /sbin/request-key (usually provided by the keyutils + package). For details, read + <file:Documentation/filesystems/nfs/idmapper.txt>. + + If you are unsure, say N. diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index da7fda639eac..4776ff9e3814 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -15,5 +15,9 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o +nfs-$(CONFIG_NFS_V4_1) += pnfs.o nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o + +obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o +nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index e17b49e2eabd..aeec017fe814 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -109,7 +109,7 @@ nfs4_callback_up(struct svc_serv *serv) { int ret; - ret = svc_create_xprt(serv, "tcp", PF_INET, + ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET, nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret <= 0) goto out_err; @@ -117,7 +117,7 @@ nfs4_callback_up(struct svc_serv *serv) dprintk("NFS: Callback listener port = %u (af %u)\n", nfs_callback_tcpport, PF_INET); - ret = svc_create_xprt(serv, "tcp", PF_INET6, + ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6, nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret > 0) { nfs_callback_tcpport6 = ret; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 930d10fecdaf..2950fca0c61b 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -118,11 +118,11 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n if (delegation == NULL) return 0; - /* seqid is 4-bytes long */ - if (((u32 *) &stateid->data)[0] != 0) + if (stateid->stateid.seqid != 0) return 0; - if (memcmp(&delegation->stateid.data[4], &stateid->data[4], - sizeof(stateid->data)-4)) + if (memcmp(&delegation->stateid.stateid.other, + &stateid->stateid.other, + NFS4_STATEID_OTHER_SIZE)) return 0; return 1; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index e7340729af89..0870d0d4efc0 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -48,6 +48,7 @@ #include "iostat.h" #include "internal.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_CLIENT @@ -155,7 +156,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ cred = rpc_lookup_machine_cred(); if (!IS_ERR(cred)) clp->cl_machine_cred = cred; - +#if defined(CONFIG_NFS_V4_1) + INIT_LIST_HEAD(&clp->cl_layouts); +#endif nfs_fscache_get_client_cookie(clp); return clp; @@ -252,6 +255,7 @@ void nfs_put_client(struct nfs_client *clp) nfs_free_client(clp); } } +EXPORT_SYMBOL_GPL(nfs_put_client); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) /* @@ -601,6 +605,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp, { struct rpc_clnt *clnt = NULL; struct rpc_create_args args = { + .net = &init_net, .protocol = clp->cl_proto, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, @@ -635,7 +640,8 @@ static int nfs_create_rpc_client(struct nfs_client *clp, */ static void nfs_destroy_server(struct nfs_server *server) { - if (!(server->flags & NFS_MOUNT_NONLM)) + if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) || + !(server->flags & NFS_MOUNT_LOCAL_FCNTL)) nlmclnt_done(server->nlm_host); } @@ -657,7 +663,8 @@ static int nfs_start_lockd(struct nfs_server *server) if (nlm_init.nfs_version > 3) return 0; - if (server->flags & NFS_MOUNT_NONLM) + if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) && + (server->flags & NFS_MOUNT_LOCAL_FCNTL)) return 0; switch (clp->cl_proto) { @@ -898,11 +905,13 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + set_pnfs_layoutdriver(server, fsinfo->layouttype); + server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); - if (server->dtsize > PAGE_CACHE_SIZE) - server->dtsize = PAGE_CACHE_SIZE; + if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES) + server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES; if (server->dtsize > server->rsize) server->dtsize = server->rsize; @@ -913,6 +922,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * server->maxfilesize = fsinfo->maxfilesize; + server->time_delta = fsinfo->time_delta; + /* We're airborne Set socket buffersize */ rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); } @@ -935,6 +946,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str } fsinfo.fattr = fattr; + fsinfo.layouttype = 0; error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) goto out_error; @@ -1017,6 +1029,7 @@ void nfs_free_server(struct nfs_server *server) { dprintk("--> nfs_free_server()\n"); + unset_pnfs_layoutdriver(server); spin_lock(&nfs_client_lock); list_del(&server->client_link); list_del(&server->master_link); @@ -1356,8 +1369,9 @@ static int nfs4_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; - server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR| - NFS_CAP_POSIX_LOCK; + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK; + if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) + server->caps |= NFS_CAP_READDIRPLUS; server->options = data->options; /* Get a client record */ diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e257172d438c..07ac3847e562 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -33,11 +33,12 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/sched.h> +#include <linux/vmalloc.h> -#include "nfs4_fs.h" #include "delegation.h" #include "iostat.h" #include "internal.h" +#include "fscache.h" /* #define NFS_DEBUG_VERBOSE 1 */ @@ -55,6 +56,7 @@ static int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); static int nfs_fsync_dir(struct file *, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); +static int nfs_readdir_clear_array(struct page*, gfp_t); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, @@ -80,6 +82,10 @@ const struct inode_operations nfs_dir_inode_operations = { .setattr = nfs_setattr, }; +const struct address_space_operations nfs_dir_addr_space_ops = { + .releasepage = nfs_readdir_clear_array, +}; + #ifdef CONFIG_NFS_V3 const struct inode_operations nfs3_dir_inode_operations = { .create = nfs_create, @@ -104,8 +110,9 @@ const struct inode_operations nfs3_dir_inode_operations = { #ifdef CONFIG_NFS_V4 static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); +static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd); const struct inode_operations nfs4_dir_inode_operations = { - .create = nfs_create, + .create = nfs_open_create, .lookup = nfs_atomic_lookup, .link = nfs_link, .unlink = nfs_unlink, @@ -150,51 +157,197 @@ nfs_opendir(struct inode *inode, struct file *filp) return res; } -typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int); +struct nfs_cache_array_entry { + u64 cookie; + u64 ino; + struct qstr string; +}; + +struct nfs_cache_array { + unsigned int size; + int eof_index; + u64 last_cookie; + struct nfs_cache_array_entry array[0]; +}; + +#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry)) + +typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); typedef struct { struct file *file; struct page *page; unsigned long page_index; - __be32 *ptr; u64 *dir_cookie; loff_t current_index; - struct nfs_entry *entry; decode_dirent_t decode; - int plus; + unsigned long timestamp; unsigned long gencount; - int timestamp_valid; + unsigned int cache_entry_index; + unsigned int plus:1; + unsigned int eof:1; } nfs_readdir_descriptor_t; -/* Now we cache directories properly, by stuffing the dirent - * data directly in the page cache. - * - * Inode invalidation due to refresh etc. takes care of - * _everything_, no sloppy entry flushing logic, no extraneous - * copying, network direct to page cache, the way it was meant - * to be. - * - * NOTE: Dirent information verification is done always by the - * page-in of the RPC reply, nowhere else, this simplies - * things substantially. +/* + * The caller is responsible for calling nfs_readdir_release_array(page) */ static -int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) +struct nfs_cache_array *nfs_readdir_get_array(struct page *page) +{ + if (page == NULL) + return ERR_PTR(-EIO); + return (struct nfs_cache_array *)kmap(page); +} + +static +void nfs_readdir_release_array(struct page *page) +{ + kunmap(page); +} + +/* + * we are freeing strings created by nfs_add_to_readdir_array() + */ +static +int nfs_readdir_clear_array(struct page *page, gfp_t mask) +{ + struct nfs_cache_array *array = nfs_readdir_get_array(page); + int i; + for (i = 0; i < array->size; i++) + kfree(array->array[i].string.name); + nfs_readdir_release_array(page); + return 0; +} + +/* + * the caller is responsible for freeing qstr.name + * when called by nfs_readdir_add_to_array, the strings will be freed in + * nfs_clear_readdir_array() + */ +static +int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len) +{ + string->len = len; + string->name = kmemdup(name, len, GFP_KERNEL); + if (string->name == NULL) + return -ENOMEM; + string->hash = full_name_hash(name, len); + return 0; +} + +static +int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) +{ + struct nfs_cache_array *array = nfs_readdir_get_array(page); + struct nfs_cache_array_entry *cache_entry; + int ret; + + if (IS_ERR(array)) + return PTR_ERR(array); + ret = -EIO; + if (array->size >= MAX_READDIR_ARRAY) + goto out; + + cache_entry = &array->array[array->size]; + cache_entry->cookie = entry->prev_cookie; + cache_entry->ino = entry->ino; + ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); + if (ret) + goto out; + array->last_cookie = entry->cookie; + if (entry->eof == 1) + array->eof_index = array->size; + array->size++; +out: + nfs_readdir_release_array(page); + return ret; +} + +static +int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +{ + loff_t diff = desc->file->f_pos - desc->current_index; + unsigned int index; + + if (diff < 0) + goto out_eof; + if (diff >= array->size) { + if (array->eof_index > 0) + goto out_eof; + desc->current_index += array->size; + return -EAGAIN; + } + + index = (unsigned int)diff; + *desc->dir_cookie = array->array[index].cookie; + desc->cache_entry_index = index; + if (index == array->eof_index) + desc->eof = 1; + return 0; +out_eof: + desc->eof = 1; + return -EBADCOOKIE; +} + +static +int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +{ + int i; + int status = -EAGAIN; + + for (i = 0; i < array->size; i++) { + if (i == array->eof_index) { + desc->eof = 1; + status = -EBADCOOKIE; + } + if (array->array[i].cookie == *desc->dir_cookie) { + desc->cache_entry_index = i; + status = 0; + break; + } + } + + return status; +} + +static +int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) +{ + struct nfs_cache_array *array; + int status = -EBADCOOKIE; + + if (desc->dir_cookie == NULL) + goto out; + + array = nfs_readdir_get_array(desc->page); + if (IS_ERR(array)) { + status = PTR_ERR(array); + goto out; + } + + if (*desc->dir_cookie == 0) + status = nfs_readdir_search_for_pos(array, desc); + else + status = nfs_readdir_search_for_cookie(array, desc); + + nfs_readdir_release_array(desc->page); +out: + return status; +} + +/* Fill a page with xdr information before transferring to the cache page */ +static +int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, + struct nfs_entry *entry, struct file *file, struct inode *inode) { - struct file *file = desc->file; - struct inode *inode = file->f_path.dentry->d_inode; struct rpc_cred *cred = nfs_file_cred(file); unsigned long timestamp, gencount; int error; - dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n", - __func__, (long long)desc->entry->cookie, - page->index); - again: timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); - error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page, + error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ @@ -208,190 +361,292 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) } desc->timestamp = timestamp; desc->gencount = gencount; - desc->timestamp_valid = 1; - SetPageUptodate(page); - /* Ensure consistent page alignment of the data. - * Note: assumes we have exclusive access to this mapping either - * through inode->i_mutex or some other mechanism. - */ - if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { - /* Should never happen */ - nfs_zap_mapping(inode, inode->i_mapping); - } - unlock_page(page); - return 0; - error: - unlock_page(page); - return -EIO; +error: + return error; } -static inline -int dir_decode(nfs_readdir_descriptor_t *desc) +/* Fill in an entry based on the xdr code stored in desc->page */ +static +int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream) { - __be32 *p = desc->ptr; - p = desc->decode(p, desc->entry, desc->plus); + __be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus); if (IS_ERR(p)) return PTR_ERR(p); - desc->ptr = p; - if (desc->timestamp_valid) { - desc->entry->fattr->time_start = desc->timestamp; - desc->entry->fattr->gencount = desc->gencount; - } else - desc->entry->fattr->valid &= ~NFS_ATTR_FATTR; + + entry->fattr->time_start = desc->timestamp; + entry->fattr->gencount = desc->gencount; return 0; } -static inline -void dir_page_release(nfs_readdir_descriptor_t *desc) +static +int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { - kunmap(desc->page); - page_cache_release(desc->page); - desc->page = NULL; - desc->ptr = NULL; + struct nfs_inode *node; + if (dentry->d_inode == NULL) + goto different; + node = NFS_I(dentry->d_inode); + if (node->fh.size != entry->fh->size) + goto different; + if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0) + goto different; + return 1; +different: + return 0; } -/* - * Given a pointer to a buffer that has already been filled by a call - * to readdir, find the next entry with cookie '*desc->dir_cookie'. - * - * If the end of the buffer has been reached, return -EAGAIN, if not, - * return the offset within the buffer of the next entry to be - * read. - */ -static inline -int find_dirent(nfs_readdir_descriptor_t *desc) +static +void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) { - struct nfs_entry *entry = desc->entry; - int loop_count = 0, - status; + struct qstr filename = { + .len = entry->len, + .name = entry->name, + }; + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = parent->d_inode; + struct inode *inode; - while((status = dir_decode(desc)) == 0) { - dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n", - __func__, (unsigned long long)entry->cookie); - if (entry->prev_cookie == *desc->dir_cookie) - break; - if (loop_count++ > 200) { - loop_count = 0; - schedule(); + if (filename.name[0] == '.') { + if (filename.len == 1) + return; + if (filename.len == 2 && filename.name[1] == '.') + return; + } + filename.hash = full_name_hash(filename.name, filename.len); + + dentry = d_lookup(parent, &filename); + if (dentry != NULL) { + if (nfs_same_file(dentry, entry)) { + nfs_refresh_inode(dentry->d_inode, entry->fattr); + goto out; + } else { + d_drop(dentry); + dput(dentry); } } - return status; + + dentry = d_alloc(parent, &filename); + if (dentry == NULL) + return; + + dentry->d_op = NFS_PROTO(dir)->dentry_ops; + inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); + if (IS_ERR(inode)) + goto out; + + alias = d_materialise_unique(dentry, inode); + if (IS_ERR(alias)) + goto out; + else if (alias) { + nfs_set_verifier(alias, nfs_save_change_attribute(dir)); + dput(alias); + } else + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + +out: + dput(dentry); +} + +/* Perform conversion from xdr to cache array */ +static +void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, + void *xdr_page, struct page *page, unsigned int buflen) +{ + struct xdr_stream stream; + struct xdr_buf buf; + __be32 *ptr = xdr_page; + int status; + struct nfs_cache_array *array; + + buf.head->iov_base = xdr_page; + buf.head->iov_len = buflen; + buf.tail->iov_len = 0; + buf.page_base = 0; + buf.page_len = 0; + buf.buflen = buf.head->iov_len; + buf.len = buf.head->iov_len; + + xdr_init_decode(&stream, &buf, ptr); + + + do { + status = xdr_decode(desc, entry, &stream); + if (status != 0) + break; + + if (nfs_readdir_add_to_array(entry, page) == -1) + break; + if (desc->plus == 1) + nfs_prime_dcache(desc->file->f_path.dentry, entry); + } while (!entry->eof); + + if (status == -EBADCOOKIE && entry->eof) { + array = nfs_readdir_get_array(page); + array->eof_index = array->size - 1; + status = 0; + nfs_readdir_release_array(page); + } +} + +static +void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages) +{ + unsigned int i; + for (i = 0; i < npages; i++) + put_page(pages[i]); +} + +static +void nfs_readdir_free_large_page(void *ptr, struct page **pages, + unsigned int npages) +{ + vm_unmap_ram(ptr, npages); + nfs_readdir_free_pagearray(pages, npages); } /* - * Given a pointer to a buffer that has already been filled by a call - * to readdir, find the entry at offset 'desc->file->f_pos'. - * - * If the end of the buffer has been reached, return -EAGAIN, if not, - * return the offset within the buffer of the next entry to be - * read. + * nfs_readdir_large_page will allocate pages that must be freed with a call + * to nfs_readdir_free_large_page */ -static inline -int find_dirent_index(nfs_readdir_descriptor_t *desc) +static +void *nfs_readdir_large_page(struct page **pages, unsigned int npages) { - struct nfs_entry *entry = desc->entry; - int loop_count = 0, - status; + void *ptr; + unsigned int i; + + for (i = 0; i < npages; i++) { + struct page *page = alloc_page(GFP_KERNEL); + if (page == NULL) + goto out_freepages; + pages[i] = page; + } - for(;;) { - status = dir_decode(desc); - if (status) - break; + ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL); + if (!IS_ERR_OR_NULL(ptr)) + return ptr; +out_freepages: + nfs_readdir_free_pagearray(pages, i); + return NULL; +} + +static +int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) +{ + struct page *pages[NFS_MAX_READDIR_PAGES]; + void *pages_ptr = NULL; + struct nfs_entry entry; + struct file *file = desc->file; + struct nfs_cache_array *array; + int status = 0; + unsigned int array_size = ARRAY_SIZE(pages); + + entry.prev_cookie = 0; + entry.cookie = *desc->dir_cookie; + entry.eof = 0; + entry.fh = nfs_alloc_fhandle(); + entry.fattr = nfs_alloc_fattr(); + if (entry.fh == NULL || entry.fattr == NULL) + goto out; - dfprintk(DIRCACHE, "NFS: found cookie %Lu at index %Ld\n", - (unsigned long long)entry->cookie, desc->current_index); + array = nfs_readdir_get_array(page); + memset(array, 0, sizeof(struct nfs_cache_array)); + array->eof_index = -1; - if (desc->file->f_pos == desc->current_index) { - *desc->dir_cookie = entry->cookie; + pages_ptr = nfs_readdir_large_page(pages, array_size); + if (!pages_ptr) + goto out_release_array; + do { + status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); + + if (status < 0) break; - } - desc->current_index++; - if (loop_count++ > 200) { - loop_count = 0; - schedule(); - } - } + nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE); + } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY); + + nfs_readdir_free_large_page(pages_ptr, pages, array_size); +out_release_array: + nfs_readdir_release_array(page); +out: + nfs_free_fattr(entry.fattr); + nfs_free_fhandle(entry.fh); return status; } /* - * Find the given page, and call find_dirent() or find_dirent_index in - * order to try to return the next entry. + * Now we cache directories properly, by converting xdr information + * to an array that can be used for lookups later. This results in + * fewer cache pages, since we can store more information on each page. + * We only need to convert from xdr once so future lookups are much simpler */ -static inline -int find_dirent_page(nfs_readdir_descriptor_t *desc) +static +int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) { struct inode *inode = desc->file->f_path.dentry->d_inode; - struct page *page; - int status; - dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n", - __func__, desc->page_index, - (long long) *desc->dir_cookie); + if (nfs_readdir_xdr_to_array(desc, page, inode) < 0) + goto error; + SetPageUptodate(page); - /* If we find the page in the page_cache, we cannot be sure - * how fresh the data is, so we will ignore readdir_plus attributes. - */ - desc->timestamp_valid = 0; - page = read_cache_page(inode->i_mapping, desc->page_index, - (filler_t *)nfs_readdir_filler, desc); - if (IS_ERR(page)) { - status = PTR_ERR(page); - goto out; + if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { + /* Should never happen */ + nfs_zap_mapping(inode, inode->i_mapping); } + unlock_page(page); + return 0; + error: + unlock_page(page); + return -EIO; +} - /* NOTE: Someone else may have changed the READDIRPLUS flag */ - desc->page = page; - desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ - if (*desc->dir_cookie != 0) - status = find_dirent(desc); - else - status = find_dirent_index(desc); - if (status < 0) - dir_page_release(desc); - out: - dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); - return status; +static +void cache_page_release(nfs_readdir_descriptor_t *desc) +{ + page_cache_release(desc->page); + desc->page = NULL; +} + +static +struct page *get_cache_page(nfs_readdir_descriptor_t *desc) +{ + struct page *page; + page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, + desc->page_index, (filler_t *)nfs_readdir_filler, desc); + if (IS_ERR(page)) + desc->eof = 1; + return page; } /* - * Recurse through the page cache pages, and return a - * filled nfs_entry structure of the next directory entry if possible. - * - * The target for the search is '*desc->dir_cookie' if non-0, - * 'desc->file->f_pos' otherwise + * Returns 0 if desc->dir_cookie was found on page desc->page_index */ +static +int find_cache_page(nfs_readdir_descriptor_t *desc) +{ + int res; + + desc->page = get_cache_page(desc); + if (IS_ERR(desc->page)) + return PTR_ERR(desc->page); + + res = nfs_readdir_search_array(desc); + if (res == 0) + return 0; + cache_page_release(desc); + return res; +} + +/* Search for desc->dir_cookie from the beginning of the page cache */ static inline int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) { - int loop_count = 0; - int res; - - /* Always search-by-index from the beginning of the cache */ - if (*desc->dir_cookie == 0) { - dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for offset %Ld\n", - (long long)desc->file->f_pos); - desc->page_index = 0; - desc->entry->cookie = desc->entry->prev_cookie = 0; - desc->entry->eof = 0; - desc->current_index = 0; - } else - dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for cookie %Lu\n", - (unsigned long long)*desc->dir_cookie); + int res = -EAGAIN; - for (;;) { - res = find_dirent_page(desc); + while (1) { + res = find_cache_page(desc); if (res != -EAGAIN) break; - /* Align to beginning of next page */ - desc->page_index ++; - if (loop_count++ > 200) { - loop_count = 0; - schedule(); - } + desc->page_index++; } - - dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, res); return res; } @@ -400,8 +655,6 @@ static inline unsigned int dt_type(struct inode *inode) return (inode->i_mode >> 12) & 15; } -static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc); - /* * Once we've found the start of the dirent within a page: fill 'er up... */ @@ -410,49 +663,36 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, filldir_t filldir) { struct file *file = desc->file; - struct nfs_entry *entry = desc->entry; - struct dentry *dentry = NULL; - u64 fileid; - int loop_count = 0, - res; - - dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n", - (unsigned long long)entry->cookie); - - for(;;) { - unsigned d_type = DT_UNKNOWN; - /* Note: entry->prev_cookie contains the cookie for - * retrieving the current dirent on the server */ - fileid = entry->ino; - - /* Get a dentry if we have one */ - if (dentry != NULL) - dput(dentry); - dentry = nfs_readdir_lookup(desc); + int i = 0; + int res = 0; + struct nfs_cache_array *array = NULL; + unsigned int d_type = DT_UNKNOWN; + struct dentry *dentry = NULL; - /* Use readdirplus info */ - if (dentry != NULL && dentry->d_inode != NULL) { - d_type = dt_type(dentry->d_inode); - fileid = NFS_FILEID(dentry->d_inode); - } + array = nfs_readdir_get_array(desc->page); - res = filldir(dirent, entry->name, entry->len, - file->f_pos, nfs_compat_user_ino64(fileid), - d_type); + for (i = desc->cache_entry_index; i < array->size; i++) { + d_type = DT_UNKNOWN; + + res = filldir(dirent, array->array[i].string.name, + array->array[i].string.len, file->f_pos, + nfs_compat_user_ino64(array->array[i].ino), d_type); if (res < 0) break; file->f_pos++; - *desc->dir_cookie = entry->cookie; - if (dir_decode(desc) != 0) { - desc->page_index ++; + desc->cache_entry_index = i; + if (i < (array->size-1)) + *desc->dir_cookie = array->array[i+1].cookie; + else + *desc->dir_cookie = array->last_cookie; + if (i == array->eof_index) { + desc->eof = 1; break; } - if (loop_count++ > 200) { - loop_count = 0; - schedule(); - } } - dir_page_release(desc); + + nfs_readdir_release_array(desc->page); + cache_page_release(desc); if (dentry != NULL) dput(dentry); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", @@ -476,12 +716,9 @@ static inline int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, filldir_t filldir) { - struct file *file = desc->file; - struct inode *inode = file->f_path.dentry->d_inode; - struct rpc_cred *cred = nfs_file_cred(file); struct page *page = NULL; int status; - unsigned long timestamp, gencount; + struct inode *inode = desc->file->f_path.dentry->d_inode; dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie); @@ -491,38 +728,22 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, status = -ENOMEM; goto out; } - timestamp = jiffies; - gencount = nfs_inc_attr_generation_counter(); - status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, - *desc->dir_cookie, page, - NFS_SERVER(inode)->dtsize, - desc->plus); - desc->page = page; - desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ - if (status >= 0) { - desc->timestamp = timestamp; - desc->gencount = gencount; - desc->timestamp_valid = 1; - if ((status = dir_decode(desc)) == 0) - desc->entry->prev_cookie = *desc->dir_cookie; - } else + + if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) { status = -EIO; - if (status < 0) goto out_release; + } + desc->page_index = 0; + desc->page = page; status = nfs_do_filldir(desc, dirent, filldir); - /* Reset read descriptor so it searches the page cache from - * the start upon the next call to readdir_search_pagecache() */ - desc->page_index = 0; - desc->entry->cookie = desc->entry->prev_cookie = 0; - desc->entry->eof = 0; out: dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); return status; out_release: - dir_page_release(desc); + cache_page_release(desc); goto out; } @@ -536,7 +757,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct inode *inode = dentry->d_inode; nfs_readdir_descriptor_t my_desc, *desc = &my_desc; - struct nfs_entry my_entry; int res = -ENOMEM; dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", @@ -557,26 +777,17 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) desc->decode = NFS_PROTO(inode)->decode_dirent; desc->plus = NFS_USE_READDIRPLUS(inode); - my_entry.cookie = my_entry.prev_cookie = 0; - my_entry.eof = 0; - my_entry.fh = nfs_alloc_fhandle(); - my_entry.fattr = nfs_alloc_fattr(); - if (my_entry.fh == NULL || my_entry.fattr == NULL) - goto out_alloc_failed; - - desc->entry = &my_entry; - nfs_block_sillyrename(dentry); res = nfs_revalidate_mapping(inode, filp->f_mapping); if (res < 0) goto out; - while(!desc->entry->eof) { + while (desc->eof != 1) { res = readdir_search_pagecache(desc); if (res == -EBADCOOKIE) { /* This means either end of directory */ - if (*desc->dir_cookie && desc->entry->cookie != *desc->dir_cookie) { + if (*desc->dir_cookie && desc->eof == 0) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc, dirent, filldir); if (res >= 0) @@ -588,8 +799,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (res == -ETOOSMALL && desc->plus) { clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); nfs_zap_caches(inode); + desc->page_index = 0; desc->plus = 0; - desc->entry->eof = 0; + desc->eof = 0; continue; } if (res < 0) @@ -605,9 +817,6 @@ out: nfs_unblock_sillyrename(dentry); if (res > 0) res = 0; -out_alloc_failed: - nfs_free_fattr(my_entry.fattr); - nfs_free_fhandle(my_entry.fh); dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, res); @@ -1029,10 +1238,63 @@ static int is_atomic_open(struct nameidata *nd) return 1; } +static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd) +{ + struct path path = { + .mnt = nd->path.mnt, + .dentry = dentry, + }; + struct nfs_open_context *ctx; + struct rpc_cred *cred; + fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); + + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return ERR_CAST(cred); + ctx = alloc_nfs_open_context(&path, cred, fmode); + put_rpccred(cred); + if (ctx == NULL) + return ERR_PTR(-ENOMEM); + return ctx; +} + +static int do_open(struct inode *inode, struct file *filp) +{ + nfs_fscache_set_inode_cookie(inode, filp); + return 0; +} + +static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx) +{ + struct file *filp; + int ret = 0; + + /* If the open_intent is for execute, we have an extra check to make */ + if (ctx->mode & FMODE_EXEC) { + ret = nfs_may_open(ctx->path.dentry->d_inode, + ctx->cred, + nd->intent.open.flags); + if (ret < 0) + goto out; + } + filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open); + if (IS_ERR(filp)) + ret = PTR_ERR(filp); + else + nfs_file_set_open_context(filp, ctx); +out: + put_nfs_open_context(ctx); + return ret; +} + static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { + struct nfs_open_context *ctx; + struct iattr attr; struct dentry *res = NULL; - int error; + struct inode *inode; + int open_flags; + int err; dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); @@ -1054,13 +1316,32 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry goto out; } + ctx = nameidata_to_nfs_open_context(dentry, nd); + res = ERR_CAST(ctx); + if (IS_ERR(ctx)) + goto out; + + open_flags = nd->intent.open.flags; + if (nd->flags & LOOKUP_CREATE) { + attr.ia_mode = nd->intent.open.create_mode; + attr.ia_valid = ATTR_MODE; + if (!IS_POSIXACL(dir)) + attr.ia_mode &= ~current_umask(); + } else { + open_flags &= ~(O_EXCL | O_CREAT); + attr.ia_valid = 0; + } + /* Open the file on the server */ - res = nfs4_atomic_open(dir, dentry, nd); - if (IS_ERR(res)) { - error = PTR_ERR(res); - switch (error) { + nfs_block_sillyrename(dentry->d_parent); + inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); + if (IS_ERR(inode)) { + nfs_unblock_sillyrename(dentry->d_parent); + put_nfs_open_context(ctx); + switch (PTR_ERR(inode)) { /* Make a negative dentry */ case -ENOENT: + d_add(dentry, NULL); res = NULL; goto out; /* This turned out not to be a regular file */ @@ -1072,11 +1353,25 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry goto no_open; /* case -EINVAL: */ default: + res = ERR_CAST(inode); goto out; } - } else if (res != NULL) + } + res = d_add_unique(dentry, inode); + nfs_unblock_sillyrename(dentry->d_parent); + if (res != NULL) { + dput(ctx->path.dentry); + ctx->path.dentry = dget(res); dentry = res; + } + err = nfs_intent_set_file(nd, ctx); + if (err < 0) { + if (res != NULL) + dput(res); + return ERR_PTR(err); + } out: + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); return res; no_open: return nfs_lookup(dir, dentry, nd); @@ -1087,12 +1382,15 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) struct dentry *parent = NULL; struct inode *inode = dentry->d_inode; struct inode *dir; + struct nfs_open_context *ctx; int openflags, ret = 0; if (!is_atomic_open(nd) || d_mountpoint(dentry)) goto no_open; + parent = dget_parent(dentry); dir = parent->d_inode; + /* We can't create new files in nfs_open_revalidate(), so we * optimize away revalidation of negative dentries. */ @@ -1112,99 +1410,96 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) /* We can't create new files, or truncate existing ones here */ openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); + ctx = nameidata_to_nfs_open_context(dentry, nd); + ret = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out; /* * Note: we're not holding inode->i_mutex and so may be racing with * operations that change the directory. We therefore save the * change attribute *before* we do the RPC call. */ - ret = nfs4_open_revalidate(dir, dentry, openflags, nd); + inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + switch (ret) { + case -EPERM: + case -EACCES: + case -EDQUOT: + case -ENOSPC: + case -EROFS: + goto out_put_ctx; + default: + goto out_drop; + } + } + iput(inode); + if (inode != dentry->d_inode) + goto out_drop; + + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + ret = nfs_intent_set_file(nd, ctx); + if (ret >= 0) + ret = 1; out: dput(parent); - if (!ret) - d_drop(dentry); return ret; +out_drop: + d_drop(dentry); + ret = 0; +out_put_ctx: + put_nfs_open_context(ctx); + goto out; + no_open_dput: dput(parent); no_open: return nfs_lookup_revalidate(dentry, nd); } -#endif /* CONFIG_NFSV4 */ -static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) +static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) { - struct dentry *parent = desc->file->f_path.dentry; - struct inode *dir = parent->d_inode; - struct nfs_entry *entry = desc->entry; - struct dentry *dentry, *alias; - struct qstr name = { - .name = entry->name, - .len = entry->len, - }; - struct inode *inode; - unsigned long verf = nfs_save_change_attribute(dir); + struct nfs_open_context *ctx = NULL; + struct iattr attr; + int error; + int open_flags = 0; - switch (name.len) { - case 2: - if (name.name[0] == '.' && name.name[1] == '.') - return dget_parent(parent); - break; - case 1: - if (name.name[0] == '.') - return dget(parent); - } + dfprintk(VFS, "NFS: create(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); - spin_lock(&dir->i_lock); - if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) { - spin_unlock(&dir->i_lock); - return NULL; - } - spin_unlock(&dir->i_lock); + attr.ia_mode = mode; + attr.ia_valid = ATTR_MODE; - name.hash = full_name_hash(name.name, name.len); - dentry = d_lookup(parent, &name); - if (dentry != NULL) { - /* Is this a positive dentry that matches the readdir info? */ - if (dentry->d_inode != NULL && - (NFS_FILEID(dentry->d_inode) == entry->ino || - d_mountpoint(dentry))) { - if (!desc->plus || entry->fh->size == 0) - return dentry; - if (nfs_compare_fh(NFS_FH(dentry->d_inode), - entry->fh) == 0) - goto out_renew; - } - /* No, so d_drop to allow one to be created */ - d_drop(dentry); - dput(dentry); - } - if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR)) - return NULL; - if (name.len > NFS_SERVER(dir)->namelen) - return NULL; - /* Note: caller is already holding the dir->i_mutex! */ - dentry = d_alloc(parent, &name); - if (dentry == NULL) - return NULL; - dentry->d_op = NFS_PROTO(dir)->dentry_ops; - inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); - if (IS_ERR(inode)) { - dput(dentry); - return NULL; - } + if ((nd->flags & LOOKUP_CREATE) != 0) { + open_flags = nd->intent.open.flags; - alias = d_materialise_unique(dentry, inode); - if (alias != NULL) { - dput(dentry); - if (IS_ERR(alias)) - return NULL; - dentry = alias; + ctx = nameidata_to_nfs_open_context(dentry, nd); + error = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out_err_drop; } -out_renew: - nfs_set_verifier(dentry, verf); - return dentry; + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx); + if (error != 0) + goto out_put_ctx; + if (ctx != NULL) { + error = nfs_intent_set_file(nd, ctx); + if (error < 0) + goto out_err; + } + return 0; +out_put_ctx: + if (ctx != NULL) + put_nfs_open_context(ctx); +out_err_drop: + d_drop(dentry); +out_err: + return error; } +#endif /* CONFIG_NFSV4 */ + /* * Code common to create, mkdir, and mknod. */ @@ -1258,7 +1553,6 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, { struct iattr attr; int error; - int open_flags = 0; dfprintk(VFS, "NFS: create(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); @@ -1266,10 +1560,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; - if ((nd->flags & LOOKUP_CREATE) != 0) - open_flags = nd->intent.open.flags; - - error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd); + error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL); if (error != 0) goto out_err; return 0; @@ -1351,76 +1642,6 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry) return error; } -static int nfs_sillyrename(struct inode *dir, struct dentry *dentry) -{ - static unsigned int sillycounter; - const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2; - const int countersize = sizeof(sillycounter)*2; - const int slen = sizeof(".nfs")+fileidsize+countersize-1; - char silly[slen+1]; - struct qstr qsilly; - struct dentry *sdentry; - int error = -EIO; - - dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", - dentry->d_parent->d_name.name, dentry->d_name.name, - atomic_read(&dentry->d_count)); - nfs_inc_stats(dir, NFSIOS_SILLYRENAME); - - /* - * We don't allow a dentry to be silly-renamed twice. - */ - error = -EBUSY; - if (dentry->d_flags & DCACHE_NFSFS_RENAMED) - goto out; - - sprintf(silly, ".nfs%*.*Lx", - fileidsize, fileidsize, - (unsigned long long)NFS_FILEID(dentry->d_inode)); - - /* Return delegation in anticipation of the rename */ - nfs_inode_return_delegation(dentry->d_inode); - - sdentry = NULL; - do { - char *suffix = silly + slen - countersize; - - dput(sdentry); - sillycounter++; - sprintf(suffix, "%*.*x", countersize, countersize, sillycounter); - - dfprintk(VFS, "NFS: trying to rename %s to %s\n", - dentry->d_name.name, silly); - - sdentry = lookup_one_len(silly, dentry->d_parent, slen); - /* - * N.B. Better to return EBUSY here ... it could be - * dangerous to delete the file while it's in use. - */ - if (IS_ERR(sdentry)) - goto out; - } while(sdentry->d_inode != NULL); /* need negative lookup */ - - qsilly.name = silly; - qsilly.len = strlen(silly); - if (dentry->d_inode) { - error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, - dir, &qsilly); - nfs_mark_for_revalidate(dentry->d_inode); - } else - error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, - dir, &qsilly); - if (!error) { - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - d_move(dentry, sdentry); - error = nfs_async_unlink(dir, dentry); - /* If we return 0 we don't unlink */ - } - dput(sdentry); -out: - return error; -} - /* * Remove a file after making sure there are no pending writes, * and after checking that the file has only one user. @@ -1580,7 +1801,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) d_drop(dentry); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { - atomic_inc(&inode->i_count); + ihold(inode); d_add(dentry, inode); } return error; @@ -1711,14 +1932,14 @@ static void nfs_access_free_list(struct list_head *head) int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(head); - struct nfs_inode *nfsi; + struct nfs_inode *nfsi, *next; struct nfs_access_entry *cache; if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) return (nr_to_scan == 0) ? 0 : -1; spin_lock(&nfs_access_lru_lock); - list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { + list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { struct inode *inode; if (nr_to_scan-- == 0) diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index dba50a5625db..a6e711ad130f 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -167,7 +167,7 @@ static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd, return 0; } item = container_of(h, struct nfs_dns_ent, h); - ttl = (long)item->h.expiry_time - (long)get_seconds(); + ttl = item->h.expiry_time - seconds_since_boot(); if (ttl < 0) ttl = 0; @@ -239,7 +239,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) ttl = get_expiry(&buf); if (ttl == 0) goto out; - key.h.expiry_time = ttl + get_seconds(); + key.h.expiry_time = ttl + seconds_since_boot(); ret = -ENOMEM; item = nfs_dns_lookup(cd, &key); @@ -301,7 +301,7 @@ static int do_cache_lookup_nowait(struct cache_detail *cd, goto out_err; ret = -ETIMEDOUT; if (!test_bit(CACHE_VALID, &(*item)->h.flags) - || (*item)->h.expiry_time < get_seconds() + || (*item)->h.expiry_time < seconds_since_boot() || cd->flush_time > (*item)->h.last_refresh) goto out_put; ret = -ENOENT; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 05bf3c0dc751..e756075637b0 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -36,6 +36,7 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_FILE @@ -386,6 +387,10 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); + pnfs_update_layout(mapping->host, + nfs_file_open_context(file), + IOMODE_RW); + start: /* * Prevent starvation issues if someone is doing a consistency @@ -551,7 +556,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct file *filp = vma->vm_file; struct dentry *dentry = filp->f_path.dentry; unsigned pagelen; - int ret = -EINVAL; + int ret = VM_FAULT_NOPAGE; struct address_space *mapping; dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", @@ -567,21 +572,20 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (mapping != dentry->d_inode->i_mapping) goto out_unlock; - ret = 0; pagelen = nfs_page_length(page); if (pagelen == 0) goto out_unlock; - ret = nfs_flush_incompatible(filp, page); - if (ret != 0) - goto out_unlock; + ret = VM_FAULT_LOCKED; + if (nfs_flush_incompatible(filp, page) == 0 && + nfs_updatepage(filp, page, 0, pagelen) == 0) + goto out; - ret = nfs_updatepage(filp, page, 0, pagelen); + ret = VM_FAULT_SIGBUS; out_unlock: - if (!ret) - return VM_FAULT_LOCKED; unlock_page(page); - return VM_FAULT_SIGBUS; +out: + return ret; } static const struct vm_operations_struct nfs_file_vm_ops = { @@ -684,7 +688,8 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, return ret; } -static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) +static int +do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; int status = 0; @@ -699,7 +704,7 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) if (nfs_have_delegation(inode, FMODE_READ)) goto out_noconflict; - if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) + if (is_local) goto out_noconflict; status = NFS_PROTO(inode)->lock(filp, cmd, fl); @@ -726,7 +731,8 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl) return res; } -static int do_unlk(struct file *filp, int cmd, struct file_lock *fl) +static int +do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; int status; @@ -741,15 +747,24 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl) * If we're signalled while cleaning up locks on process exit, we * still need to complete the unlock. */ - /* Use local locking if mounted with "-onolock" */ - if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) + /* + * Use local locking if mounted with "-onolock" or with appropriate + * "-olocal_lock=" + */ + if (!is_local) status = NFS_PROTO(inode)->lock(filp, cmd, fl); else status = do_vfs_lock(filp, fl); return status; } -static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) +static int +is_time_granular(struct timespec *ts) { + return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000)); +} + +static int +do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; int status; @@ -762,20 +777,31 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) if (status != 0) goto out; - /* Use local locking if mounted with "-onolock" */ - if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) + /* + * Use local locking if mounted with "-onolock" or with appropriate + * "-olocal_lock=" + */ + if (!is_local) status = NFS_PROTO(inode)->lock(filp, cmd, fl); else status = do_vfs_lock(filp, fl); if (status < 0) goto out; + /* - * Make sure we clear the cache whenever we try to get the lock. + * Revalidate the cache if the server has time stamps granular + * enough to detect subsecond changes. Otherwise, clear the + * cache to prevent missing any changes. + * * This makes locking act as a cache coherency point. */ nfs_sync_mapping(filp->f_mapping); - if (!nfs_have_delegation(inode, FMODE_READ)) - nfs_zap_caches(inode); + if (!nfs_have_delegation(inode, FMODE_READ)) { + if (is_time_granular(&NFS_SERVER(inode)->time_delta)) + __nfs_revalidate_inode(NFS_SERVER(inode), inode); + else + nfs_zap_caches(inode); + } out: return status; } @@ -787,6 +813,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = filp->f_mapping->host; int ret = -ENOLCK; + int is_local = 0; dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", filp->f_path.dentry->d_parent->d_name.name, @@ -800,6 +827,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) goto out_err; + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) + is_local = 1; + if (NFS_PROTO(inode)->lock_check_bounds != NULL) { ret = NFS_PROTO(inode)->lock_check_bounds(fl); if (ret < 0) @@ -807,11 +837,11 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) } if (IS_GETLK(cmd)) - ret = do_getlk(filp, cmd, fl); + ret = do_getlk(filp, cmd, fl, is_local); else if (fl->fl_type == F_UNLCK) - ret = do_unlk(filp, cmd, fl); + ret = do_unlk(filp, cmd, fl, is_local); else - ret = do_setlk(filp, cmd, fl); + ret = do_setlk(filp, cmd, fl, is_local); out_err: return ret; } @@ -821,6 +851,9 @@ out_err: */ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) { + struct inode *inode = filp->f_mapping->host; + int is_local = 0; + dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", filp->f_path.dentry->d_parent->d_name.name, filp->f_path.dentry->d_name.name, @@ -829,14 +862,17 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) + is_local = 1; + /* We're simulating flock() locks using posix locks on the server */ fl->fl_owner = (fl_owner_t)filp; fl->fl_start = 0; fl->fl_end = OFFSET_MAX; if (fl->fl_type == F_UNLCK) - return do_unlk(filp, cmd, fl); - return do_setlk(filp, cmd, fl); + return do_unlk(filp, cmd, fl, is_local); + return do_setlk(filp, cmd, fl, is_local); } /* diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index a70e446e1605..ac7b814ce162 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -54,8 +54,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i iput(inode); return -ENOMEM; } - /* Circumvent igrab(): we know the inode is not being freed */ - atomic_inc(&inode->i_count); + ihold(inode); /* * Ensure that this dentry is invisible to d_find_alias(). * Otherwise, it may be spliced into the tree by diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 21a84d45916f..dec47ed8b6b9 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -34,6 +34,212 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#ifdef CONFIG_NFS_USE_NEW_IDMAPPER + +#include <linux/slab.h> +#include <linux/cred.h> +#include <linux/nfs_idmap.h> +#include <linux/keyctl.h> +#include <linux/key-type.h> +#include <linux/rcupdate.h> +#include <linux/kernel.h> +#include <linux/err.h> + +#include <keys/user-type.h> + +#define NFS_UINT_MAXLEN 11 + +const struct cred *id_resolver_cache; + +struct key_type key_type_id_resolver = { + .name = "id_resolver", + .instantiate = user_instantiate, + .match = user_match, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = user_describe, + .read = user_read, +}; + +int nfs_idmap_init(void) +{ + struct cred *cred; + struct key *keyring; + int ret = 0; + + printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name); + + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + + ret = register_key_type(&key_type_id_resolver); + if (ret < 0) + goto failed_put_key; + + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + id_resolver_cache = cred; + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +void nfs_idmap_quit(void) +{ + key_revoke(id_resolver_cache->thread_keyring); + unregister_key_type(&key_type_id_resolver); + put_cred(id_resolver_cache); +} + +/* + * Assemble the description to pass to request_key() + * This function will allocate a new string and update dest to point + * at it. The caller is responsible for freeing dest. + * + * On error 0 is returned. Otherwise, the length of dest is returned. + */ +static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, + const char *type, size_t typelen, char **desc) +{ + char *cp; + size_t desclen = typelen + namelen + 2; + + *desc = kmalloc(desclen, GFP_KERNEL); + if (!desc) + return -ENOMEM; + + cp = *desc; + memcpy(cp, type, typelen); + cp += typelen; + *cp++ = ':'; + + memcpy(cp, name, namelen); + cp += namelen; + *cp = '\0'; + return desclen; +} + +static ssize_t nfs_idmap_request_key(const char *name, size_t namelen, + const char *type, void *data, size_t data_size) +{ + const struct cred *saved_cred; + struct key *rkey; + char *desc; + struct user_key_payload *payload; + ssize_t ret; + + ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); + if (ret <= 0) + goto out; + + saved_cred = override_creds(id_resolver_cache); + rkey = request_key(&key_type_id_resolver, desc, ""); + revert_creds(saved_cred); + kfree(desc); + if (IS_ERR(rkey)) { + ret = PTR_ERR(rkey); + goto out; + } + + rcu_read_lock(); + rkey->perm |= KEY_USR_VIEW; + + ret = key_validate(rkey); + if (ret < 0) + goto out_up; + + payload = rcu_dereference(rkey->payload.data); + if (IS_ERR_OR_NULL(payload)) { + ret = PTR_ERR(payload); + goto out_up; + } + + ret = payload->datalen; + if (ret > 0 && ret <= data_size) + memcpy(data, payload->data, ret); + else + ret = -EINVAL; + +out_up: + rcu_read_unlock(); + key_put(rkey); +out: + return ret; +} + + +/* ID -> Name */ +static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen) +{ + char id_str[NFS_UINT_MAXLEN]; + int id_len; + ssize_t ret; + + id_len = snprintf(id_str, sizeof(id_str), "%u", id); + ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen); + if (ret < 0) + return -EINVAL; + return ret; +} + +/* Name -> ID */ +static int nfs_idmap_lookup_id(const char *name, size_t namelen, + const char *type, __u32 *id) +{ + char id_str[NFS_UINT_MAXLEN]; + long id_long; + ssize_t data_size; + int ret = 0; + + data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN); + if (data_size <= 0) { + ret = -EINVAL; + } else { + ret = strict_strtol(id_str, 10, &id_long); + *id = (__u32)id_long; + } + return ret; +} + +int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) +{ + return nfs_idmap_lookup_id(name, namelen, "uid", uid); +} + +int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid) +{ + return nfs_idmap_lookup_id(name, namelen, "gid", gid); +} + +int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) +{ + return nfs_idmap_lookup_name(uid, "user", buf, buflen); +} +int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen) +{ + return nfs_idmap_lookup_name(gid, "group", buf, buflen); +} + +#else /* CONFIG_NFS_USE_IDMAPPER not defined */ + #include <linux/module.h> #include <linux/mutex.h> #include <linux/init.h> @@ -503,16 +709,17 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); } -int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf) +int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) { struct idmap *idmap = clp->cl_idmap; return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); } -int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf) +int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) { struct idmap *idmap = clp->cl_idmap; return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); } +#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7d2d6c72aa78..314f57164602 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -48,6 +48,7 @@ #include "internal.h" #include "fscache.h" #include "dns_resolve.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -234,9 +235,6 @@ nfs_init_locked(struct inode *inode, void *opaque) return 0; } -/* Don't use READDIRPLUS on directories that we believe are too large */ -#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE) - /* * This is our front-end to iget that looks up inodes by file handle * instead of inode number. @@ -291,8 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) } else if (S_ISDIR(inode->i_mode)) { inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; - if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) - && fattr->size <= NFS_LIMIT_READDIRPLUS) + if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); /* Deal with crossing mountpoints */ if ((fattr->valid & NFS_ATTR_FATTR_FSID) @@ -623,7 +620,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) nfs_revalidate_inode(server, inode); } -static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred) +struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode) { struct nfs_open_context *ctx; @@ -633,11 +630,13 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct path_get(&ctx->path); ctx->cred = get_rpccred(cred); ctx->state = NULL; + ctx->mode = f_mode; ctx->flags = 0; ctx->error = 0; ctx->dir_cookie = 0; nfs_init_lock_context(&ctx->lock_context); ctx->lock_context.open_context = ctx; + INIT_LIST_HEAD(&ctx->list); } return ctx; } @@ -653,11 +652,15 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { struct inode *inode = ctx->path.dentry->d_inode; - if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + if (!list_empty(&ctx->list)) { + if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + return; + list_del(&ctx->list); + spin_unlock(&inode->i_lock); + } else if (!atomic_dec_and_test(&ctx->lock_context.count)) return; - list_del(&ctx->list); - spin_unlock(&inode->i_lock); - NFS_PROTO(inode)->close_context(ctx, is_sync); + if (inode != NULL) + NFS_PROTO(inode)->close_context(ctx, is_sync); if (ctx->cred != NULL) put_rpccred(ctx->cred); path_put(&ctx->path); @@ -673,7 +676,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx) * Ensure that mmap has a recent RPC credential for use when writing out * shared pages */ -static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) +void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) { struct inode *inode = filp->f_path.dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); @@ -730,11 +733,10 @@ int nfs_open(struct inode *inode, struct file *filp) cred = rpc_lookup_cred(); if (IS_ERR(cred)) return PTR_ERR(cred); - ctx = alloc_nfs_open_context(&filp->f_path, cred); + ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode); put_rpccred(cred); if (ctx == NULL) return -ENOMEM; - ctx->mode = filp->f_mode; nfs_file_set_open_context(filp, ctx); put_nfs_open_context(ctx); nfs_fscache_set_inode_cookie(inode, filp); @@ -1409,6 +1411,7 @@ void nfs4_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); + pnfs_destroy_layout(NFS_I(inode)); /* If we are holding a delegation, return it! */ nfs_inode_return_delegation_noreclaim(inode); /* First call standard NFS clear_inode() code */ @@ -1446,6 +1449,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) nfsi->delegation = NULL; nfsi->delegation_state = 0; init_rwsem(&nfsi->rwsem); + nfsi->layout = NULL; #endif } @@ -1493,7 +1497,7 @@ static int nfsiod_start(void) { struct workqueue_struct *wq; dprintk("RPC: creating workqueue nfsiod\n"); - wq = create_singlethread_workqueue("nfsiod"); + wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0); if (wq == NULL) return -ENOMEM; nfsiod_workqueue = wq; @@ -1521,6 +1525,10 @@ static int __init init_nfs_fs(void) { int err; + err = nfs_idmap_init(); + if (err < 0) + goto out9; + err = nfs_dns_resolver_init(); if (err < 0) goto out8; @@ -1585,6 +1593,8 @@ out6: out7: nfs_dns_resolver_destroy(); out8: + nfs_idmap_quit(); +out9: return err; } @@ -1597,6 +1607,7 @@ static void __exit exit_nfs_fs(void) nfs_destroy_nfspagecache(); nfs_fscache_unregister(); nfs_dns_resolver_destroy(); + nfs_idmap_quit(); #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index c961bc92c107..db08ff3ff454 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -63,6 +63,12 @@ struct nfs_clone_mount { #define NFS_UNSPEC_PORT (-1) /* + * Maximum number of pages that readdir can use for creating + * a vmapped array of pages. + */ +#define NFS_MAX_READDIR_PAGES 8 + +/* * In-kernel mount arguments */ struct nfs_parsed_mount_data { @@ -181,15 +187,15 @@ extern void nfs_destroy_directcache(void); /* nfs2xdr.c */ extern int nfs_stat_to_errno(int); extern struct rpc_procinfo nfs_procedures[]; -extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int); +extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); /* nfs3xdr.c */ extern struct rpc_procinfo nfs3_procedures[]; -extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int); +extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); /* nfs4xdr.c */ #ifdef CONFIG_NFS_V4 -extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); +extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); #endif #ifdef CONFIG_NFS_V4_1 extern const u32 nfs41_maxread_overhead; diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 59047f8d7d72..eceafe74f473 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -153,6 +153,7 @@ int nfs_mount(struct nfs_mount_request *info) .rpc_resp = &result, }; struct rpc_create_args args = { + .net = &init_net, .protocol = info->protocol, .address = info->sap, .addrsize = info->salen, @@ -224,6 +225,7 @@ void nfs_umount(const struct nfs_mount_request *info) .to_retries = 2, }; struct rpc_create_args args = { + .net = &init_net, .protocol = IPPROTO_UDP, .address = info->sap, .addrsize = info->salen, @@ -436,7 +438,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res) for (i = 0; i < entries; i++) { flavors[i] = ntohl(*p++); - dprintk("NFS:\tflavor %u: %d\n", i, flavors[i]); + dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]); } *count = i; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index db8846a0e82e..e6bf45710cc7 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -337,10 +337,10 @@ nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args) static int nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) { - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_array(p, args->fromname, args->fromlen); - p = xdr_encode_fhandle(p, args->tofh); - p = xdr_encode_array(p, args->toname, args->tolen); + p = xdr_encode_fhandle(p, args->old_dir); + p = xdr_encode_array(p, args->old_name->name, args->old_name->len); + p = xdr_encode_fhandle(p, args->new_dir); + p = xdr_encode_array(p, args->new_name->name, args->new_name->len); req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); return 0; } @@ -423,9 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) struct page **page; size_t hdrlen; unsigned int pglen, recvd; - u32 len; int status, nr = 0; - __be32 *end, *entry, *kaddr; if ((status = ntohl(*p++))) return nfs_stat_to_errno(status); @@ -445,80 +443,59 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) if (pglen > recvd) pglen = recvd; page = rcvbuf->pages; - kaddr = p = kmap_atomic(*page, KM_USER0); - end = (__be32 *)((char *)p + pglen); - entry = p; - - /* Make sure the packet actually has a value_follows and EOF entry */ - if ((entry + 1) > end) - goto short_pkt; - - for (; *p++; nr++) { - if (p + 2 > end) - goto short_pkt; - p++; /* fileid */ - len = ntohl(*p++); - p += XDR_QUADLEN(len) + 1; /* name plus cookie */ - if (len > NFS2_MAXNAMLEN) { - dprintk("NFS: giant filename in readdir (len 0x%x)!\n", - len); - goto err_unmap; - } - if (p + 2 > end) - goto short_pkt; - entry = p; - } - - /* - * Apparently some server sends responses that are a valid size, but - * contain no entries, and have value_follows==0 and EOF==0. For - * those, just set the EOF marker. - */ - if (!nr && entry[1] == 0) { - dprintk("NFS: readdir reply truncated!\n"); - entry[1] = 1; - } - out: - kunmap_atomic(kaddr, KM_USER0); return nr; - short_pkt: - /* - * When we get a short packet there are 2 possibilities. We can - * return an error, or fix up the response to look like a valid - * response and return what we have so far. If there are no - * entries and the packet was short, then return -EIO. If there - * are valid entries in the response, return them and pretend that - * the call was successful, but incomplete. The caller can retry the - * readdir starting at the last cookie. - */ - entry[0] = entry[1] = 0; - if (!nr) - nr = -errno_NFSERR_IO; - goto out; -err_unmap: - nr = -errno_NFSERR_IO; - goto out; +} + +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("nfs: %s: prematurely hit end of receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); } __be32 * -nfs_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) +nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus) { - if (!*p++) { - if (!*p) + __be32 *p; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (!ntohl(*p++)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (!ntohl(*p++)) return ERR_PTR(-EAGAIN); entry->eof = 1; return ERR_PTR(-EBADCOOKIE); } + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + entry->ino = ntohl(*p++); entry->len = ntohl(*p++); + + p = xdr_inline_decode(xdr, entry->len + 4); + if (unlikely(!p)) + goto out_overflow; entry->name = (const char *) p; p += XDR_QUADLEN(entry->len); entry->prev_cookie = entry->cookie; entry->cookie = ntohl(*p++); - entry->eof = !p[0] && p[1]; + + p = xdr_inline_peek(xdr, 8); + if (p != NULL) + entry->eof = !p[0] && p[1]; + else + entry->eof = 0; return p; + +out_overflow: + print_overflow_msg(__func__, xdr); + return ERR_PTR(-EIO); } /* @@ -596,7 +573,6 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy) struct kvec *iov = rcvbuf->head; size_t hdrlen; u32 len, recvd; - char *kaddr; int status; if ((status = ntohl(*p++))) @@ -623,10 +599,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy) return -EIO; } - /* NULL terminate the string we got */ - kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0); - kaddr[len+rcvbuf->page_base] = '\0'; - kunmap_atomic(kaddr, KM_USER0); + xdr_terminate_string(rcvbuf, len); return 0; } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index fabb4f2849a1..ce939c062a52 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -313,7 +313,7 @@ static void nfs3_free_createdata(struct nfs3_createdata *data) */ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags, struct nameidata *nd) + int flags, struct nfs_open_context *ctx) { struct nfs3_createdata *data; mode_t mode = sattr->ia_mode; @@ -438,19 +438,38 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir) return 1; } +static void +nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME]; +} + +static int +nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + struct nfs_renameres *res; + + if (nfs3_async_handle_jukebox(task, old_dir)) + return 0; + res = task->tk_msg.rpc_resp; + + nfs_post_op_update_inode(old_dir, res->old_fattr); + nfs_post_op_update_inode(new_dir, res->new_fattr); + return 1; +} + static int nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, struct inode *new_dir, struct qstr *new_name) { - struct nfs3_renameargs arg = { - .fromfh = NFS_FH(old_dir), - .fromname = old_name->name, - .fromlen = old_name->len, - .tofh = NFS_FH(new_dir), - .toname = new_name->name, - .tolen = new_name->len + struct nfs_renameargs arg = { + .old_dir = NFS_FH(old_dir), + .old_name = old_name, + .new_dir = NFS_FH(new_dir), + .new_name = new_name, }; - struct nfs3_renameres res; + struct nfs_renameres res; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], .rpc_argp = &arg, @@ -460,17 +479,17 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); - res.fromattr = nfs_alloc_fattr(); - res.toattr = nfs_alloc_fattr(); - if (res.fromattr == NULL || res.toattr == NULL) + res.old_fattr = nfs_alloc_fattr(); + res.new_fattr = nfs_alloc_fattr(); + if (res.old_fattr == NULL || res.new_fattr == NULL) goto out; status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); - nfs_post_op_update_inode(old_dir, res.fromattr); - nfs_post_op_update_inode(new_dir, res.toattr); + nfs_post_op_update_inode(old_dir, res.old_fattr); + nfs_post_op_update_inode(new_dir, res.new_fattr); out: - nfs_free_fattr(res.toattr); - nfs_free_fattr(res.fromattr); + nfs_free_fattr(res.old_fattr); + nfs_free_fattr(res.new_fattr); dprintk("NFS reply rename: %d\n", status); return status; } @@ -611,7 +630,7 @@ out: */ static int nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page *page, unsigned int count, int plus) + u64 cookie, struct page **pages, unsigned int count, int plus) { struct inode *dir = dentry->d_inode; __be32 *verf = NFS_COOKIEVERF(dir); @@ -621,7 +640,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, .verf = {verf[0], verf[1]}, .plus = plus, .count = count, - .pages = &page + .pages = pages }; struct nfs3_readdirres res = { .verf = verf, @@ -652,7 +671,8 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, nfs_free_fattr(res.dir_attr); out: - dprintk("NFS reply readdir: %d\n", status); + dprintk("NFS reply readdir%s: %d\n", + plus? "plus" : "", status); return status; } @@ -722,7 +742,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("NFS call fsstat\n"); nfs_fattr_init(stat->fattr); status = rpc_call_sync(server->client, &msg, 0); - dprintk("NFS reply statfs: %d\n", status); + dprintk("NFS reply fsstat: %d\n", status); return status; } @@ -844,6 +864,8 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .unlink_setup = nfs3_proc_unlink_setup, .unlink_done = nfs3_proc_unlink_done, .rename = nfs3_proc_rename, + .rename_setup = nfs3_proc_rename_setup, + .rename_done = nfs3_proc_rename_done, .link = nfs3_proc_link, .symlink = nfs3_proc_symlink, .mkdir = nfs3_proc_mkdir, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 9769704f8ce6..d9a5e832c257 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -100,6 +100,13 @@ static const umode_t nfs_type2fmt[] = { [NF3FIFO] = S_IFIFO, }; +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("nfs: %s: prematurely hit end of receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + /* * Common NFS XDR functions as inlines */ @@ -119,6 +126,29 @@ xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh) return NULL; } +static inline __be32 * +xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + fh->size = ntohl(*p++); + + if (fh->size <= NFS3_FHSIZE) { + p = xdr_inline_decode(xdr, fh->size); + if (unlikely(!p)) + goto out_overflow; + memcpy(fh->data, p, fh->size); + return p + XDR_QUADLEN(fh->size); + } + return NULL; + +out_overflow: + print_overflow_msg(__func__, xdr); + return ERR_PTR(-EIO); +} + /* * Encode/decode time. */ @@ -241,6 +271,26 @@ xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr) } static inline __be32 * +xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (ntohl(*p++)) { + p = xdr_inline_decode(xdr, 84); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_fattr(p, fattr); + } + return p; +out_overflow: + print_overflow_msg(__func__, xdr); + return ERR_PTR(-EIO); +} + +static inline __be32 * xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr) { if (*p++) @@ -442,12 +492,12 @@ nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args) * Encode RENAME arguments */ static int -nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs3_renameargs *args) +nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) { - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_array(p, args->fromname, args->fromlen); - p = xdr_encode_fhandle(p, args->tofh); - p = xdr_encode_array(p, args->toname, args->tolen); + p = xdr_encode_fhandle(p, args->old_dir); + p = xdr_encode_array(p, args->old_name->name, args->old_name->len); + p = xdr_encode_fhandle(p, args->new_dir); + p = xdr_encode_array(p, args->new_name->name, args->new_name->len); req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); return 0; } @@ -504,9 +554,8 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res struct kvec *iov = rcvbuf->head; struct page **page; size_t hdrlen; - u32 len, recvd, pglen; + u32 recvd, pglen; int status, nr = 0; - __be32 *entry, *end, *kaddr; status = ntohl(*p++); /* Decode post_op_attrs */ @@ -536,99 +585,38 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res if (pglen > recvd) pglen = recvd; page = rcvbuf->pages; - kaddr = p = kmap_atomic(*page, KM_USER0); - end = (__be32 *)((char *)p + pglen); - entry = p; - - /* Make sure the packet actually has a value_follows and EOF entry */ - if ((entry + 1) > end) - goto short_pkt; - - for (; *p++; nr++) { - if (p + 3 > end) - goto short_pkt; - p += 2; /* inode # */ - len = ntohl(*p++); /* string length */ - p += XDR_QUADLEN(len) + 2; /* name + cookie */ - if (len > NFS3_MAXNAMLEN) { - dprintk("NFS: giant filename in readdir (len 0x%x)!\n", - len); - goto err_unmap; - } - if (res->plus) { - /* post_op_attr */ - if (p + 2 > end) - goto short_pkt; - if (*p++) { - p += 21; - if (p + 1 > end) - goto short_pkt; - } - /* post_op_fh3 */ - if (*p++) { - if (p + 1 > end) - goto short_pkt; - len = ntohl(*p++); - if (len > NFS3_FHSIZE) { - dprintk("NFS: giant filehandle in " - "readdir (len 0x%x)!\n", len); - goto err_unmap; - } - p += XDR_QUADLEN(len); - } - } - - if (p + 2 > end) - goto short_pkt; - entry = p; - } - - /* - * Apparently some server sends responses that are a valid size, but - * contain no entries, and have value_follows==0 and EOF==0. For - * those, just set the EOF marker. - */ - if (!nr && entry[1] == 0) { - dprintk("NFS: readdir reply truncated!\n"); - entry[1] = 1; - } - out: - kunmap_atomic(kaddr, KM_USER0); return nr; - short_pkt: - /* - * When we get a short packet there are 2 possibilities. We can - * return an error, or fix up the response to look like a valid - * response and return what we have so far. If there are no - * entries and the packet was short, then return -EIO. If there - * are valid entries in the response, return them and pretend that - * the call was successful, but incomplete. The caller can retry the - * readdir starting at the last cookie. - */ - entry[0] = entry[1] = 0; - if (!nr) - nr = -errno_NFSERR_IO; - goto out; -err_unmap: - nr = -errno_NFSERR_IO; - goto out; } __be32 * -nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) +nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus) { + __be32 *p; struct nfs_entry old = *entry; - if (!*p++) { - if (!*p) + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (!ntohl(*p++)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (!ntohl(*p++)) return ERR_PTR(-EAGAIN); entry->eof = 1; return ERR_PTR(-EBADCOOKIE); } + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, &entry->ino); entry->len = ntohl(*p++); + + p = xdr_inline_decode(xdr, entry->len + 8); + if (unlikely(!p)) + goto out_overflow; entry->name = (const char *) p; p += XDR_QUADLEN(entry->len); entry->prev_cookie = entry->cookie; @@ -636,10 +624,17 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) if (plus) { entry->fattr->valid = 0; - p = xdr_decode_post_op_attr(p, entry->fattr); + p = xdr_decode_post_op_attr_stream(xdr, entry->fattr); + if (IS_ERR(p)) + goto out_overflow_exit; /* In fact, a post_op_fh3: */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; if (*p++) { - p = xdr_decode_fhandle(p, entry->fh); + p = xdr_decode_fhandle_stream(xdr, entry->fh); + if (IS_ERR(p)) + goto out_overflow_exit; /* Ugh -- server reply was truncated */ if (p == NULL) { dprintk("NFS: FH truncated\n"); @@ -650,8 +645,18 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) memset((u8*)(entry->fh), 0, sizeof(*entry->fh)); } - entry->eof = !p[0] && p[1]; + p = xdr_inline_peek(xdr, 8); + if (p != NULL) + entry->eof = !p[0] && p[1]; + else + entry->eof = 0; + return p; + +out_overflow: + print_overflow_msg(__func__, xdr); +out_overflow_exit: + return ERR_PTR(-EIO); } /* @@ -824,7 +829,6 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) struct kvec *iov = rcvbuf->head; size_t hdrlen; u32 len, recvd; - char *kaddr; int status; status = ntohl(*p++); @@ -857,10 +861,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) return -EIO; } - /* NULL terminate the string we got */ - kaddr = (char*)kmap_atomic(rcvbuf->pages[0], KM_USER0); - kaddr[len+rcvbuf->page_base] = '\0'; - kunmap_atomic(kaddr, KM_USER0); + xdr_terminate_string(rcvbuf, len); return 0; } @@ -970,14 +971,14 @@ nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) * Decode RENAME reply */ static int -nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res) +nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res) { int status; if ((status = ntohl(*p++)) != 0) status = nfs_stat_to_errno(status); - p = xdr_decode_wcc_data(p, res->fromattr); - p = xdr_decode_wcc_data(p, res->toattr); + p = xdr_decode_wcc_data(p, res->old_fattr); + p = xdr_decode_wcc_data(p, res->new_fattr); return status; } @@ -1043,8 +1044,9 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res) res->wtmult = ntohl(*p++); res->dtpref = ntohl(*p++); p = xdr_decode_hyper(p, &res->maxfilesize); + p = xdr_decode_time3(p, &res->time_delta); - /* ignore time_delta and properties */ + /* ignore properties */ res->lease_time = 0; return 0; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 311e15cc8af0..9fa496387fdf 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -242,8 +242,6 @@ extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); -extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); -extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page); @@ -333,7 +331,7 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid); extern const nfs4_stateid zero_stateid; /* nfs4xdr.c */ -extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); +extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); extern struct rpc_procinfo nfs4_procedures[]; struct nfs4_mount_data; diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c new file mode 100644 index 000000000000..2e92f0d8d654 --- /dev/null +++ b/fs/nfs/nfs4filelayout.c @@ -0,0 +1,280 @@ +/* + * Module for the pnfs nfs4 file layout driver. + * Defines all I/O and Policy interface operations, plus code + * to register itself with the pNFS client. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include <linux/nfs_fs.h> + +#include "internal.h" +#include "nfs4filelayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); +MODULE_DESCRIPTION("The NFSv4 file layout driver"); + +static int +filelayout_set_layoutdriver(struct nfs_server *nfss) +{ + int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client, + nfs4_fl_free_deviceid_callback); + if (status) { + printk(KERN_WARNING "%s: deviceid cache could not be " + "initialized\n", __func__); + return status; + } + dprintk("%s: deviceid cache has been initialized successfully\n", + __func__); + return 0; +} + +/* Clear out the layout by destroying its device list */ +static int +filelayout_clear_layoutdriver(struct nfs_server *nfss) +{ + dprintk("--> %s\n", __func__); + + if (nfss->nfs_client->cl_devid_cache) + pnfs_put_deviceid_cache(nfss->nfs_client); + return 0; +} + +/* + * filelayout_check_layout() + * + * Make sure layout segment parameters are sane WRT the device. + * At this point no generic layer initialization of the lseg has occurred, + * and nothing has been added to the layout_hdr cache. + * + */ +static int +filelayout_check_layout(struct pnfs_layout_hdr *lo, + struct nfs4_filelayout_segment *fl, + struct nfs4_layoutget_res *lgr, + struct nfs4_deviceid *id) +{ + struct nfs4_file_layout_dsaddr *dsaddr; + int status = -EINVAL; + struct nfs_server *nfss = NFS_SERVER(lo->inode); + + dprintk("--> %s\n", __func__); + + if (fl->pattern_offset > lgr->range.offset) { + dprintk("%s pattern_offset %lld to large\n", + __func__, fl->pattern_offset); + goto out; + } + + if (fl->stripe_unit % PAGE_SIZE) { + dprintk("%s Stripe unit (%u) not page aligned\n", + __func__, fl->stripe_unit); + goto out; + } + + /* find and reference the deviceid */ + dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); + if (dsaddr == NULL) { + dsaddr = get_device_info(lo->inode, id); + if (dsaddr == NULL) + goto out; + } + fl->dsaddr = dsaddr; + + if (fl->first_stripe_index < 0 || + fl->first_stripe_index >= dsaddr->stripe_count) { + dprintk("%s Bad first_stripe_index %d\n", + __func__, fl->first_stripe_index); + goto out_put; + } + + if ((fl->stripe_type == STRIPE_SPARSE && + fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) || + (fl->stripe_type == STRIPE_DENSE && + fl->num_fh != dsaddr->stripe_count)) { + dprintk("%s num_fh %u not valid for given packing\n", + __func__, fl->num_fh); + goto out_put; + } + + if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { + dprintk("%s Stripe unit (%u) not aligned with rsize %u " + "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, + nfss->wsize); + } + + status = 0; +out: + dprintk("--> %s returns %d\n", __func__, status); + return status; +out_put: + pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid); + goto out; +} + +static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) +{ + int i; + + for (i = 0; i < fl->num_fh; i++) { + if (!fl->fh_array[i]) + break; + kfree(fl->fh_array[i]); + } + kfree(fl->fh_array); + fl->fh_array = NULL; +} + +static void +_filelayout_free_lseg(struct nfs4_filelayout_segment *fl) +{ + filelayout_free_fh_array(fl); + kfree(fl); +} + +static int +filelayout_decode_layout(struct pnfs_layout_hdr *flo, + struct nfs4_filelayout_segment *fl, + struct nfs4_layoutget_res *lgr, + struct nfs4_deviceid *id) +{ + uint32_t *p = (uint32_t *)lgr->layout.buf; + uint32_t nfl_util; + int i; + + dprintk("%s: set_layout_map Begin\n", __func__); + + memcpy(id, p, sizeof(*id)); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + print_deviceid(id); + + nfl_util = be32_to_cpup(p++); + if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) + fl->commit_through_mds = 1; + if (nfl_util & NFL4_UFLG_DENSE) + fl->stripe_type = STRIPE_DENSE; + else + fl->stripe_type = STRIPE_SPARSE; + fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; + + fl->first_stripe_index = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &fl->pattern_offset); + fl->num_fh = be32_to_cpup(p++); + + dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n", + __func__, nfl_util, fl->num_fh, fl->first_stripe_index, + fl->pattern_offset); + + fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), + GFP_KERNEL); + if (!fl->fh_array) + return -ENOMEM; + + for (i = 0; i < fl->num_fh; i++) { + /* Do we want to use a mempool here? */ + fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); + if (!fl->fh_array[i]) { + filelayout_free_fh_array(fl); + return -ENOMEM; + } + fl->fh_array[i]->size = be32_to_cpup(p++); + if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { + printk(KERN_ERR "Too big fh %d received %d\n", + i, fl->fh_array[i]->size); + filelayout_free_fh_array(fl); + return -EIO; + } + memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); + p += XDR_QUADLEN(fl->fh_array[i]->size); + dprintk("DEBUG: %s: fh len %d\n", __func__, + fl->fh_array[i]->size); + } + + return 0; +} + +static struct pnfs_layout_segment * +filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, + struct nfs4_layoutget_res *lgr) +{ + struct nfs4_filelayout_segment *fl; + int rc; + struct nfs4_deviceid id; + + dprintk("--> %s\n", __func__); + fl = kzalloc(sizeof(*fl), GFP_KERNEL); + if (!fl) + return NULL; + + rc = filelayout_decode_layout(layoutid, fl, lgr, &id); + if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) { + _filelayout_free_lseg(fl); + return NULL; + } + return &fl->generic_hdr; +} + +static void +filelayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode); + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + + dprintk("--> %s\n", __func__); + pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, + &fl->dsaddr->deviceid); + _filelayout_free_lseg(fl); +} + +static struct pnfs_layoutdriver_type filelayout_type = { + .id = LAYOUT_NFSV4_1_FILES, + .name = "LAYOUT_NFSV4_1_FILES", + .owner = THIS_MODULE, + .set_layoutdriver = filelayout_set_layoutdriver, + .clear_layoutdriver = filelayout_clear_layoutdriver, + .alloc_lseg = filelayout_alloc_lseg, + .free_lseg = filelayout_free_lseg, +}; + +static int __init nfs4filelayout_init(void) +{ + printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", + __func__); + return pnfs_register_layoutdriver(&filelayout_type); +} + +static void __exit nfs4filelayout_exit(void) +{ + printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", + __func__); + pnfs_unregister_layoutdriver(&filelayout_type); +} + +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h new file mode 100644 index 000000000000..bbf60dd2ab9d --- /dev/null +++ b/fs/nfs/nfs4filelayout.h @@ -0,0 +1,94 @@ +/* + * NFSv4 file layout driver data structures. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#ifndef FS_NFS_NFS4FILELAYOUT_H +#define FS_NFS_NFS4FILELAYOUT_H + +#include "pnfs.h" + +/* + * Field testing shows we need to support upto 4096 stripe indices. + * We store each index as a u8 (u32 on the wire) to keep the memory footprint + * reasonable. This in turn means we support a maximum of 256 + * RFC 5661 multipath_list4 structures. + */ +#define NFS4_PNFS_MAX_STRIPE_CNT 4096 +#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */ + +enum stripetype4 { + STRIPE_SPARSE = 1, + STRIPE_DENSE = 2 +}; + +/* Individual ip address */ +struct nfs4_pnfs_ds { + struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ + u32 ds_ip_addr; + u32 ds_port; + struct nfs_client *ds_clp; + atomic_t ds_count; +}; + +struct nfs4_file_layout_dsaddr { + struct pnfs_deviceid_node deviceid; + u32 stripe_count; + u8 *stripe_indices; + u32 ds_num; + struct nfs4_pnfs_ds *ds_list[1]; +}; + +struct nfs4_filelayout_segment { + struct pnfs_layout_segment generic_hdr; + u32 stripe_type; + u32 commit_through_mds; + u32 stripe_unit; + u32 first_stripe_index; + u64 pattern_offset; + struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ + unsigned int num_fh; + struct nfs_fh **fh_array; +}; + +static inline struct nfs4_filelayout_segment * +FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) +{ + return container_of(lseg, + struct nfs4_filelayout_segment, + generic_hdr); +} + +extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); +extern void print_ds(struct nfs4_pnfs_ds *ds); +extern void print_deviceid(struct nfs4_deviceid *dev_id); +extern struct nfs4_file_layout_dsaddr * +nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); +struct nfs4_file_layout_dsaddr * +get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id); + +#endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c new file mode 100644 index 000000000000..51fe64ace55a --- /dev/null +++ b/fs/nfs/nfs4filelayoutdev.c @@ -0,0 +1,448 @@ +/* + * Device operations for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * Garth Goodson <Garth.Goodson@netapp.com> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include <linux/nfs_fs.h> +#include <linux/vmalloc.h> + +#include "internal.h" +#include "nfs4filelayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +/* + * Data server cache + * + * Data servers can be mapped to different device ids. + * nfs4_pnfs_ds reference counting + * - set to 1 on allocation + * - incremented when a device id maps a data server already in the cache. + * - decremented when deviceid is removed from the cache. + */ +DEFINE_SPINLOCK(nfs4_ds_cache_lock); +static LIST_HEAD(nfs4_data_server_cache); + +/* Debug routines */ +void +print_ds(struct nfs4_pnfs_ds *ds) +{ + if (ds == NULL) { + printk("%s NULL device\n", __func__); + return; + } + printk(" ip_addr %x port %hu\n" + " ref count %d\n" + " client %p\n" + " cl_exchange_flags %x\n", + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + atomic_read(&ds->ds_count), ds->ds_clp, + ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); +} + +void +print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr) +{ + int i; + + ifdebug(FACILITY) { + printk("%s dsaddr->ds_num %d\n", __func__, + dsaddr->ds_num); + for (i = 0; i < dsaddr->ds_num; i++) + print_ds(dsaddr->ds_list[i]); + } +} + +void print_deviceid(struct nfs4_deviceid *id) +{ + u32 *p = (u32 *)id; + + dprintk("%s: device id= [%x%x%x%x]\n", __func__, + p[0], p[1], p[2], p[3]); +} + +/* nfs4_ds_cache_lock is held */ +static struct nfs4_pnfs_ds * +_data_server_lookup_locked(u32 ip_addr, u32 port) +{ + struct nfs4_pnfs_ds *ds; + + dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", + ntohl(ip_addr), ntohs(port)); + + list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { + if (ds->ds_ip_addr == ip_addr && + ds->ds_port == port) { + return ds; + } + } + return NULL; +} + +static void +destroy_ds(struct nfs4_pnfs_ds *ds) +{ + dprintk("--> %s\n", __func__); + ifdebug(FACILITY) + print_ds(ds); + + if (ds->ds_clp) + nfs_put_client(ds->ds_clp); + kfree(ds); +} + +static void +nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) +{ + struct nfs4_pnfs_ds *ds; + int i; + + print_deviceid(&dsaddr->deviceid.de_id); + + for (i = 0; i < dsaddr->ds_num; i++) { + ds = dsaddr->ds_list[i]; + if (ds != NULL) { + if (atomic_dec_and_lock(&ds->ds_count, + &nfs4_ds_cache_lock)) { + list_del_init(&ds->ds_node); + spin_unlock(&nfs4_ds_cache_lock); + destroy_ds(ds); + } + } + } + kfree(dsaddr->stripe_indices); + kfree(dsaddr); +} + +void +nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device) +{ + struct nfs4_file_layout_dsaddr *dsaddr = + container_of(device, struct nfs4_file_layout_dsaddr, deviceid); + + nfs4_fl_free_deviceid(dsaddr); +} + +static struct nfs4_pnfs_ds * +nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port) +{ + struct nfs4_pnfs_ds *tmp_ds, *ds; + + ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL); + if (!ds) + goto out; + + spin_lock(&nfs4_ds_cache_lock); + tmp_ds = _data_server_lookup_locked(ip_addr, port); + if (tmp_ds == NULL) { + ds->ds_ip_addr = ip_addr; + ds->ds_port = port; + atomic_set(&ds->ds_count, 1); + INIT_LIST_HEAD(&ds->ds_node); + ds->ds_clp = NULL; + list_add(&ds->ds_node, &nfs4_data_server_cache); + dprintk("%s add new data server ip 0x%x\n", __func__, + ds->ds_ip_addr); + } else { + kfree(ds); + atomic_inc(&tmp_ds->ds_count); + dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", + __func__, tmp_ds->ds_ip_addr, + atomic_read(&tmp_ds->ds_count)); + ds = tmp_ds; + } + spin_unlock(&nfs4_ds_cache_lock); +out: + return ds; +} + +/* + * Currently only support ipv4, and one multi-path address. + */ +static struct nfs4_pnfs_ds * +decode_and_add_ds(__be32 **pp, struct inode *inode) +{ + struct nfs4_pnfs_ds *ds = NULL; + char *buf; + const char *ipend, *pstr; + u32 ip_addr, port; + int nlen, rlen, i; + int tmp[2]; + __be32 *r_netid, *r_addr, *p = *pp; + + /* r_netid */ + nlen = be32_to_cpup(p++); + r_netid = p; + p += XDR_QUADLEN(nlen); + + /* r_addr */ + rlen = be32_to_cpup(p++); + r_addr = p; + p += XDR_QUADLEN(rlen); + *pp = p; + + /* Check that netid is "tcp" */ + if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) { + dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); + goto out_err; + } + + /* ipv6 length plus port is legal */ + if (rlen > INET6_ADDRSTRLEN + 8) { + dprintk("%s Invalid address, length %d\n", __func__, + rlen); + goto out_err; + } + buf = kmalloc(rlen + 1, GFP_KERNEL); + buf[rlen] = '\0'; + memcpy(buf, r_addr, rlen); + + /* replace the port dots with dashes for the in4_pton() delimiter*/ + for (i = 0; i < 2; i++) { + char *res = strrchr(buf, '.'); + *res = '-'; + } + + /* Currently only support ipv4 address */ + if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) { + dprintk("%s: Only ipv4 addresses supported\n", __func__); + goto out_free; + } + + /* port */ + pstr = ipend; + sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]); + port = htons((tmp[0] << 8) | (tmp[1])); + + ds = nfs4_pnfs_ds_add(inode, ip_addr, port); + dprintk("%s Decoded address and port %s\n", __func__, buf); +out_free: + kfree(buf); +out_err: + return ds; +} + +/* Decode opaque device data and return the result */ +static struct nfs4_file_layout_dsaddr* +decode_device(struct inode *ino, struct pnfs_device *pdev) +{ + int i, dummy; + u32 cnt, num; + u8 *indexp; + __be32 *p = (__be32 *)pdev->area, *indicesp; + struct nfs4_file_layout_dsaddr *dsaddr; + + /* Get the stripe count (number of stripe index) */ + cnt = be32_to_cpup(p++); + dprintk("%s stripe count %d\n", __func__, cnt); + if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { + printk(KERN_WARNING "%s: stripe count %d greater than " + "supported maximum %d\n", __func__, + cnt, NFS4_PNFS_MAX_STRIPE_CNT); + goto out_err; + } + + /* Check the multipath list count */ + indicesp = p; + p += XDR_QUADLEN(cnt << 2); + num = be32_to_cpup(p++); + dprintk("%s ds_num %u\n", __func__, num); + if (num > NFS4_PNFS_MAX_MULTI_CNT) { + printk(KERN_WARNING "%s: multipath count %d greater than " + "supported maximum %d\n", __func__, + num, NFS4_PNFS_MAX_MULTI_CNT); + goto out_err; + } + dsaddr = kzalloc(sizeof(*dsaddr) + + (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), + GFP_KERNEL); + if (!dsaddr) + goto out_err; + + dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL); + if (!dsaddr->stripe_indices) + goto out_err_free; + + dsaddr->stripe_count = cnt; + dsaddr->ds_num = num; + + memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id)); + + /* Go back an read stripe indices */ + p = indicesp; + indexp = &dsaddr->stripe_indices[0]; + for (i = 0; i < dsaddr->stripe_count; i++) { + *indexp = be32_to_cpup(p++); + if (*indexp >= num) + goto out_err_free; + indexp++; + } + /* Skip already read multipath list count */ + p++; + + for (i = 0; i < dsaddr->ds_num; i++) { + int j; + + dummy = be32_to_cpup(p++); /* multipath count */ + if (dummy > 1) { + printk(KERN_WARNING + "%s: Multipath count %d not supported, " + "skipping all greater than 1\n", __func__, + dummy); + } + for (j = 0; j < dummy; j++) { + if (j == 0) { + dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); + if (dsaddr->ds_list[i] == NULL) + goto out_err_free; + } else { + u32 len; + /* skip extra multipath */ + len = be32_to_cpup(p++); + p += XDR_QUADLEN(len); + len = be32_to_cpup(p++); + p += XDR_QUADLEN(len); + continue; + } + } + } + return dsaddr; + +out_err_free: + nfs4_fl_free_deviceid(dsaddr); +out_err: + dprintk("%s ERROR: returning NULL\n", __func__); + return NULL; +} + +/* + * Decode the opaque device specified in 'dev' + * and add it to the list of available devices. + * If the deviceid is already cached, nfs4_add_deviceid will return + * a pointer to the cached struct and throw away the new. + */ +static struct nfs4_file_layout_dsaddr* +decode_and_add_device(struct inode *inode, struct pnfs_device *dev) +{ + struct nfs4_file_layout_dsaddr *dsaddr; + struct pnfs_deviceid_node *d; + + dsaddr = decode_device(inode, dev); + if (!dsaddr) { + printk(KERN_WARNING "%s: Could not decode or add device\n", + __func__); + return NULL; + } + + d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, + &dsaddr->deviceid); + + return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); +} + +/* + * Retrieve the information for dev_id, add it to the list + * of available devices, and return it. + */ +struct nfs4_file_layout_dsaddr * +get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id) +{ + struct pnfs_device *pdev = NULL; + u32 max_resp_sz; + int max_pages; + struct page **pages = NULL; + struct nfs4_file_layout_dsaddr *dsaddr = NULL; + int rc, i; + struct nfs_server *server = NFS_SERVER(inode); + + /* + * Use the session max response size as the basis for setting + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + max_pages = max_resp_sz >> PAGE_SHIFT; + dprintk("%s inode %p max_resp_sz %u max_pages %d\n", + __func__, inode, max_resp_sz, max_pages); + + pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL); + if (pdev == NULL) + return NULL; + + pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); + if (pages == NULL) { + kfree(pdev); + return NULL; + } + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto out_free; + } + + /* set pdev->area */ + pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); + if (!pdev->area) + goto out_free; + + memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); + pdev->layout_type = LAYOUT_NFSV4_1_FILES; + pdev->pages = pages; + pdev->pgbase = 0; + pdev->pglen = PAGE_SIZE * max_pages; + pdev->mincount = 0; + + rc = nfs4_proc_getdeviceinfo(server, pdev); + dprintk("%s getdevice info returns %d\n", __func__, rc); + if (rc) + goto out_free; + + /* + * Found new device, need to decode it and then add it to the + * list of known devices for this mountpoint. + */ + dsaddr = decode_and_add_device(inode, pdev); +out_free: + if (pdev->area != NULL) + vunmap(pdev->area); + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); + kfree(pdev); + dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); + return dsaddr; +} + +struct nfs4_file_layout_dsaddr * +nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id) +{ + struct pnfs_deviceid_node *d; + + d = pnfs_find_get_deviceid(clp->cl_devid_cache, id); + return (d == NULL) ? NULL : + container_of(d, struct nfs4_file_layout_dsaddr, deviceid); +} diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 089da5b5d20a..32c8758c99fd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -55,6 +55,7 @@ #include "internal.h" #include "iostat.h" #include "callback.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -129,7 +130,8 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_LEASE_TIME, - 0 + FATTR4_WORD1_TIME_DELTA + | FATTR4_WORD1_FS_LAYOUT_TYPES }; const u32 nfs4_fs_locations_bitmap[2] = { @@ -255,9 +257,6 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, nfs4_state_mark_reclaim_nograce(clp, state); goto do_state_recovery; case -NFS4ERR_STALE_STATEID: - if (state == NULL) - break; - nfs4_state_mark_reclaim_reboot(clp, state); case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: goto do_state_recovery; @@ -334,10 +333,12 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp * Must be called while holding tbl->slot_tbl_lock */ static void -nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) +nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot) { + int free_slotid = free_slot - tbl->slots; int slotid = free_slotid; + BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE); /* clear used bit in bitmap */ __clear_bit(slotid, tbl->used_slots); @@ -379,7 +380,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) struct nfs4_slot_table *tbl; tbl = &res->sr_session->fc_slot_table; - if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { + if (!res->sr_slot) { /* just wake up the next guy waiting since * we may have not consumed a slot after all */ dprintk("%s: No slot\n", __func__); @@ -387,17 +388,15 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) } spin_lock(&tbl->slot_tbl_lock); - nfs4_free_slot(tbl, res->sr_slotid); + nfs4_free_slot(tbl, res->sr_slot); nfs41_check_drain_session_complete(res->sr_session); spin_unlock(&tbl->slot_tbl_lock); - res->sr_slotid = NFS4_MAX_SLOT_TABLE; + res->sr_slot = NULL; } static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { unsigned long timestamp; - struct nfs4_slot_table *tbl; - struct nfs4_slot *slot; struct nfs_client *clp; /* @@ -410,17 +409,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * res->sr_status = NFS_OK; /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */ - if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) + if (!res->sr_slot) goto out; - tbl = &res->sr_session->fc_slot_table; - slot = tbl->slots + res->sr_slotid; - /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: /* Update the slot's sequence and clientid lease timer */ - ++slot->seq_nr; + ++res->sr_slot->seq_nr; timestamp = res->sr_renewal_time; clp = res->sr_session->clp; do_renew_lease(clp, timestamp); @@ -433,12 +429,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * * returned NFS4ERR_DELAY as per Section 2.10.6.2 * of RFC5661. */ - dprintk("%s: slot=%d seq=%d: Operation in progress\n", - __func__, res->sr_slotid, slot->seq_nr); + dprintk("%s: slot=%ld seq=%d: Operation in progress\n", + __func__, + res->sr_slot - res->sr_session->fc_slot_table.slots, + res->sr_slot->seq_nr); goto out_retry; default: /* Just update the slot sequence no. */ - ++slot->seq_nr; + ++res->sr_slot->seq_nr; } out: /* The session may be reset by one of the error handlers. */ @@ -505,10 +503,9 @@ static int nfs41_setup_sequence(struct nfs4_session *session, dprintk("--> %s\n", __func__); /* slot already allocated? */ - if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) + if (res->sr_slot != NULL) return 0; - res->sr_slotid = NFS4_MAX_SLOT_TABLE; tbl = &session->fc_slot_table; spin_lock(&tbl->slot_tbl_lock); @@ -550,7 +547,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session, dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); res->sr_session = session; - res->sr_slotid = slotid; + res->sr_slot = slot; res->sr_renewal_time = jiffies; res->sr_status_flags = 0; /* @@ -576,8 +573,9 @@ int nfs4_setup_sequence(const struct nfs_server *server, goto out; } - dprintk("--> %s clp %p session %p sr_slotid %d\n", - __func__, session->clp, session, res->sr_slotid); + dprintk("--> %s clp %p session %p sr_slot %ld\n", + __func__, session->clp, session, res->sr_slot ? + res->sr_slot - session->fc_slot_table.slots : -1); ret = nfs41_setup_sequence(session, args, res, cache_reply, task); @@ -650,7 +648,7 @@ static int nfs4_call_sync_sequence(struct nfs_server *server, .callback_data = &data }; - res->sr_slotid = NFS4_MAX_SLOT_TABLE; + res->sr_slot = NULL; if (privileged) task_setup.callback_ops = &nfs41_call_priv_sync_ops; task = rpc_run_task(&task_setup); @@ -735,7 +733,6 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) p->o_res.server = p->o_arg.server; nfs_fattr_init(&p->f_attr); nfs_fattr_init(&p->dir_attr); - p->o_res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; } static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, @@ -1120,6 +1117,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * clear_bit(NFS_DELEGATED_STATE, &state->flags); smp_rmb(); if (state->n_rdwr != 0) { + clear_bit(NFS_O_RDWR_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); if (ret != 0) return ret; @@ -1127,6 +1125,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * return -ESTALE; } if (state->n_wronly != 0) { + clear_bit(NFS_O_WRONLY_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); if (ret != 0) return ret; @@ -1134,6 +1133,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * return -ESTALE; } if (state->n_rdonly != 0) { + clear_bit(NFS_O_RDONLY_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); if (ret != 0) return ret; @@ -1188,7 +1188,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state int err; do { err = _nfs4_do_open_reclaim(ctx, state); - if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) + if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); } while (exception.retry); @@ -1258,6 +1258,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: nfs4_state_mark_reclaim_nograce(server->nfs_client, state); + case -EKEYEXPIRED: + /* + * User RPCSEC_GSS context has expired. + * We cannot recover this stateid now, so + * skip it and allow recovery thread to + * proceed. + */ case -ENOMEM: err = 0; goto out; @@ -1605,7 +1612,6 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state goto out; case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: - case -EKEYEXPIRED: nfs4_handle_exception(server, err, &exception); err = 0; } @@ -1975,7 +1981,6 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; - calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; path_get(path); calldata->path = *path; @@ -1998,120 +2003,17 @@ out: return status; } -static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode) +static struct inode * +nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr) { - struct file *filp; - int ret; - - /* If the open_intent is for execute, we have an extra check to make */ - if (fmode & FMODE_EXEC) { - ret = nfs_may_open(state->inode, - state->owner->so_cred, - nd->intent.open.flags); - if (ret < 0) - goto out_close; - } - filp = lookup_instantiate_filp(nd, path->dentry, NULL); - if (!IS_ERR(filp)) { - struct nfs_open_context *ctx; - ctx = nfs_file_open_context(filp); - ctx->state = state; - return 0; - } - ret = PTR_ERR(filp); -out_close: - nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE)); - return ret; -} - -struct dentry * -nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) -{ - struct path path = { - .mnt = nd->path.mnt, - .dentry = dentry, - }; - struct dentry *parent; - struct iattr attr; - struct rpc_cred *cred; struct nfs4_state *state; - struct dentry *res; - int open_flags = nd->intent.open.flags; - fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); - - if (nd->flags & LOOKUP_CREATE) { - attr.ia_mode = nd->intent.open.create_mode; - attr.ia_valid = ATTR_MODE; - if (!IS_POSIXACL(dir)) - attr.ia_mode &= ~current_umask(); - } else { - open_flags &= ~O_EXCL; - attr.ia_valid = 0; - BUG_ON(open_flags & O_CREAT); - } - cred = rpc_lookup_cred(); - if (IS_ERR(cred)) - return (struct dentry *)cred; - parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ - nfs_block_sillyrename(parent); - state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred); - put_rpccred(cred); - if (IS_ERR(state)) { - if (PTR_ERR(state) == -ENOENT) { - d_add(dentry, NULL); - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - } - nfs_unblock_sillyrename(parent); - return (struct dentry *)state; - } - res = d_add_unique(dentry, igrab(state->inode)); - if (res != NULL) - path.dentry = res; - nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir)); - nfs_unblock_sillyrename(parent); - nfs4_intent_set_file(nd, &path, state, fmode); - return res; -} - -int -nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd) -{ - struct path path = { - .mnt = nd->path.mnt, - .dentry = dentry, - }; - struct rpc_cred *cred; - struct nfs4_state *state; - fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE); - - cred = rpc_lookup_cred(); - if (IS_ERR(cred)) - return PTR_ERR(cred); - state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred); - put_rpccred(cred); - if (IS_ERR(state)) { - switch (PTR_ERR(state)) { - case -EPERM: - case -EACCES: - case -EDQUOT: - case -ENOSPC: - case -EROFS: - return PTR_ERR(state); - default: - goto out_drop; - } - } - if (state->inode == dentry->d_inode) { - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - nfs4_intent_set_file(nd, &path, state, fmode); - return 1; - } - nfs4_close_sync(&path, state, fmode); -out_drop: - d_drop(dentry); - return 0; + state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred); + if (IS_ERR(state)) + return ERR_CAST(state); + ctx->state = state; + return igrab(state->inode); } static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) @@ -2568,36 +2470,34 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page, static int nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags, struct nameidata *nd) + int flags, struct nfs_open_context *ctx) { - struct path path = { - .mnt = nd->path.mnt, + struct path my_path = { .dentry = dentry, }; + struct path *path = &my_path; struct nfs4_state *state; - struct rpc_cred *cred; - fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE); + struct rpc_cred *cred = NULL; + fmode_t fmode = 0; int status = 0; - cred = rpc_lookup_cred(); - if (IS_ERR(cred)) { - status = PTR_ERR(cred); - goto out; + if (ctx != NULL) { + cred = ctx->cred; + path = &ctx->path; + fmode = ctx->mode; } - state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred); + state = nfs4_do_open(dir, path, fmode, flags, sattr, cred); d_drop(dentry); if (IS_ERR(state)) { status = PTR_ERR(state); - goto out_putcred; + goto out; } d_add(dentry, igrab(state->inode)); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) - status = nfs4_intent_set_file(nd, &path, state, fmode); + if (ctx != NULL) + ctx->state = state; else - nfs4_close_sync(&path, state, fmode); -out_putcred: - put_rpccred(cred); + nfs4_close_sync(path, state, fmode); out: return status; } @@ -2655,6 +2555,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) args->bitmask = server->cache_consistency_bitmask; res->server = server; + res->seq_res.sr_slot = NULL; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; } @@ -2671,18 +2572,46 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) return 1; } +static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_renameargs *arg = msg->rpc_argp; + struct nfs_renameres *res = msg->rpc_resp; + + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; + arg->bitmask = server->attr_bitmask; + res->server = server; +} + +static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + struct nfs_renameres *res = task->tk_msg.rpc_resp; + + if (!nfs4_sequence_done(task, &res->seq_res)) + return 0; + if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + return 0; + + update_changeattr(old_dir, &res->old_cinfo); + nfs_post_op_update_inode(old_dir, res->old_fattr); + update_changeattr(new_dir, &res->new_cinfo); + nfs_post_op_update_inode(new_dir, res->new_fattr); + return 1; +} + static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, struct inode *new_dir, struct qstr *new_name) { struct nfs_server *server = NFS_SERVER(old_dir); - struct nfs4_rename_arg arg = { + struct nfs_renameargs arg = { .old_dir = NFS_FH(old_dir), .new_dir = NFS_FH(new_dir), .old_name = old_name, .new_name = new_name, .bitmask = server->attr_bitmask, }; - struct nfs4_rename_res res = { + struct nfs_renameres res = { .server = server, }; struct rpc_message msg = { @@ -2896,15 +2825,16 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, } static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page *page, unsigned int count, int plus) + u64 cookie, struct page **pages, unsigned int count, int plus) { struct inode *dir = dentry->d_inode; struct nfs4_readdir_arg args = { .fh = NFS_FH(dir), - .pages = &page, + .pages = pages, .pgbase = 0, .count = count, .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, + .plus = plus, }; struct nfs4_readdir_res res; struct rpc_message msg = { @@ -2932,14 +2862,14 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, } static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page *page, unsigned int count, int plus) + u64 cookie, struct page **pages, unsigned int count, int plus) { struct nfs4_exception exception = { }; int err; do { err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), _nfs4_proc_readdir(dentry, cred, cookie, - page, count, plus), + pages, count, plus), &exception); } while (exception.retry); return err; @@ -3490,9 +3420,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, nfs4_state_mark_reclaim_nograce(clp, state); goto do_state_recovery; case -NFS4ERR_STALE_STATEID: - if (state == NULL) - break; - nfs4_state_mark_reclaim_reboot(clp, state); case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: goto do_state_recovery; @@ -3626,7 +3553,6 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, case -NFS4ERR_RESOURCE: /* The IBM lawyers misread another document! */ case -NFS4ERR_DELAY: - case -EKEYEXPIRED: err = nfs4_delay(clp->cl_rpcclient, &timeout); } } while (err == 0); @@ -3721,7 +3647,6 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co memcpy(&data->stateid, stateid, sizeof(data->stateid)); data->res.fattr = &data->fattr; data->res.server = server; - data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; nfs_fattr_init(data->res.fattr); data->timestamp = jiffies; data->rpc_status = 0; @@ -3874,7 +3799,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->arg.fl = &p->fl; p->arg.seqid = seqid; p->res.seqid = seqid; - p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; p->arg.stateid = &lsp->ls_stateid; p->lsp = lsp; atomic_inc(&lsp->ls_count); @@ -4054,7 +3978,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; p->arg.lock_owner.id = lsp->ls_id.id; p->res.lock_seqid = p->arg.lock_seqid; - p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; p->lsp = lsp; p->server = server; atomic_inc(&lsp->ls_count); @@ -4241,7 +4164,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); - if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) + if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); } while (exception.retry); @@ -4266,7 +4189,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request goto out; case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: - case -EKEYEXPIRED: nfs4_handle_exception(server, err, &exception); err = 0; } @@ -4412,13 +4334,21 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) nfs4_state_mark_reclaim_nograce(server->nfs_client, state); err = 0; goto out; + case -EKEYEXPIRED: + /* + * User RPCSEC_GSS context has expired. + * We cannot recover this stateid now, so + * skip it and allow recovery thread to + * proceed. + */ + err = 0; + goto out; case -ENOMEM: case -NFS4ERR_DENIED: /* kill_proc(fl->fl_pid, SIGLOST, 1); */ err = 0; goto out; case -NFS4ERR_DELAY: - case -EKEYEXPIRED: break; } err = nfs4_handle_exception(server, err, &exception); @@ -4647,7 +4577,6 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case -NFS4ERR_DELAY: case -NFS4ERR_GRACE: - case -EKEYEXPIRED: dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); rpc_delay(task, NFS4_POLL_RETRY_MIN); task->tk_status = 0; @@ -4687,7 +4616,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) }; int status; - res.lr_seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; dprintk("--> %s\n", __func__); task = rpc_run_task(&task_setup); @@ -4914,49 +4842,56 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->bc_attrs.max_reqs); } -static int _verify_channel_attr(char *chan, char *attr_name, u32 sent, u32 rcvd) +static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) { - if (rcvd <= sent) - return 0; - printk(KERN_WARNING "%s: Session INVALID: %s channel %s increased. " - "sent=%u rcvd=%u\n", __func__, chan, attr_name, sent, rcvd); - return -EINVAL; + struct nfs4_channel_attrs *sent = &args->fc_attrs; + struct nfs4_channel_attrs *rcvd = &session->fc_attrs; + + if (rcvd->headerpadsz > sent->headerpadsz) + return -EINVAL; + if (rcvd->max_resp_sz > sent->max_resp_sz) + return -EINVAL; + /* + * Our requested max_ops is the minimum we need; we're not + * prepared to break up compounds into smaller pieces than that. + * So, no point even trying to continue if the server won't + * cooperate: + */ + if (rcvd->max_ops < sent->max_ops) + return -EINVAL; + if (rcvd->max_reqs == 0) + return -EINVAL; + return 0; } -#define _verify_fore_channel_attr(_name_) \ - _verify_channel_attr("fore", #_name_, \ - args->fc_attrs._name_, \ - session->fc_attrs._name_) +static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) +{ + struct nfs4_channel_attrs *sent = &args->bc_attrs; + struct nfs4_channel_attrs *rcvd = &session->bc_attrs; -#define _verify_back_channel_attr(_name_) \ - _verify_channel_attr("back", #_name_, \ - args->bc_attrs._name_, \ - session->bc_attrs._name_) + if (rcvd->max_rqst_sz > sent->max_rqst_sz) + return -EINVAL; + if (rcvd->max_resp_sz < sent->max_resp_sz) + return -EINVAL; + if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) + return -EINVAL; + /* These would render the backchannel useless: */ + if (rcvd->max_ops == 0) + return -EINVAL; + if (rcvd->max_reqs == 0) + return -EINVAL; + return 0; +} -/* - * The server is not allowed to increase the fore channel header pad size, - * maximum response size, or maximum number of operations. - * - * The back channel attributes are only negotiatied down: We send what the - * (back channel) server insists upon. - */ static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) { - int ret = 0; - - ret |= _verify_fore_channel_attr(headerpadsz); - ret |= _verify_fore_channel_attr(max_resp_sz); - ret |= _verify_fore_channel_attr(max_ops); - - ret |= _verify_back_channel_attr(headerpadsz); - ret |= _verify_back_channel_attr(max_rqst_sz); - ret |= _verify_back_channel_attr(max_resp_sz); - ret |= _verify_back_channel_attr(max_resp_sz_cached); - ret |= _verify_back_channel_attr(max_ops); - ret |= _verify_back_channel_attr(max_reqs); + int ret; - return ret; + ret = nfs4_verify_fore_channel_attrs(args, session); + if (ret) + return ret; + return nfs4_verify_back_channel_attrs(args, session); } static int _nfs4_proc_create_session(struct nfs_client *clp) @@ -5111,7 +5046,6 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client { switch(task->tk_status) { case -NFS4ERR_DELAY: - case -EKEYEXPIRED: rpc_delay(task, NFS4_POLL_RETRY_MAX); return -EAGAIN; default: @@ -5180,12 +5114,11 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_ if (!atomic_inc_not_zero(&clp->cl_count)) return ERR_PTR(-EIO); - calldata = kmalloc(sizeof(*calldata), GFP_NOFS); + calldata = kzalloc(sizeof(*calldata), GFP_NOFS); if (calldata == NULL) { nfs_put_client(clp); return ERR_PTR(-ENOMEM); } - calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE; msg.rpc_argp = &calldata->args; msg.rpc_resp = &calldata->res; calldata->clp = clp; @@ -5254,7 +5187,6 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf case -NFS4ERR_WRONG_CRED: /* What to do here? */ break; case -NFS4ERR_DELAY: - case -EKEYEXPIRED: rpc_delay(task, NFS4_POLL_RETRY_MAX); return -EAGAIN; default: @@ -5317,7 +5249,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp) goto out; calldata->clp = clp; calldata->arg.one_fs = 0; - calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; msg.rpc_argp = &calldata->arg; msg.rpc_resp = &calldata->res; @@ -5333,6 +5264,147 @@ out: dprintk("<-- %s status=%d\n", __func__, status); return status; } + +static void +nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + struct inode *ino = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(ino); + + dprintk("--> %s\n", __func__); + if (nfs4_setup_sequence(server, &lgp->args.seq_args, + &lgp->res.seq_res, 0, task)) + return; + rpc_call_start(task); +} + +static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + + dprintk("--> %s\n", __func__); + + if (!nfs4_sequence_done(task, &lgp->res.seq_res)) + return; + + switch (task->tk_status) { + case 0: + break; + case -NFS4ERR_LAYOUTTRYLATER: + case -NFS4ERR_RECALLCONFLICT: + task->tk_status = -NFS4ERR_DELAY; + /* Fall through */ + default: + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + } + lgp->status = task->tk_status; + dprintk("<-- %s\n", __func__); +} + +static void nfs4_layoutget_release(void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + + dprintk("--> %s\n", __func__); + put_layout_hdr(lgp->args.inode); + if (lgp->res.layout.buf != NULL) + free_page((unsigned long) lgp->res.layout.buf); + put_nfs_open_context(lgp->args.ctx); + kfree(calldata); + dprintk("<-- %s\n", __func__); +} + +static const struct rpc_call_ops nfs4_layoutget_call_ops = { + .rpc_call_prepare = nfs4_layoutget_prepare, + .rpc_call_done = nfs4_layoutget_done, + .rpc_release = nfs4_layoutget_release, +}; + +int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) +{ + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], + .rpc_argp = &lgp->args, + .rpc_resp = &lgp->res, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_layoutget_call_ops, + .callback_data = lgp, + .flags = RPC_TASK_ASYNC, + }; + int status = 0; + + dprintk("--> %s\n", __func__); + + lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); + if (lgp->res.layout.buf == NULL) { + nfs4_layoutget_release(lgp); + return -ENOMEM; + } + + lgp->res.seq_res.sr_slot = NULL; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) + goto out; + status = lgp->status; + if (status != 0) + goto out; + status = pnfs_layout_process(lgp); +out: + rpc_put_task(task); + dprintk("<-- %s status=%d\n", __func__, status); + return status; +} + +static int +_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) +{ + struct nfs4_getdeviceinfo_args args = { + .pdev = pdev, + }; + struct nfs4_getdeviceinfo_res res = { + .pdev = pdev, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + dprintk("--> %s\n", __func__); + status = nfs4_call_sync(server, &msg, &args, &res, 0); + dprintk("<-- %s status=%d\n", __func__, status); + + return status; +} + +int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(server, + _nfs4_proc_getdeviceinfo(server, pdev), + &exception); + } while (exception.retry); + return err; +} +EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo); + #endif /* CONFIG_NFS_V4_1 */ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { @@ -5443,6 +5515,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .unlink_setup = nfs4_proc_unlink_setup, .unlink_done = nfs4_proc_unlink_done, .rename = nfs4_proc_rename, + .rename_setup = nfs4_proc_rename_setup, + .rename_done = nfs4_proc_rename_done, .link = nfs4_proc_link, .symlink = nfs4_proc_symlink, .mkdir = nfs4_proc_mkdir, @@ -5463,6 +5537,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .lock = nfs4_proc_lock, .clear_acl_cache = nfs4_zap_acl_attr, .close_context = nfs4_close_context, + .open_context = nfs4_atomic_open, }; /* diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 96524c5dca6b..f575a3126737 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -46,6 +46,7 @@ #include <linux/kthread.h> #include <linux/module.h> #include <linux/random.h> +#include <linux/ratelimit.h> #include <linux/workqueue.h> #include <linux/bitops.h> @@ -53,6 +54,7 @@ #include "callback.h" #include "delegation.h" #include "internal.h" +#include "pnfs.h" #define OPENOWNER_POOL_SIZE 8 @@ -1063,6 +1065,14 @@ restart: /* Mark the file as being 'closed' */ state->state = 0; break; + case -EKEYEXPIRED: + /* + * User RPCSEC_GSS context has expired. + * We cannot recover this stateid now, so + * skip it and allow recovery thread to + * proceed. + */ + break; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_BAD_STATEID: @@ -1138,16 +1148,14 @@ static void nfs4_reclaim_complete(struct nfs_client *clp, (void)ops->reclaim_complete(clp); } -static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) +static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) { struct nfs4_state_owner *sp; struct rb_node *pos; struct nfs4_state *state; if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) - return; - - nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); + return 0; for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); @@ -1161,6 +1169,14 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) } nfs_delegation_reap_unclaimed(clp); + return 1; +} + +static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) +{ + if (!nfs4_state_clear_reclaim_reboot(clp)) + return; + nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); } static void nfs_delegation_clear_all(struct nfs_client *clp) @@ -1175,6 +1191,14 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); } +static void nfs4_warn_keyexpired(const char *s) +{ + printk_ratelimited(KERN_WARNING "Error: state manager" + " encountered RPCSEC_GSS session" + " expired against NFSv4 server %s.\n", + s); +} + static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) { switch (error) { @@ -1187,7 +1211,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_LEASE_MOVED: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_end_reclaim_reboot(clp); + nfs4_state_clear_reclaim_reboot(clp); nfs4_state_start_reclaim_reboot(clp); break; case -NFS4ERR_EXPIRED: @@ -1204,6 +1228,10 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); /* Zero session reset errors */ return 0; + case -EKEYEXPIRED: + /* Nothing we can do */ + nfs4_warn_keyexpired(clp->cl_hostname); + return 0; } return error; } @@ -1414,9 +1442,10 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status) case -NFS4ERR_DELAY: case -NFS4ERR_CLID_INUSE: case -EAGAIN: - case -EKEYEXPIRED: break; + case -EKEYEXPIRED: + nfs4_warn_keyexpired(clp->cl_hostname); case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery * in nfs4_exchange_id */ default: @@ -1447,6 +1476,7 @@ static void nfs4_state_manager(struct nfs_client *clp) } clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + pnfs_destroy_all_layouts(clp); } if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 08ef91291132..f313c4cce7e4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -52,6 +52,7 @@ #include <linux/nfs_idmap.h> #include "nfs4_fs.h" #include "internal.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_XDR @@ -310,6 +311,19 @@ static int nfs4_stat_to_errno(int); XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) +#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ + XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) +#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ + 1 /* layout type */ + \ + 1 /* opaque devaddr4 length */ + \ + /* devaddr4 payload is read into page */ \ + 1 /* notification bitmap length */ + \ + 1 /* notification bitmap */) +#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ + encode_stateid_maxsz) +#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ + decode_stateid_maxsz + \ + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 @@ -699,6 +713,20 @@ static int nfs4_stat_to_errno(int); #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_reclaim_complete_maxsz) +#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_getdeviceinfo_maxsz) +#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_getdeviceinfo_maxsz) +#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_layoutget_maxsz) +#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutget_maxsz) const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + compound_encode_hdr_maxsz + @@ -816,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const if (iap->ia_valid & ATTR_MODE) len += 4; if (iap->ia_valid & ATTR_UID) { - owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name); + owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ); if (owner_namelen < 0) { dprintk("nfs: couldn't resolve uid %d to string\n", iap->ia_uid); @@ -828,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len += 4 + (XDR_QUADLEN(owner_namelen) << 2); } if (iap->ia_valid & ATTR_GID) { - owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group); + owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ); if (owner_grouplen < 0) { dprintk("nfs: couldn't resolve gid %d to string\n", iap->ia_gid); @@ -1385,24 +1413,35 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) { - uint32_t attrs[2] = { - FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID, - FATTR4_WORD1_MOUNTED_ON_FILEID, - }; + uint32_t attrs[2] = {0, 0}; + uint32_t dircount = readdir->count >> 1; __be32 *p; + if (readdir->plus) { + attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| + FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE; + attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER| + FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV| + FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| + FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + dircount >>= 1; + } + attrs[0] |= FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID; + attrs[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; + /* Switch to mounted_on_fileid if the server supports it */ + if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) + attrs[0] &= ~FATTR4_WORD0_FILEID; + else + attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; + p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); *p++ = cpu_to_be32(OP_READDIR); p = xdr_encode_hyper(p, readdir->cookie); p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); - *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */ + *p++ = cpu_to_be32(dircount); *p++ = cpu_to_be32(readdir->count); *p++ = cpu_to_be32(2); - /* Switch to mounted_on_fileid if the server supports it */ - if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) - attrs[0] &= ~FATTR4_WORD0_FILEID; - else - attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; + *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); hdr->nops++; @@ -1726,6 +1765,58 @@ static void encode_sequence(struct xdr_stream *xdr, #endif /* CONFIG_NFS_V4_1 */ } +#ifdef CONFIG_NFS_V4_1 +static void +encode_getdeviceinfo(struct xdr_stream *xdr, + const struct nfs4_getdeviceinfo_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(OP_GETDEVICEINFO); + p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(args->pdev->layout_type); + *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */ + *p++ = cpu_to_be32(0); /* bitmap length 0 */ + hdr->nops++; + hdr->replen += decode_getdeviceinfo_maxsz; +} + +static void +encode_layoutget(struct xdr_stream *xdr, + const struct nfs4_layoutget_args *args, + struct compound_hdr *hdr) +{ + nfs4_stateid stateid; + __be32 *p; + + p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_LAYOUTGET); + *p++ = cpu_to_be32(0); /* Signal layout available */ + *p++ = cpu_to_be32(args->type); + *p++ = cpu_to_be32(args->range.iomode); + p = xdr_encode_hyper(p, args->range.offset); + p = xdr_encode_hyper(p, args->range.length); + p = xdr_encode_hyper(p, args->minlength); + pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout, + args->ctx->state); + p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->maxcount); + + dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", + __func__, + args->type, + args->range.iomode, + (unsigned long)args->range.offset, + (unsigned long)args->range.length, + args->maxcount); + hdr->nops++; + hdr->replen += decode_layoutget_maxsz; +} +#endif /* CONFIG_NFS_V4_1 */ + /* * END OF "GENERIC" ENCODE ROUTINES. */ @@ -1823,7 +1914,7 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs /* * Encode RENAME request */ -static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs4_rename_arg *args) +static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args) { struct xdr_stream xdr; struct compound_hdr hdr = { @@ -2543,6 +2634,51 @@ static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p, return 0; } +/* + * Encode GETDEVICEINFO request + */ +static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, + struct nfs4_getdeviceinfo_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_sequence(&xdr, &args->seq_args, &hdr); + encode_getdeviceinfo(&xdr, args, &hdr); + + /* set up reply kvec. Subtract notification bitmap max size (2) + * so that notification bitmap is put in xdr_buf tail */ + xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2, + args->pdev->pages, args->pdev->pgbase, + args->pdev->pglen); + + encode_nops(&hdr); + return 0; +} + +/* + * Encode LAYOUTGET request + */ +static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p, + struct nfs4_layoutget_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_sequence(&xdr, &args->seq_args, &hdr); + encode_putfh(&xdr, NFS_FH(args->inode), &hdr); + encode_layoutget(&xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} #endif /* CONFIG_NFS_V4_1 */ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -2676,7 +2812,10 @@ out_overflow: static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) { if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) { - decode_attr_bitmap(xdr, bitmask); + int ret; + ret = decode_attr_bitmap(xdr, bitmask); + if (unlikely(ret < 0)) + return ret; bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; } else bitmask[0] = bitmask[1] = 0; @@ -2848,6 +2987,56 @@ out_overflow: return -EIO; } +static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap) +{ + __be32 *p; + + if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh) +{ + __be32 *p; + int len; + + if (fh != NULL) + memset(fh, 0, sizeof(*fh)); + + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len > NFS4_FHSIZE) + return -EIO; + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (fh != NULL) { + memcpy(fh->data, p, len); + fh->size = len; + } + bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) { __be32 *p; @@ -3521,6 +3710,24 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s return status; } +static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap, + struct timespec *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) { + status = decode_attr_time(xdr, time); + bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA; + } + dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec, + (long)time->tv_nsec); + return status; +} + static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) { int status = 0; @@ -3744,29 +3951,14 @@ xdr_error: return status; } -static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, +static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs_fattr *fattr, struct nfs_fh *fh, const struct nfs_server *server, int may_sleep) { - __be32 *savep; - uint32_t attrlen, - bitmap[2] = {0}, - type; int status; umode_t fmode = 0; uint64_t fileid; - - status = decode_op_hdr(xdr, OP_GETATTR); - if (status < 0) - goto xdr_error; - - status = decode_attr_bitmap(xdr, bitmap); - if (status < 0) - goto xdr_error; - - status = decode_attr_length(xdr, &attrlen, &savep); - if (status < 0) - goto xdr_error; - + uint32_t type; status = decode_attr_type(xdr, bitmap, &type); if (status < 0) @@ -3792,6 +3984,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, goto xdr_error; fattr->valid |= status; + status = decode_attr_error(xdr, bitmap); + if (status < 0) + goto xdr_error; + + status = decode_attr_filehandle(xdr, bitmap, fh); + if (status < 0) + goto xdr_error; + status = decode_attr_fileid(xdr, bitmap, &fattr->fileid); if (status < 0) goto xdr_error; @@ -3862,12 +4062,101 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, fattr->valid |= status; } +xdr_error: + dprintk("%s: xdr returned %d\n", __func__, -status); + return status; +} + +static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct nfs_fh *fh, const struct nfs_server *server, int may_sleep) +{ + __be32 *savep; + uint32_t attrlen, + bitmap[2] = {0}; + int status; + + status = decode_op_hdr(xdr, OP_GETATTR); + if (status < 0) + goto xdr_error; + + status = decode_attr_bitmap(xdr, bitmap); + if (status < 0) + goto xdr_error; + + status = decode_attr_length(xdr, &attrlen, &savep); + if (status < 0) + goto xdr_error; + + status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep); + if (status < 0) + goto xdr_error; + status = verify_attr_len(xdr, savep, attrlen); xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); return status; } +static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, + const struct nfs_server *server, int may_sleep) +{ + return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep); +} + +/* + * Decode potentially multiple layout types. Currently we only support + * one layout driver per file system. + */ +static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, + uint32_t *layouttype) +{ + uint32_t *p; + int num; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + num = be32_to_cpup(p); + + /* pNFS is not supported by the underlying file system */ + if (num == 0) { + *layouttype = 0; + return 0; + } + if (num > 1) + printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " + "per filesystem not supported\n", __func__); + + /* Decode and set first layout type, move xdr->p past unused types */ + p = xdr_inline_decode(xdr, num * 4); + if (unlikely(!p)) + goto out_overflow; + *layouttype = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * The type of file system exported. + * Note we must ensure that layouttype is set in any non-error case. + */ +static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, + uint32_t *layouttype) +{ + int status = 0; + + dprintk("%s: bitmap is %x\n", __func__, bitmap[1]); + if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) + return -EIO; + if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) { + status = decode_first_pnfs_layout_type(xdr, layouttype); + bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; + } else + *layouttype = 0; + return status; +} static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) { @@ -3894,6 +4183,12 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) goto xdr_error; fsinfo->wtpref = fsinfo->wtmax; + status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta); + if (status != 0) + goto xdr_error; + status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); + if (status != 0) + goto xdr_error; status = verify_attr_len(xdr, savep, attrlen); xdr_error: @@ -3950,13 +4245,13 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) __be32 *p; uint32_t namelen, type; - p = xdr_inline_decode(xdr, 32); + p = xdr_inline_decode(xdr, 32); /* read 32 bytes */ if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, &offset); + p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */ p = xdr_decode_hyper(p, &length); - type = be32_to_cpup(p++); - if (fl != NULL) { + type = be32_to_cpup(p++); /* 4 byte read */ + if (fl != NULL) { /* manipulate file lock */ fl->fl_start = (loff_t)offset; fl->fl_end = fl->fl_start + (loff_t)length - 1; if (length == ~(uint64_t)0) @@ -3966,9 +4261,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) fl->fl_type = F_RDLCK; fl->fl_pid = 0; } - p = xdr_decode_hyper(p, &clientid); - namelen = be32_to_cpup(p); - p = xdr_inline_decode(xdr, namelen); + p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */ + namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */ + p = xdr_inline_decode(xdr, namelen); /* variable size field */ if (likely(p)) return -NFS4ERR_DENIED; out_overflow: @@ -4200,12 +4495,9 @@ out_overflow: static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) { struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct page *page = *rcvbuf->pages; struct kvec *iov = rcvbuf->head; size_t hdrlen; u32 recvd, pglen = rcvbuf->page_len; - __be32 *end, *entry, *p, *kaddr; - unsigned int nr = 0; int status; status = decode_op_hdr(xdr, OP_READDIR); @@ -4225,71 +4517,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n pglen = recvd; xdr_read_pages(xdr, pglen); - BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE); - kaddr = p = kmap_atomic(page, KM_USER0); - end = p + ((pglen + readdir->pgbase) >> 2); - entry = p; - - /* Make sure the packet actually has a value_follows and EOF entry */ - if ((entry + 1) > end) - goto short_pkt; - - for (; *p++; nr++) { - u32 len, attrlen, xlen; - if (end - p < 3) - goto short_pkt; - dprintk("cookie = %Lu, ", *((unsigned long long *)p)); - p += 2; /* cookie */ - len = ntohl(*p++); /* filename length */ - if (len > NFS4_MAXNAMLEN) { - dprintk("NFS: giant filename in readdir (len 0x%x)\n", - len); - goto err_unmap; - } - xlen = XDR_QUADLEN(len); - if (end - p < xlen + 1) - goto short_pkt; - dprintk("filename = %*s\n", len, (char *)p); - p += xlen; - len = ntohl(*p++); /* bitmap length */ - if (end - p < len + 1) - goto short_pkt; - p += len; - attrlen = XDR_QUADLEN(ntohl(*p++)); - if (end - p < attrlen + 2) - goto short_pkt; - p += attrlen; /* attributes */ - entry = p; - } - /* - * Apparently some server sends responses that are a valid size, but - * contain no entries, and have value_follows==0 and EOF==0. For - * those, just set the EOF marker. - */ - if (!nr && entry[1] == 0) { - dprintk("NFS: readdir reply truncated!\n"); - entry[1] = 1; - } -out: - kunmap_atomic(kaddr, KM_USER0); + return 0; -short_pkt: - /* - * When we get a short packet there are 2 possibilities. We can - * return an error, or fix up the response to look like a valid - * response and return what we have so far. If there are no - * entries and the packet was short, then return -EIO. If there - * are valid entries in the response, return them and pretend that - * the call was successful, but incomplete. The caller can retry the - * readdir starting at the last cookie. - */ - dprintk("%s: short packet at entry %d\n", __func__, nr); - entry[0] = entry[1] = 0; - if (nr) - goto out; -err_unmap: - kunmap_atomic(kaddr, KM_USER0); - return -errno_NFSERR_IO; } static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) @@ -4299,7 +4528,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) size_t hdrlen; u32 len, recvd; __be32 *p; - char *kaddr; int status; status = decode_op_hdr(xdr, OP_READLINK); @@ -4330,9 +4558,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) * and and null-terminate the text (the VFS expects * null-termination). */ - kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0); - kaddr[len+rcvbuf->page_base] = '\0'; - kunmap_atomic(kaddr, KM_USER0); + xdr_terminate_string(rcvbuf, len); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -4668,7 +4894,6 @@ static int decode_sequence(struct xdr_stream *xdr, struct rpc_rqst *rqstp) { #if defined(CONFIG_NFS_V4_1) - struct nfs4_slot *slot; struct nfs4_sessionid id; u32 dummy; int status; @@ -4700,15 +4925,14 @@ static int decode_sequence(struct xdr_stream *xdr, goto out_overflow; /* seqid */ - slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid]; dummy = be32_to_cpup(p++); - if (dummy != slot->seq_nr) { + if (dummy != res->sr_slot->seq_nr) { dprintk("%s Invalid sequence number\n", __func__); goto out_err; } /* slot id */ dummy = be32_to_cpup(p++); - if (dummy != res->sr_slotid) { + if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) { dprintk("%s Invalid slot id\n", __func__); goto out_err; } @@ -4731,6 +4955,134 @@ out_overflow: #endif /* CONFIG_NFS_V4_1 */ } +#if defined(CONFIG_NFS_V4_1) + +static int decode_getdeviceinfo(struct xdr_stream *xdr, + struct pnfs_device *pdev) +{ + __be32 *p; + uint32_t len, type; + int status; + + status = decode_op_hdr(xdr, OP_GETDEVICEINFO); + if (status) { + if (status == -ETOOSMALL) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + pdev->mincount = be32_to_cpup(p); + dprintk("%s: Min count too small. mincnt = %u\n", + __func__, pdev->mincount); + } + return status; + } + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + type = be32_to_cpup(p++); + if (type != pdev->layout_type) { + dprintk("%s: layout mismatch req: %u pdev: %u\n", + __func__, pdev->layout_type, type); + return -EINVAL; + } + /* + * Get the length of the opaque device_addr4. xdr_read_pages places + * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages) + * and places the remaining xdr data in xdr_buf->tail + */ + pdev->mincount = be32_to_cpup(p); + xdr_read_pages(xdr, pdev->mincount); /* include space for the length */ + + /* Parse notification bitmap, verifying that it is zero. */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len) { + int i; + + p = xdr_inline_decode(xdr, 4 * len); + if (unlikely(!p)) + goto out_overflow; + for (i = 0; i < len; i++, p++) { + if (be32_to_cpup(p)) { + dprintk("%s: notifications not supported\n", + __func__); + return -EIO; + } + } + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs4_layoutget_res *res) +{ + __be32 *p; + int status; + u32 layout_count; + + status = decode_op_hdr(xdr, OP_LAYOUTGET); + if (status) + return status; + p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); + if (unlikely(!p)) + goto out_overflow; + res->return_on_close = be32_to_cpup(p++); + p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE); + layout_count = be32_to_cpup(p); + if (!layout_count) { + dprintk("%s: server responded with empty layout array\n", + __func__); + return -EINVAL; + } + + p = xdr_inline_decode(xdr, 24); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &res->range.offset); + p = xdr_decode_hyper(p, &res->range.length); + res->range.iomode = be32_to_cpup(p++); + res->type = be32_to_cpup(p++); + + status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p); + if (unlikely(status)) + return status; + + dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", + __func__, + (unsigned long)res->range.offset, + (unsigned long)res->range.length, + res->range.iomode, + res->type, + res->layout.len); + + /* nfs4_proc_layoutget allocated a single page */ + if (res->layout.len > PAGE_SIZE) + return -ENOMEM; + memcpy(res->layout.buf, p, res->layout.len); + + if (layout_count > 1) { + /* We only handle a length one array at the moment. Any + * further entries are just ignored. Note that this means + * the client may see a response that is less than the + * minimum it requested. + */ + dprintk("%s: server responded with %d layouts, dropping tail\n", + __func__, layout_count); + } + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} +#endif /* CONFIG_NFS_V4_1 */ + /* * END OF "GENERIC" DECODE ROUTINES. */ @@ -4873,7 +5225,7 @@ out: /* * Decode RENAME response */ -static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_rename_res *res) +static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -5758,25 +6110,84 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p, status = decode_reclaim_complete(&xdr, (void *)NULL); return status; } + +/* + * Decode GETDEVINFO response + */ +static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs4_getdeviceinfo_res *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status != 0) + goto out; + status = decode_sequence(&xdr, &res->seq_res, rqstp); + if (status != 0) + goto out; + status = decode_getdeviceinfo(&xdr, res->pdev); +out: + return status; +} + +/* + * Decode LAYOUTGET response + */ +static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs4_layoutget_res *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_sequence(&xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_layoutget(&xdr, rqstp, res); +out: + return status; +} #endif /* CONFIG_NFS_V4_1 */ -__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) +__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + struct nfs_server *server, int plus) { uint32_t bitmap[2] = {0}; uint32_t len; - - if (!*p++) { - if (!*p) + __be32 *p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (!ntohl(*p++)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (!ntohl(*p++)) return ERR_PTR(-EAGAIN); entry->eof = 1; return ERR_PTR(-EBADCOOKIE); } + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; entry->prev_cookie = entry->cookie; p = xdr_decode_hyper(p, &entry->cookie); entry->len = ntohl(*p++); + + p = xdr_inline_decode(xdr, entry->len); + if (unlikely(!p)) + goto out_overflow; entry->name = (const char *) p; - p += XDR_QUADLEN(entry->len); /* * In case the server doesn't return an inode number, @@ -5784,32 +6195,33 @@ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) * since glibc seems to choke on it...) */ entry->ino = 1; + entry->fattr->valid = 0; - len = ntohl(*p++); /* bitmap length */ - if (len-- > 0) { - bitmap[0] = ntohl(*p++); - if (len-- > 0) { - bitmap[1] = ntohl(*p++); - p += len; - } - } - len = XDR_QUADLEN(ntohl(*p++)); /* attribute buffer length */ - if (len > 0) { - if (bitmap[0] & FATTR4_WORD0_RDATTR_ERROR) { - bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; - /* Ignore the return value of rdattr_error for now */ - p++; - len--; - } - if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID) - xdr_decode_hyper(p, &entry->ino); - else if (bitmap[0] == FATTR4_WORD0_FILEID) - xdr_decode_hyper(p, &entry->ino); - p += len; - } + if (decode_attr_bitmap(xdr, bitmap) < 0) + goto out_overflow; + + if (decode_attr_length(xdr, &len, &p) < 0) + goto out_overflow; + + if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0) + goto out_overflow; + if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) + entry->ino = entry->fattr->fileid; + + if (verify_attr_len(xdr, p, len) < 0) + goto out_overflow; + + p = xdr_inline_peek(xdr, 8); + if (p != NULL) + entry->eof = !p[0] && p[1]; + else + entry->eof = 0; - entry->eof = !p[0] && p[1]; return p; + +out_overflow: + print_overflow_msg(__func__, xdr); + return ERR_PTR(-EIO); } /* @@ -5936,6 +6348,8 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SEQUENCE, enc_sequence, dec_sequence), PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), + PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC(LAYOUTGET, enc_layoutget, dec_layoutget), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index df101d9f546a..903908a20023 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -3,9 +3,10 @@ * * Allow an NFS filesystem to be mounted as root. The way this works is: * (1) Use the IP autoconfig mechanism to set local IP addresses and routes. - * (2) Handle RPC negotiation with the system which replied to RARP or - * was reported as a boot server by BOOTP or manually. - * (3) The actual mounting is done later, when init() is running. + * (2) Construct the device string and the options string using DHCP + * option 17 and/or kernel command line options. + * (3) When mount_root() sets up the root file system, pass these strings + * to the NFS client's regular mount interface via sys_mount(). * * * Changes: @@ -65,470 +66,245 @@ * Hua Qin : Support for mounting root file system via * NFS over TCP. * Fabian Frederick: Option parser rebuilt (using parser lib) -*/ + * Chuck Lever : Use super.c's text-based mount option parsing + * Chuck Lever : Add "nfsrootdebug". + */ #include <linux/types.h> #include <linux/string.h> -#include <linux/kernel.h> -#include <linux/time.h> -#include <linux/fs.h> #include <linux/init.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/xprtsock.h> #include <linux/nfs.h> #include <linux/nfs_fs.h> -#include <linux/nfs_mount.h> -#include <linux/in.h> -#include <linux/major.h> #include <linux/utsname.h> -#include <linux/inet.h> #include <linux/root_dev.h> #include <net/ipconfig.h> -#include <linux/parser.h> #include "internal.h" -/* Define this to allow debugging output */ -#undef NFSROOT_DEBUG #define NFSDBG_FACILITY NFSDBG_ROOT -/* Default port to use if server is not running a portmapper */ -#define NFS_MNT_PORT 627 - /* Default path we try to mount. "%s" gets replaced by our IP address */ #define NFS_ROOT "/tftpboot/%s" /* Parameters passed from the kernel command line */ -static char nfs_root_name[256] __initdata = ""; +static char nfs_root_parms[256] __initdata = ""; + +/* Text-based mount options passed to super.c */ +static char nfs_root_options[256] __initdata = ""; /* Address of NFS server */ -static __be32 servaddr __initdata = 0; +static __be32 servaddr __initdata = htonl(INADDR_NONE); /* Name of directory to mount */ -static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, }; - -/* NFS-related data */ -static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */ -static int nfs_port __initdata = 0; /* Port to connect to for NFS */ -static int mount_port __initdata = 0; /* Mount daemon port number */ - - -/*************************************************************************** - - Parsing of options - - ***************************************************************************/ - -enum { - /* Options that take integer arguments */ - Opt_port, Opt_rsize, Opt_wsize, Opt_timeo, Opt_retrans, Opt_acregmin, - Opt_acregmax, Opt_acdirmin, Opt_acdirmax, - /* Options that take no arguments */ - Opt_soft, Opt_hard, Opt_intr, - Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac, - Opt_noac, Opt_lock, Opt_nolock, Opt_v2, Opt_v3, Opt_udp, Opt_tcp, - Opt_acl, Opt_noacl, - /* Error token */ - Opt_err -}; - -static const match_table_t tokens __initconst = { - {Opt_port, "port=%u"}, - {Opt_rsize, "rsize=%u"}, - {Opt_wsize, "wsize=%u"}, - {Opt_timeo, "timeo=%u"}, - {Opt_retrans, "retrans=%u"}, - {Opt_acregmin, "acregmin=%u"}, - {Opt_acregmax, "acregmax=%u"}, - {Opt_acdirmin, "acdirmin=%u"}, - {Opt_acdirmax, "acdirmax=%u"}, - {Opt_soft, "soft"}, - {Opt_hard, "hard"}, - {Opt_intr, "intr"}, - {Opt_nointr, "nointr"}, - {Opt_posix, "posix"}, - {Opt_noposix, "noposix"}, - {Opt_cto, "cto"}, - {Opt_nocto, "nocto"}, - {Opt_ac, "ac"}, - {Opt_noac, "noac"}, - {Opt_lock, "lock"}, - {Opt_nolock, "nolock"}, - {Opt_v2, "nfsvers=2"}, - {Opt_v2, "v2"}, - {Opt_v3, "nfsvers=3"}, - {Opt_v3, "v3"}, - {Opt_udp, "proto=udp"}, - {Opt_udp, "udp"}, - {Opt_tcp, "proto=tcp"}, - {Opt_tcp, "tcp"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_err, NULL} - -}; +static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = ""; +/* server:export path string passed to super.c */ +static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = ""; + +#ifdef RPC_DEBUG /* - * Parse option string. + * When the "nfsrootdebug" kernel command line option is specified, + * enable debugging messages for NFSROOT. */ - -static int __init root_nfs_parse(char *name, char *buf) +static int __init nfs_root_debug(char *__unused) { - - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - - if (!name) - return 1; - - /* Set the NFS remote path */ - p = strsep(&name, ","); - if (p[0] != '\0' && strcmp(p, "default") != 0) - strlcpy(buf, p, NFS_MAXPATHLEN); - - while ((p = strsep (&name, ",")) != NULL) { - int token; - if (!*p) - continue; - token = match_token(p, tokens, args); - - /* %u tokens only. Beware if you add new tokens! */ - if (token < Opt_soft && match_int(&args[0], &option)) - return 0; - switch (token) { - case Opt_port: - nfs_port = option; - break; - case Opt_rsize: - nfs_data.rsize = option; - break; - case Opt_wsize: - nfs_data.wsize = option; - break; - case Opt_timeo: - nfs_data.timeo = option; - break; - case Opt_retrans: - nfs_data.retrans = option; - break; - case Opt_acregmin: - nfs_data.acregmin = option; - break; - case Opt_acregmax: - nfs_data.acregmax = option; - break; - case Opt_acdirmin: - nfs_data.acdirmin = option; - break; - case Opt_acdirmax: - nfs_data.acdirmax = option; - break; - case Opt_soft: - nfs_data.flags |= NFS_MOUNT_SOFT; - break; - case Opt_hard: - nfs_data.flags &= ~NFS_MOUNT_SOFT; - break; - case Opt_intr: - case Opt_nointr: - break; - case Opt_posix: - nfs_data.flags |= NFS_MOUNT_POSIX; - break; - case Opt_noposix: - nfs_data.flags &= ~NFS_MOUNT_POSIX; - break; - case Opt_cto: - nfs_data.flags &= ~NFS_MOUNT_NOCTO; - break; - case Opt_nocto: - nfs_data.flags |= NFS_MOUNT_NOCTO; - break; - case Opt_ac: - nfs_data.flags &= ~NFS_MOUNT_NOAC; - break; - case Opt_noac: - nfs_data.flags |= NFS_MOUNT_NOAC; - break; - case Opt_lock: - nfs_data.flags &= ~NFS_MOUNT_NONLM; - break; - case Opt_nolock: - nfs_data.flags |= NFS_MOUNT_NONLM; - break; - case Opt_v2: - nfs_data.flags &= ~NFS_MOUNT_VER3; - break; - case Opt_v3: - nfs_data.flags |= NFS_MOUNT_VER3; - break; - case Opt_udp: - nfs_data.flags &= ~NFS_MOUNT_TCP; - break; - case Opt_tcp: - nfs_data.flags |= NFS_MOUNT_TCP; - break; - case Opt_acl: - nfs_data.flags &= ~NFS_MOUNT_NOACL; - break; - case Opt_noacl: - nfs_data.flags |= NFS_MOUNT_NOACL; - break; - default: - printk(KERN_WARNING "Root-NFS: unknown " - "option: %s\n", p); - return 0; - } - } - + nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT; return 1; } +__setup("nfsrootdebug", nfs_root_debug); +#endif + /* - * Prepare the NFS data structure and parse all options. + * Parse NFS server and directory information passed on the kernel + * command line. + * + * nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>] + * + * If there is a "%s" token in the <root-dir> string, it is replaced + * by the ASCII-representation of the client's IP address. */ -static int __init root_nfs_name(char *name) +static int __init nfs_root_setup(char *line) { - static char buf[NFS_MAXPATHLEN] __initdata; - char *cp; - - /* Set some default values */ - memset(&nfs_data, 0, sizeof(nfs_data)); - nfs_port = -1; - nfs_data.version = NFS_MOUNT_VERSION; - nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ - nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; - nfs_data.wsize = NFS_DEF_FILE_IO_SIZE; - nfs_data.acregmin = NFS_DEF_ACREGMIN; - nfs_data.acregmax = NFS_DEF_ACREGMAX; - nfs_data.acdirmin = NFS_DEF_ACDIRMIN; - nfs_data.acdirmax = NFS_DEF_ACDIRMAX; - strcpy(buf, NFS_ROOT); - - /* Process options received from the remote server */ - root_nfs_parse(root_server_path, buf); - - /* Override them by options set on kernel command-line */ - root_nfs_parse(name, buf); - - cp = utsname()->nodename; - if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) { - printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n"); - return -1; + ROOT_DEV = Root_NFS; + + if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { + strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms)); + } else { + size_t n = strlen(line) + sizeof(NFS_ROOT) - 1; + if (n >= sizeof(nfs_root_parms)) + line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0'; + sprintf(nfs_root_parms, NFS_ROOT, line); } - sprintf(nfs_export_path, buf, cp); + + /* + * Extract the IP address of the NFS server containing our + * root file system, if one was specified. + * + * Note: root_nfs_parse_addr() removes the server-ip from + * nfs_root_parms, if it exists. + */ + root_server_addr = root_nfs_parse_addr(nfs_root_parms); return 1; } +__setup("nfsroot=", nfs_root_setup); -/* - * Get NFS server address. - */ -static int __init root_nfs_addr(void) +static int __init root_nfs_copy(char *dest, const char *src, + const size_t destlen) { - if ((servaddr = root_server_addr) == htonl(INADDR_NONE)) { - printk(KERN_ERR "Root-NFS: No NFS server available, giving up.\n"); + if (strlcpy(dest, src, destlen) > destlen) return -1; - } + return 0; +} - snprintf(nfs_data.hostname, sizeof(nfs_data.hostname), - "%pI4", &servaddr); +static int __init root_nfs_cat(char *dest, const char *src, + const size_t destlen) +{ + if (strlcat(dest, src, destlen) > destlen) + return -1; return 0; } /* - * Tell the user what's going on. + * Parse out root export path and mount options from + * passed-in string @incoming. + * + * Copy the export path into @exppath. */ -#ifdef NFSROOT_DEBUG -static void __init root_nfs_print(void) +static int __init root_nfs_parse_options(char *incoming, char *exppath, + const size_t exppathlen) { - printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n", - nfs_export_path, nfs_data.hostname); - printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n", - nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans); - printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n", - nfs_data.acregmin, nfs_data.acregmax, - nfs_data.acdirmin, nfs_data.acdirmax); - printk(KERN_NOTICE "Root-NFS: nfsd port = %d, mountd port = %d, flags = %08x\n", - nfs_port, mount_port, nfs_data.flags); -} -#endif - + char *p; -static int __init root_nfs_init(void) -{ -#ifdef NFSROOT_DEBUG - nfs_debug |= NFSDBG_ROOT; -#endif + /* + * Set the NFS remote path + */ + p = strsep(&incoming, ","); + if (*p != '\0' && strcmp(p, "default") != 0) + if (root_nfs_copy(exppath, p, exppathlen)) + return -1; /* - * Decode the root directory path name and NFS options from - * the kernel command line. This has to go here in order to - * be able to use the client IP address for the remote root - * directory (necessary for pure RARP booting). + * @incoming now points to the rest of the string; if it + * contains something, append it to our root options buffer */ - if (root_nfs_name(nfs_root_name) < 0 || - root_nfs_addr() < 0) - return -1; + if (incoming != NULL && *incoming != '\0') + if (root_nfs_cat(nfs_root_options, incoming, + sizeof(nfs_root_options))) + return -1; -#ifdef NFSROOT_DEBUG - root_nfs_print(); -#endif + /* + * Possibly prepare for more options to be appended + */ + if (nfs_root_options[0] != '\0' && + nfs_root_options[strlen(nfs_root_options)] != ',') + if (root_nfs_cat(nfs_root_options, ",", + sizeof(nfs_root_options))) + return -1; return 0; } - /* - * Parse NFS server and directory information passed on the kernel - * command line. + * Decode the export directory path name and NFS options from + * the kernel command line. This has to be done late in order to + * use a dynamically acquired client IP address for the remote + * root directory path. + * + * Returns zero if successful; otherwise -1 is returned. */ -static int __init nfs_root_setup(char *line) +static int __init root_nfs_data(char *cmdline) { - ROOT_DEV = Root_NFS; - if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { - strlcpy(nfs_root_name, line, sizeof(nfs_root_name)); - } else { - int n = strlen(line) + sizeof(NFS_ROOT) - 1; - if (n >= sizeof(nfs_root_name)) - line[sizeof(nfs_root_name) - sizeof(NFS_ROOT) - 2] = '\0'; - sprintf(nfs_root_name, NFS_ROOT, line); + char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; + int len, retval = -1; + char *tmp = NULL; + const size_t tmplen = sizeof(nfs_export_path); + + tmp = kzalloc(tmplen, GFP_KERNEL); + if (tmp == NULL) + goto out_nomem; + strcpy(tmp, NFS_ROOT); + + if (root_server_path[0] != '\0') { + dprintk("Root-NFS: DHCPv4 option 17: %s\n", + root_server_path); + if (root_nfs_parse_options(root_server_path, tmp, tmplen)) + goto out_optionstoolong; } - root_server_addr = root_nfs_parse_addr(nfs_root_name); - return 1; -} - -__setup("nfsroot=", nfs_root_setup); - -/*************************************************************************** - Routines to actually mount the root directory + if (cmdline[0] != '\0') { + dprintk("Root-NFS: nfsroot=%s\n", cmdline); + if (root_nfs_parse_options(cmdline, tmp, tmplen)) + goto out_optionstoolong; + } - ***************************************************************************/ + /* + * Append mandatory options for nfsroot so they override + * what has come before + */ + snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4", + &servaddr); + if (root_nfs_cat(nfs_root_options, addr_option, + sizeof(nfs_root_options))) + goto out_optionstoolong; -/* - * Construct sockaddr_in from address and port number. - */ -static inline void -set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port) -{ - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = addr; - sin->sin_port = port; -} + /* + * Set up nfs_root_device. For NFS mounts, this looks like + * + * server:/path + * + * At this point, utsname()->nodename contains our local + * IP address or hostname, set by ipconfig. If "%s" exists + * in tmp, substitute the nodename, then shovel the whole + * mess into nfs_root_device. + */ + len = snprintf(nfs_export_path, sizeof(nfs_export_path), + tmp, utsname()->nodename); + if (len > (int)sizeof(nfs_export_path)) + goto out_devnametoolong; + len = snprintf(nfs_root_device, sizeof(nfs_root_device), + "%pI4:%s", &servaddr, nfs_export_path); + if (len > (int)sizeof(nfs_root_device)) + goto out_devnametoolong; -/* - * Query server portmapper for the port of a daemon program. - */ -static int __init root_nfs_getport(int program, int version, int proto) -{ - struct sockaddr_in sin; + retval = 0; - printk(KERN_NOTICE "Looking up port of RPC %d/%d on %pI4\n", - program, version, &servaddr); - set_sockaddr(&sin, servaddr, 0); - return rpcb_getport_sync(&sin, program, version, proto); +out: + kfree(tmp); + return retval; +out_nomem: + printk(KERN_ERR "Root-NFS: could not allocate memory\n"); + goto out; +out_optionstoolong: + printk(KERN_ERR "Root-NFS: mount options string too long\n"); + goto out; +out_devnametoolong: + printk(KERN_ERR "Root-NFS: root device name too long.\n"); + goto out; } - -/* - * Use portmapper to find mountd and nfsd port numbers if not overriden - * by the user. Use defaults if portmapper is not available. - * XXX: Is there any nfs server with no portmapper? +/** + * nfs_root_data - Return prepared 'data' for NFSROOT mount + * @root_device: OUT: address of string containing NFSROOT device + * @root_data: OUT: address of string containing NFSROOT mount options + * + * Returns zero and sets @root_device and @root_data if successful, + * otherwise -1 is returned. */ -static int __init root_nfs_ports(void) +int __init nfs_root_data(char **root_device, char **root_data) { - int port; - int nfsd_ver, mountd_ver; - int nfsd_port, mountd_port; - int proto; - - if (nfs_data.flags & NFS_MOUNT_VER3) { - nfsd_ver = NFS3_VERSION; - mountd_ver = NFS_MNT3_VERSION; - nfsd_port = NFS_PORT; - mountd_port = NFS_MNT_PORT; - } else { - nfsd_ver = NFS2_VERSION; - mountd_ver = NFS_MNT_VERSION; - nfsd_port = NFS_PORT; - mountd_port = NFS_MNT_PORT; - } - - proto = (nfs_data.flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP; - - if (nfs_port < 0) { - if ((port = root_nfs_getport(NFS_PROGRAM, nfsd_ver, proto)) < 0) { - printk(KERN_ERR "Root-NFS: Unable to get nfsd port " - "number from server, using default\n"); - port = nfsd_port; - } - nfs_port = port; - dprintk("Root-NFS: Portmapper on server returned %d " - "as nfsd port\n", port); + servaddr = root_server_addr; + if (servaddr == htonl(INADDR_NONE)) { + printk(KERN_ERR "Root-NFS: no NFS server address\n"); + return -1; } - if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) { - printk(KERN_ERR "Root-NFS: Unable to get mountd port " - "number from server, using default\n"); - port = mountd_port; - } - mount_port = port; - dprintk("Root-NFS: mountd port is %d\n", port); + if (root_nfs_data(nfs_root_parms) < 0) + return -1; + *root_device = nfs_root_device; + *root_data = nfs_root_options; return 0; } - - -/* - * Get a file handle from the server for the directory which is to be - * mounted. - */ -static int __init root_nfs_get_handle(void) -{ - struct sockaddr_in sin; - unsigned int auth_flav_len = 0; - struct nfs_mount_request request = { - .sap = (struct sockaddr *)&sin, - .salen = sizeof(sin), - .dirpath = nfs_export_path, - .version = (nfs_data.flags & NFS_MOUNT_VER3) ? - NFS_MNT3_VERSION : NFS_MNT_VERSION, - .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ? - XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP, - .auth_flav_len = &auth_flav_len, - }; - int status = -ENOMEM; - - request.fh = nfs_alloc_fhandle(); - if (!request.fh) - goto out; - set_sockaddr(&sin, servaddr, htons(mount_port)); - status = nfs_mount(&request); - if (status < 0) - printk(KERN_ERR "Root-NFS: Server returned error %d " - "while mounting %s\n", status, nfs_export_path); - else { - nfs_data.root.size = request.fh->size; - memcpy(&nfs_data.root.data, request.fh->data, request.fh->size); - } - nfs_free_fhandle(request.fh); -out: - return status; -} - -/* - * Get the NFS port numbers and file handle, and return the prepared 'data' - * argument for mount() if everything went OK. Return NULL otherwise. - */ -void * __init nfs_root_data(void) -{ - if (root_nfs_init() < 0 - || root_nfs_ports() < 0 - || root_nfs_get_handle() < 0) - return NULL; - set_sockaddr((struct sockaddr_in *) &nfs_data.addr, servaddr, htons(nfs_port)); - return (void*)&nfs_data; -} diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c new file mode 100644 index 000000000000..db773428f95f --- /dev/null +++ b/fs/nfs/pnfs.c @@ -0,0 +1,783 @@ +/* + * pNFS functions to call and manage layout drivers. + * + * Copyright (c) 2002 [year of first publication] + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include <linux/nfs_fs.h> +#include "internal.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS + +/* Locking: + * + * pnfs_spinlock: + * protects pnfs_modules_tbl. + */ +static DEFINE_SPINLOCK(pnfs_spinlock); + +/* + * pnfs_modules_tbl holds all pnfs modules + */ +static LIST_HEAD(pnfs_modules_tbl); + +/* Return the registered pnfs layout driver module matching given id */ +static struct pnfs_layoutdriver_type * +find_pnfs_driver_locked(u32 id) +{ + struct pnfs_layoutdriver_type *local; + + list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) + if (local->id == id) + goto out; + local = NULL; +out: + dprintk("%s: Searching for id %u, found %p\n", __func__, id, local); + return local; +} + +static struct pnfs_layoutdriver_type * +find_pnfs_driver(u32 id) +{ + struct pnfs_layoutdriver_type *local; + + spin_lock(&pnfs_spinlock); + local = find_pnfs_driver_locked(id); + spin_unlock(&pnfs_spinlock); + return local; +} + +void +unset_pnfs_layoutdriver(struct nfs_server *nfss) +{ + if (nfss->pnfs_curr_ld) { + nfss->pnfs_curr_ld->clear_layoutdriver(nfss); + module_put(nfss->pnfs_curr_ld->owner); + } + nfss->pnfs_curr_ld = NULL; +} + +/* + * Try to set the server's pnfs module to the pnfs layout type specified by id. + * Currently only one pNFS layout driver per filesystem is supported. + * + * @id layout type. Zero (illegal layout type) indicates pNFS not in use. + */ +void +set_pnfs_layoutdriver(struct nfs_server *server, u32 id) +{ + struct pnfs_layoutdriver_type *ld_type = NULL; + + if (id == 0) + goto out_no_driver; + if (!(server->nfs_client->cl_exchange_flags & + (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { + printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__, + id, server->nfs_client->cl_exchange_flags); + goto out_no_driver; + } + ld_type = find_pnfs_driver(id); + if (!ld_type) { + request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); + ld_type = find_pnfs_driver(id); + if (!ld_type) { + dprintk("%s: No pNFS module found for %u.\n", + __func__, id); + goto out_no_driver; + } + } + if (!try_module_get(ld_type->owner)) { + dprintk("%s: Could not grab reference on module\n", __func__); + goto out_no_driver; + } + server->pnfs_curr_ld = ld_type; + if (ld_type->set_layoutdriver(server)) { + printk(KERN_ERR + "%s: Error initializing mount point for layout driver %u.\n", + __func__, id); + module_put(ld_type->owner); + goto out_no_driver; + } + dprintk("%s: pNFS module for %u set\n", __func__, id); + return; + +out_no_driver: + dprintk("%s: Using NFSv4 I/O\n", __func__); + server->pnfs_curr_ld = NULL; +} + +int +pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + int status = -EINVAL; + struct pnfs_layoutdriver_type *tmp; + + if (ld_type->id == 0) { + printk(KERN_ERR "%s id 0 is reserved\n", __func__); + return status; + } + if (!ld_type->alloc_lseg || !ld_type->free_lseg) { + printk(KERN_ERR "%s Layout driver must provide " + "alloc_lseg and free_lseg.\n", __func__); + return status; + } + + spin_lock(&pnfs_spinlock); + tmp = find_pnfs_driver_locked(ld_type->id); + if (!tmp) { + list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl); + status = 0; + dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id, + ld_type->name); + } else { + printk(KERN_ERR "%s Module with id %d already loaded!\n", + __func__, ld_type->id); + } + spin_unlock(&pnfs_spinlock); + + return status; +} +EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver); + +void +pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); + spin_lock(&pnfs_spinlock); + list_del(&ld_type->pnfs_tblid); + spin_unlock(&pnfs_spinlock); +} +EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); + +/* + * pNFS client layout cache + */ + +static void +get_layout_hdr_locked(struct pnfs_layout_hdr *lo) +{ + assert_spin_locked(&lo->inode->i_lock); + lo->refcount++; +} + +static void +put_layout_hdr_locked(struct pnfs_layout_hdr *lo) +{ + assert_spin_locked(&lo->inode->i_lock); + BUG_ON(lo->refcount == 0); + + lo->refcount--; + if (!lo->refcount) { + dprintk("%s: freeing layout cache %p\n", __func__, lo); + BUG_ON(!list_empty(&lo->layouts)); + NFS_I(lo->inode)->layout = NULL; + kfree(lo); + } +} + +void +put_layout_hdr(struct inode *inode) +{ + spin_lock(&inode->i_lock); + put_layout_hdr_locked(NFS_I(inode)->layout); + spin_unlock(&inode->i_lock); +} + +static void +init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) +{ + INIT_LIST_HEAD(&lseg->fi_list); + kref_init(&lseg->kref); + lseg->layout = lo; +} + +/* Called without i_lock held, as the free_lseg call may sleep */ +static void +destroy_lseg(struct kref *kref) +{ + struct pnfs_layout_segment *lseg = + container_of(kref, struct pnfs_layout_segment, kref); + struct inode *ino = lseg->layout->inode; + + dprintk("--> %s\n", __func__); + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + /* Matched by get_layout_hdr_locked in pnfs_insert_layout */ + put_layout_hdr(ino); +} + +static void +put_lseg(struct pnfs_layout_segment *lseg) +{ + if (!lseg) + return; + + dprintk("%s: lseg %p ref %d\n", __func__, lseg, + atomic_read(&lseg->kref.refcount)); + kref_put(&lseg->kref, destroy_lseg); +} + +static void +pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list) +{ + struct pnfs_layout_segment *lseg, *next; + struct nfs_client *clp; + + dprintk("%s:Begin lo %p\n", __func__, lo); + + assert_spin_locked(&lo->inode->i_lock); + list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) { + dprintk("%s: freeing lseg %p\n", __func__, lseg); + list_move(&lseg->fi_list, tmp_list); + } + clp = NFS_SERVER(lo->inode)->nfs_client; + spin_lock(&clp->cl_lock); + /* List does not take a reference, so no need for put here */ + list_del_init(&lo->layouts); + spin_unlock(&clp->cl_lock); + write_seqlock(&lo->seqlock); + clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state); + write_sequnlock(&lo->seqlock); + + dprintk("%s:Return\n", __func__); +} + +static void +pnfs_free_lseg_list(struct list_head *tmp_list) +{ + struct pnfs_layout_segment *lseg; + + while (!list_empty(tmp_list)) { + lseg = list_entry(tmp_list->next, struct pnfs_layout_segment, + fi_list); + dprintk("%s calling put_lseg on %p\n", __func__, lseg); + list_del(&lseg->fi_list); + put_lseg(lseg); + } +} + +void +pnfs_destroy_layout(struct nfs_inode *nfsi) +{ + struct pnfs_layout_hdr *lo; + LIST_HEAD(tmp_list); + + spin_lock(&nfsi->vfs_inode.i_lock); + lo = nfsi->layout; + if (lo) { + pnfs_clear_lseg_list(lo, &tmp_list); + /* Matched by refcount set to 1 in alloc_init_layout_hdr */ + put_layout_hdr_locked(lo); + } + spin_unlock(&nfsi->vfs_inode.i_lock); + pnfs_free_lseg_list(&tmp_list); +} + +/* + * Called by the state manger to remove all layouts established under an + * expired lease. + */ +void +pnfs_destroy_all_layouts(struct nfs_client *clp) +{ + struct pnfs_layout_hdr *lo; + LIST_HEAD(tmp_list); + + spin_lock(&clp->cl_lock); + list_splice_init(&clp->cl_layouts, &tmp_list); + spin_unlock(&clp->cl_lock); + + while (!list_empty(&tmp_list)) { + lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, + layouts); + dprintk("%s freeing layout for inode %lu\n", __func__, + lo->inode->i_ino); + pnfs_destroy_layout(NFS_I(lo->inode)); + } +} + +/* update lo->stateid with new if is more recent + * + * lo->stateid could be the open stateid, in which case we just use what given. + */ +static void +pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new) +{ + nfs4_stateid *old = &lo->stateid; + bool overwrite = false; + + write_seqlock(&lo->seqlock); + if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) || + memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other))) + overwrite = true; + else { + u32 oldseq, newseq; + + oldseq = be32_to_cpu(old->stateid.seqid); + newseq = be32_to_cpu(new->stateid.seqid); + if ((int)(newseq - oldseq) > 0) + overwrite = true; + } + if (overwrite) + memcpy(&old->stateid, &new->stateid, sizeof(new->stateid)); + write_sequnlock(&lo->seqlock); +} + +static void +pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo, + struct nfs4_state *state) +{ + int seq; + + dprintk("--> %s\n", __func__); + write_seqlock(&lo->seqlock); + do { + seq = read_seqbegin(&state->seqlock); + memcpy(lo->stateid.data, state->stateid.data, + sizeof(state->stateid.data)); + } while (read_seqretry(&state->seqlock, seq)); + set_bit(NFS_LAYOUT_STATEID_SET, &lo->state); + write_sequnlock(&lo->seqlock); + dprintk("<-- %s\n", __func__); +} + +void +pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state) +{ + int seq; + + dprintk("--> %s\n", __func__); + do { + seq = read_seqbegin(&lo->seqlock); + if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) { + /* This will trigger retry of the read */ + pnfs_layout_from_open_stateid(lo, open_state); + } else + memcpy(dst->data, lo->stateid.data, + sizeof(lo->stateid.data)); + } while (read_seqretry(&lo->seqlock, seq)); + dprintk("<-- %s\n", __func__); +} + +/* +* Get layout from server. +* for now, assume that whole file layouts are requested. +* arg->offset: 0 +* arg->length: all ones +*/ +static struct pnfs_layout_segment * +send_layoutget(struct pnfs_layout_hdr *lo, + struct nfs_open_context *ctx, + u32 iomode) +{ + struct inode *ino = lo->inode; + struct nfs_server *server = NFS_SERVER(ino); + struct nfs4_layoutget *lgp; + struct pnfs_layout_segment *lseg = NULL; + + dprintk("--> %s\n", __func__); + + BUG_ON(ctx == NULL); + lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); + if (lgp == NULL) { + put_layout_hdr(lo->inode); + return NULL; + } + lgp->args.minlength = NFS4_MAX_UINT64; + lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; + lgp->args.range.iomode = iomode; + lgp->args.range.offset = 0; + lgp->args.range.length = NFS4_MAX_UINT64; + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->args.ctx = get_nfs_open_context(ctx); + lgp->lsegpp = &lseg; + + /* Synchronously retrieve layout information from server and + * store in lseg. + */ + nfs4_proc_layoutget(lgp); + if (!lseg) { + /* remember that LAYOUTGET failed and suspend trying */ + set_bit(lo_fail_bit(iomode), &lo->state); + } + return lseg; +} + +/* + * Compare two layout segments for sorting into layout cache. + * We want to preferentially return RW over RO layouts, so ensure those + * are seen first. + */ +static s64 +cmp_layout(u32 iomode1, u32 iomode2) +{ + /* read > read/write */ + return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ); +} + +static void +pnfs_insert_layout(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_segment *lp; + int found = 0; + + dprintk("%s:Begin\n", __func__); + + assert_spin_locked(&lo->inode->i_lock); + if (list_empty(&lo->segs)) { + struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client; + + spin_lock(&clp->cl_lock); + BUG_ON(!list_empty(&lo->layouts)); + list_add_tail(&lo->layouts, &clp->cl_layouts); + spin_unlock(&clp->cl_lock); + } + list_for_each_entry(lp, &lo->segs, fi_list) { + if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0) + continue; + list_add_tail(&lseg->fi_list, &lp->fi_list); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu before " + "lp %p iomode %d offset %llu length %llu\n", + __func__, lseg, lseg->range.iomode, + lseg->range.offset, lseg->range.length, + lp, lp->range.iomode, lp->range.offset, + lp->range.length); + found = 1; + break; + } + if (!found) { + list_add_tail(&lseg->fi_list, &lo->segs); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu at tail\n", + __func__, lseg, lseg->range.iomode, + lseg->range.offset, lseg->range.length); + } + get_layout_hdr_locked(lo); + + dprintk("%s:Return\n", __func__); +} + +static struct pnfs_layout_hdr * +alloc_init_layout_hdr(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + + lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL); + if (!lo) + return NULL; + lo->refcount = 1; + INIT_LIST_HEAD(&lo->layouts); + INIT_LIST_HEAD(&lo->segs); + seqlock_init(&lo->seqlock); + lo->inode = ino; + return lo; +} + +static struct pnfs_layout_hdr * +pnfs_find_alloc_layout(struct inode *ino) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *new = NULL; + + dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); + + assert_spin_locked(&ino->i_lock); + if (nfsi->layout) + return nfsi->layout; + + spin_unlock(&ino->i_lock); + new = alloc_init_layout_hdr(ino); + spin_lock(&ino->i_lock); + + if (likely(nfsi->layout == NULL)) /* Won the race? */ + nfsi->layout = new; + else + kfree(new); + return nfsi->layout; +} + +/* + * iomode matching rules: + * iomode lseg match + * ----- ----- ----- + * ANY READ true + * ANY RW true + * RW READ false + * RW RW true + * READ READ true + * READ RW true + */ +static int +is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) +{ + return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW); +} + +/* + * lookup range in layout + */ +static struct pnfs_layout_segment * +pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode) +{ + struct pnfs_layout_segment *lseg, *ret = NULL; + + dprintk("%s:Begin\n", __func__); + + assert_spin_locked(&lo->inode->i_lock); + list_for_each_entry(lseg, &lo->segs, fi_list) { + if (is_matching_lseg(lseg, iomode)) { + ret = lseg; + break; + } + if (cmp_layout(iomode, lseg->range.iomode) > 0) + break; + } + + dprintk("%s:Return lseg %p ref %d\n", + __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0); + return ret; +} + +/* + * Layout segment is retreived from the server if not cached. + * The appropriate layout segment is referenced and returned to the caller. + */ +struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, + enum pnfs_iomode iomode) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg = NULL; + + if (!pnfs_enabled_sb(NFS_SERVER(ino))) + return NULL; + spin_lock(&ino->i_lock); + lo = pnfs_find_alloc_layout(ino); + if (lo == NULL) { + dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); + goto out_unlock; + } + + /* Check to see if the layout for the given range already exists */ + lseg = pnfs_has_layout(lo, iomode); + if (lseg) { + dprintk("%s: Using cached lseg %p for iomode %d)\n", + __func__, lseg, iomode); + goto out_unlock; + } + + /* if LAYOUTGET already failed once we don't try again */ + if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) + goto out_unlock; + + get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */ + spin_unlock(&ino->i_lock); + + lseg = send_layoutget(lo, ctx, iomode); +out: + dprintk("%s end, state 0x%lx lseg %p\n", __func__, + nfsi->layout->state, lseg); + return lseg; +out_unlock: + spin_unlock(&ino->i_lock); + goto out; +} + +int +pnfs_layout_process(struct nfs4_layoutget *lgp) +{ + struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; + struct nfs4_layoutget_res *res = &lgp->res; + struct pnfs_layout_segment *lseg; + struct inode *ino = lo->inode; + int status = 0; + + /* Inject layout blob into I/O device driver */ + lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res); + if (!lseg || IS_ERR(lseg)) { + if (!lseg) + status = -ENOMEM; + else + status = PTR_ERR(lseg); + dprintk("%s: Could not allocate layout: error %d\n", + __func__, status); + goto out; + } + + spin_lock(&ino->i_lock); + init_lseg(lo, lseg); + lseg->range = res->range; + *lgp->lsegpp = lseg; + pnfs_insert_layout(lo, lseg); + + /* Done processing layoutget. Set the layout stateid */ + pnfs_set_layout_stateid(lo, &res->stateid); + spin_unlock(&ino->i_lock); +out: + return status; +} + +/* + * Device ID cache. Currently supports one layout type per struct nfs_client. + * Add layout type to the lookup key to expand to support multiple types. + */ +int +pnfs_alloc_init_deviceid_cache(struct nfs_client *clp, + void (*free_callback)(struct pnfs_deviceid_node *)) +{ + struct pnfs_deviceid_cache *c; + + c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL); + if (!c) + return -ENOMEM; + spin_lock(&clp->cl_lock); + if (clp->cl_devid_cache != NULL) { + atomic_inc(&clp->cl_devid_cache->dc_ref); + dprintk("%s [kref [%d]]\n", __func__, + atomic_read(&clp->cl_devid_cache->dc_ref)); + kfree(c); + } else { + /* kzalloc initializes hlists */ + spin_lock_init(&c->dc_lock); + atomic_set(&c->dc_ref, 1); + c->dc_free_callback = free_callback; + clp->cl_devid_cache = c; + dprintk("%s [new]\n", __func__); + } + spin_unlock(&clp->cl_lock); + return 0; +} +EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache); + +/* + * Called from pnfs_layoutdriver_type->free_lseg + * last layout segment reference frees deviceid + */ +void +pnfs_put_deviceid(struct pnfs_deviceid_cache *c, + struct pnfs_deviceid_node *devid) +{ + struct nfs4_deviceid *id = &devid->de_id; + struct pnfs_deviceid_node *d; + struct hlist_node *n; + long h = nfs4_deviceid_hash(id); + + dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); + if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock)) + return; + + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) + if (!memcmp(&d->de_id, id, sizeof(*id))) { + hlist_del_rcu(&d->de_node); + spin_unlock(&c->dc_lock); + synchronize_rcu(); + c->dc_free_callback(devid); + return; + } + spin_unlock(&c->dc_lock); + /* Why wasn't it found in the list? */ + BUG(); +} +EXPORT_SYMBOL_GPL(pnfs_put_deviceid); + +/* Find and reference a deviceid */ +struct pnfs_deviceid_node * +pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id) +{ + struct pnfs_deviceid_node *d; + struct hlist_node *n; + long hash = nfs4_deviceid_hash(id); + + dprintk("--> %s hash %ld\n", __func__, hash); + rcu_read_lock(); + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { + if (!memcmp(&d->de_id, id, sizeof(*id))) { + if (!atomic_inc_not_zero(&d->de_ref)) { + goto fail; + } else { + rcu_read_unlock(); + return d; + } + } + } +fail: + rcu_read_unlock(); + return NULL; +} +EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid); + +/* + * Add a deviceid to the cache. + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new + */ +struct pnfs_deviceid_node * +pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new) +{ + struct pnfs_deviceid_node *d; + long hash = nfs4_deviceid_hash(&new->de_id); + + dprintk("--> %s hash %ld\n", __func__, hash); + spin_lock(&c->dc_lock); + d = pnfs_find_get_deviceid(c, &new->de_id); + if (d) { + spin_unlock(&c->dc_lock); + dprintk("%s [discard]\n", __func__); + c->dc_free_callback(new); + return d; + } + INIT_HLIST_NODE(&new->de_node); + atomic_set(&new->de_ref, 1); + hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]); + spin_unlock(&c->dc_lock); + dprintk("%s [new]\n", __func__); + return new; +} +EXPORT_SYMBOL_GPL(pnfs_add_deviceid); + +void +pnfs_put_deviceid_cache(struct nfs_client *clp) +{ + struct pnfs_deviceid_cache *local = clp->cl_devid_cache; + + dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); + if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { + int i; + /* Verify cache is empty */ + for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) + BUG_ON(!hlist_empty(&local->dc_deviceids[i])); + clp->cl_devid_cache = NULL; + spin_unlock(&clp->cl_lock); + kfree(local); + } +} +EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h new file mode 100644 index 000000000000..e12367d50489 --- /dev/null +++ b/fs/nfs/pnfs.h @@ -0,0 +1,189 @@ +/* + * pNFS client data structures. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#ifndef FS_NFS_PNFS_H +#define FS_NFS_PNFS_H + +struct pnfs_layout_segment { + struct list_head fi_list; + struct pnfs_layout_range range; + struct kref kref; + struct pnfs_layout_hdr *layout; +}; + +#ifdef CONFIG_NFS_V4_1 + +#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" + +enum { + NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ + NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ + NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */ +}; + +/* Per-layout driver specific registration structure */ +struct pnfs_layoutdriver_type { + struct list_head pnfs_tblid; + const u32 id; + const char *name; + struct module *owner; + int (*set_layoutdriver) (struct nfs_server *); + int (*clear_layoutdriver) (struct nfs_server *); + struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); + void (*free_lseg) (struct pnfs_layout_segment *lseg); +}; + +struct pnfs_layout_hdr { + unsigned long refcount; + struct list_head layouts; /* other client layouts */ + struct list_head segs; /* layout segments list */ + seqlock_t seqlock; /* Protects the stateid */ + nfs4_stateid stateid; + unsigned long state; + struct inode *inode; +}; + +struct pnfs_device { + struct nfs4_deviceid dev_id; + unsigned int layout_type; + unsigned int mincount; + struct page **pages; + void *area; + unsigned int pgbase; + unsigned int pglen; +}; + +/* + * Device ID RCU cache. A device ID is unique per client ID and layout type. + */ +#define NFS4_DEVICE_ID_HASH_BITS 5 +#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) +#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) + +static inline u32 +nfs4_deviceid_hash(struct nfs4_deviceid *id) +{ + unsigned char *cptr = (unsigned char *)id->data; + unsigned int nbytes = NFS4_DEVICEID4_SIZE; + u32 x = 0; + + while (nbytes--) { + x *= 37; + x += *cptr++; + } + return x & NFS4_DEVICE_ID_HASH_MASK; +} + +struct pnfs_deviceid_node { + struct hlist_node de_node; + struct nfs4_deviceid de_id; + atomic_t de_ref; +}; + +struct pnfs_deviceid_cache { + spinlock_t dc_lock; + atomic_t dc_ref; + void (*dc_free_callback)(struct pnfs_deviceid_node *); + struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; +}; + +extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *, + void (*free_callback)(struct pnfs_deviceid_node *)); +extern void pnfs_put_deviceid_cache(struct nfs_client *); +extern struct pnfs_deviceid_node *pnfs_find_get_deviceid( + struct pnfs_deviceid_cache *, + struct nfs4_deviceid *); +extern struct pnfs_deviceid_node *pnfs_add_deviceid( + struct pnfs_deviceid_cache *, + struct pnfs_deviceid_node *); +extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c, + struct pnfs_deviceid_node *devid); + +extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); +extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); + +/* nfs4proc.c */ +extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *dev); +extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); + +/* pnfs.c */ +struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, + enum pnfs_iomode access_type); +void set_pnfs_layoutdriver(struct nfs_server *, u32 id); +void unset_pnfs_layoutdriver(struct nfs_server *); +int pnfs_layout_process(struct nfs4_layoutget *lgp); +void pnfs_destroy_layout(struct nfs_inode *); +void pnfs_destroy_all_layouts(struct nfs_client *); +void put_layout_hdr(struct inode *inode); +void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state); + + +static inline int lo_fail_bit(u32 iomode) +{ + return iomode == IOMODE_RW ? + NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; +} + +/* Return true if a layout driver is being used for this mountpoint */ +static inline int pnfs_enabled_sb(struct nfs_server *nfss) +{ + return nfss->pnfs_curr_ld != NULL; +} + +#else /* CONFIG_NFS_V4_1 */ + +static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) +{ +} + +static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) +{ +} + +static inline struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, + enum pnfs_iomode access_type) +{ + return NULL; +} + +static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) +{ +} + +static inline void unset_pnfs_layoutdriver(struct nfs_server *s) +{ +} + +#endif /* CONFIG_NFS_V4_1 */ + +#endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 611bec22f552..58e7f84fc1fd 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -258,7 +258,7 @@ static void nfs_free_createdata(const struct nfs_createdata *data) static int nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags, struct nameidata *nd) + int flags, struct nfs_open_context *ctx) { struct nfs_createdata *data; struct rpc_message msg = { @@ -365,17 +365,32 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) return 1; } +static void +nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME]; +} + +static int +nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + if (nfs_async_handle_expired_key(task)) + return 0; + nfs_mark_for_revalidate(old_dir); + nfs_mark_for_revalidate(new_dir); + return 1; +} + static int nfs_proc_rename(struct inode *old_dir, struct qstr *old_name, struct inode *new_dir, struct qstr *new_name) { struct nfs_renameargs arg = { - .fromfh = NFS_FH(old_dir), - .fromname = old_name->name, - .fromlen = old_name->len, - .tofh = NFS_FH(new_dir), - .toname = new_name->name, - .tolen = new_name->len + .old_dir = NFS_FH(old_dir), + .old_name = old_name, + .new_dir = NFS_FH(new_dir), + .new_name = new_name, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_RENAME], @@ -519,14 +534,14 @@ nfs_proc_rmdir(struct inode *dir, struct qstr *name) */ static int nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page *page, unsigned int count, int plus) + u64 cookie, struct page **pages, unsigned int count, int plus) { struct inode *dir = dentry->d_inode; struct nfs_readdirargs arg = { .fh = NFS_FH(dir), .cookie = cookie, .count = count, - .pages = &page, + .pages = pages, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_READDIR], @@ -705,6 +720,8 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .unlink_setup = nfs_proc_unlink_setup, .unlink_done = nfs_proc_unlink_done, .rename = nfs_proc_rename, + .rename_setup = nfs_proc_rename_setup, + .rename_done = nfs_proc_rename_done, .link = nfs_proc_link, .symlink = nfs_proc_symlink, .mkdir = nfs_proc_mkdir, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 87adc2744246..e4b62c6f5a6e 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -25,6 +25,7 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -46,7 +47,6 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); p->npages = pagecount; - p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; if (pagecount <= ARRAY_SIZE(p->page_array)) p->pagevec = p->page_array; else { @@ -121,6 +121,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); + pnfs_update_layout(inode, ctx, IOMODE_READ); new = nfs_create_request(ctx, inode, page, 0, len); if (IS_ERR(new)) { unlock_page(page); @@ -625,6 +626,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ + pnfs_update_layout(inode, desc.ctx, IOMODE_READ); if (rsize < PAGE_CACHE_SIZE) nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); else diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f4cbf0c306c6..3600ec700d58 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -100,6 +100,7 @@ enum { Opt_addr, Opt_mountaddr, Opt_clientaddr, Opt_lookupcache, Opt_fscache_uniq, + Opt_local_lock, /* Special mount options */ Opt_userspace, Opt_deprecated, Opt_sloppy, @@ -171,6 +172,7 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_lookupcache, "lookupcache=%s" }, { Opt_fscache_uniq, "fsc=%s" }, + { Opt_local_lock, "local_lock=%s" }, { Opt_err, NULL } }; @@ -236,6 +238,22 @@ static match_table_t nfs_lookupcache_tokens = { { Opt_lookupcache_err, NULL } }; +enum { + Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_posix, + Opt_local_lock_none, + + Opt_local_lock_err +}; + +static match_table_t nfs_local_lock_tokens = { + { Opt_local_lock_all, "all" }, + { Opt_local_lock_flock, "flock" }, + { Opt_local_lock_posix, "posix" }, + { Opt_local_lock_none, "none" }, + + { Opt_local_lock_err, NULL } +}; + static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct dentry *, struct kstatfs *); @@ -622,6 +640,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, const struct proc_nfs_info *nfs_infop; struct nfs_client *clp = nfss->nfs_client; u32 version = clp->rpc_ops->version; + int local_flock, local_fcntl; seq_printf(m, ",vers=%u", version); seq_printf(m, ",rsize=%u", nfss->rsize); @@ -670,6 +689,18 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, else seq_printf(m, ",lookupcache=pos"); } + + local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK; + local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL; + + if (!local_flock && !local_fcntl) + seq_printf(m, ",local_lock=none"); + else if (local_flock && local_fcntl) + seq_printf(m, ",local_lock=all"); + else if (local_flock) + seq_printf(m, ",local_lock=flock"); + else + seq_printf(m, ",local_lock=posix"); } /* @@ -1017,9 +1048,13 @@ static int nfs_parse_mount_options(char *raw, break; case Opt_lock: mnt->flags &= ~NFS_MOUNT_NONLM; + mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); break; case Opt_nolock: mnt->flags |= NFS_MOUNT_NONLM; + mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); break; case Opt_v2: mnt->flags &= ~NFS_MOUNT_VER3; @@ -1420,6 +1455,34 @@ static int nfs_parse_mount_options(char *raw, mnt->fscache_uniq = string; mnt->options |= NFS_OPTION_FSCACHE; break; + case Opt_local_lock: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, nfs_local_lock_tokens, + args); + kfree(string); + switch (token) { + case Opt_local_lock_all: + mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + case Opt_local_lock_flock: + mnt->flags |= NFS_MOUNT_LOCAL_FLOCK; + break; + case Opt_local_lock_posix: + mnt->flags |= NFS_MOUNT_LOCAL_FCNTL; + break; + case Opt_local_lock_none: + mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + default: + dfprintk(MOUNT, "NFS: invalid " + "local_lock argument\n"); + return 0; + }; + break; /* * Special options @@ -1825,6 +1888,12 @@ static int nfs_validate_mount_data(void *options, if (!args->nfs_server.hostname) goto out_nomem; + if (!(data->flags & NFS_MOUNT_NONLM)) + args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK| + NFS_MOUNT_LOCAL_FCNTL); + else + args->flags |= (NFS_MOUNT_LOCAL_FLOCK| + NFS_MOUNT_LOCAL_FCNTL); /* * The legacy version 6 binary mount data from userspace has a * field used only to transport selinux information into the @@ -2441,7 +2510,8 @@ static void nfs4_fill_super(struct super_block *sb) static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args) { - args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3); + args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3| + NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL); } static int nfs4_validate_text_mount_data(void *options, diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index ad4d2e787b20..978aaeb8a093 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -32,6 +32,7 @@ static ctl_table nfs_cb_sysctls[] = { .extra1 = (int *)&nfs_set_port_min, .extra2 = (int *)&nfs_set_port_max, }, +#ifndef CONFIG_NFS_USE_NEW_IDMAPPER { .procname = "idmap_cache_timeout", .data = &nfs_idmap_cache_timeout, @@ -39,6 +40,7 @@ static ctl_table nfs_cb_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, +#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ #endif { .procname = "nfs_mountpoint_timeout", diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 2f84adaad427..9a16bad5d2ea 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -13,9 +13,12 @@ #include <linux/nfs_fs.h> #include <linux/sched.h> #include <linux/wait.h> +#include <linux/namei.h> #include "internal.h" #include "nfs4_fs.h" +#include "iostat.h" +#include "delegation.h" struct nfs_unlinkdata { struct hlist_node list; @@ -244,7 +247,7 @@ void nfs_unblock_sillyrename(struct dentry *dentry) * @dir: parent directory of dentry * @dentry: dentry to unlink */ -int +static int nfs_async_unlink(struct inode *dir, struct dentry *dentry) { struct nfs_unlinkdata *data; @@ -259,7 +262,6 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry) status = PTR_ERR(data->cred); goto out_free; } - data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; data->res.dir_attr = &data->dir_attr; status = -EBUSY; @@ -303,3 +305,256 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode) if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data))) nfs_free_unlinkdata(data); } + +/* Cancel a queued async unlink. Called when a sillyrename run fails. */ +static void +nfs_cancel_async_unlink(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + struct nfs_unlinkdata *data = dentry->d_fsdata; + + dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; + spin_unlock(&dentry->d_lock); + nfs_free_unlinkdata(data); + return; + } + spin_unlock(&dentry->d_lock); +} + +struct nfs_renamedata { + struct nfs_renameargs args; + struct nfs_renameres res; + struct rpc_cred *cred; + struct inode *old_dir; + struct dentry *old_dentry; + struct nfs_fattr old_fattr; + struct inode *new_dir; + struct dentry *new_dentry; + struct nfs_fattr new_fattr; +}; + +/** + * nfs_async_rename_done - Sillyrename post-processing + * @task: rpc_task of the sillyrename + * @calldata: nfs_renamedata for the sillyrename + * + * Do the directory attribute updates and the d_move + */ +static void nfs_async_rename_done(struct rpc_task *task, void *calldata) +{ + struct nfs_renamedata *data = calldata; + struct inode *old_dir = data->old_dir; + struct inode *new_dir = data->new_dir; + + if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { + nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); + return; + } + + if (task->tk_status != 0) { + nfs_cancel_async_unlink(data->old_dentry); + return; + } + + nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir)); + d_move(data->old_dentry, data->new_dentry); +} + +/** + * nfs_async_rename_release - Release the sillyrename data. + * @calldata: the struct nfs_renamedata to be released + */ +static void nfs_async_rename_release(void *calldata) +{ + struct nfs_renamedata *data = calldata; + struct super_block *sb = data->old_dir->i_sb; + + if (data->old_dentry->d_inode) + nfs_mark_for_revalidate(data->old_dentry->d_inode); + + dput(data->old_dentry); + dput(data->new_dentry); + iput(data->old_dir); + iput(data->new_dir); + nfs_sb_deactive(sb); + put_rpccred(data->cred); + kfree(data); +} + +#if defined(CONFIG_NFS_V4_1) +static void nfs_rename_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_renamedata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->old_dir); + + if (nfs4_setup_sequence(server, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +} +#endif /* CONFIG_NFS_V4_1 */ + +static const struct rpc_call_ops nfs_rename_ops = { + .rpc_call_done = nfs_async_rename_done, + .rpc_release = nfs_async_rename_release, +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_rename_prepare, +#endif /* CONFIG_NFS_V4_1 */ +}; + +/** + * nfs_async_rename - perform an asynchronous rename operation + * @old_dir: directory that currently holds the dentry to be renamed + * @new_dir: target directory for the rename + * @old_dentry: original dentry to be renamed + * @new_dentry: dentry to which the old_dentry should be renamed + * + * It's expected that valid references to the dentries and inodes are held + */ +static struct rpc_task * +nfs_async_rename(struct inode *old_dir, struct inode *new_dir, + struct dentry *old_dentry, struct dentry *new_dentry) +{ + struct nfs_renamedata *data; + struct rpc_message msg = { }; + struct rpc_task_setup task_setup_data = { + .rpc_message = &msg, + .callback_ops = &nfs_rename_ops, + .workqueue = nfsiod_workqueue, + .rpc_client = NFS_CLIENT(old_dir), + .flags = RPC_TASK_ASYNC, + }; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return ERR_PTR(-ENOMEM); + task_setup_data.callback_data = data, + + data->cred = rpc_lookup_cred(); + if (IS_ERR(data->cred)) { + struct rpc_task *task = ERR_CAST(data->cred); + kfree(data); + return task; + } + + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + msg.rpc_cred = data->cred; + + /* set up nfs_renamedata */ + data->old_dir = old_dir; + atomic_inc(&old_dir->i_count); + data->new_dir = new_dir; + atomic_inc(&new_dir->i_count); + data->old_dentry = dget(old_dentry); + data->new_dentry = dget(new_dentry); + nfs_fattr_init(&data->old_fattr); + nfs_fattr_init(&data->new_fattr); + + /* set up nfs_renameargs */ + data->args.old_dir = NFS_FH(old_dir); + data->args.old_name = &old_dentry->d_name; + data->args.new_dir = NFS_FH(new_dir); + data->args.new_name = &new_dentry->d_name; + + /* set up nfs_renameres */ + data->res.old_fattr = &data->old_fattr; + data->res.new_fattr = &data->new_fattr; + + nfs_sb_active(old_dir->i_sb); + + NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir); + + return rpc_run_task(&task_setup_data); +} + +/** + * nfs_sillyrename - Perform a silly-rename of a dentry + * @dir: inode of directory that contains dentry + * @dentry: dentry to be sillyrenamed + * + * NFSv2/3 is stateless and the server doesn't know when the client is + * holding a file open. To prevent application problems when a file is + * unlinked while it's still open, the client performs a "silly-rename". + * That is, it renames the file to a hidden file in the same directory, + * and only performs the unlink once the last reference to it is put. + * + * The final cleanup is done during dentry_iput. + */ +int +nfs_sillyrename(struct inode *dir, struct dentry *dentry) +{ + static unsigned int sillycounter; + const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2; + const int countersize = sizeof(sillycounter)*2; + const int slen = sizeof(".nfs")+fileidsize+countersize-1; + char silly[slen+1]; + struct dentry *sdentry; + struct rpc_task *task; + int error = -EIO; + + dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + atomic_read(&dentry->d_count)); + nfs_inc_stats(dir, NFSIOS_SILLYRENAME); + + /* + * We don't allow a dentry to be silly-renamed twice. + */ + error = -EBUSY; + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + goto out; + + sprintf(silly, ".nfs%*.*Lx", + fileidsize, fileidsize, + (unsigned long long)NFS_FILEID(dentry->d_inode)); + + /* Return delegation in anticipation of the rename */ + nfs_inode_return_delegation(dentry->d_inode); + + sdentry = NULL; + do { + char *suffix = silly + slen - countersize; + + dput(sdentry); + sillycounter++; + sprintf(suffix, "%*.*x", countersize, countersize, sillycounter); + + dfprintk(VFS, "NFS: trying to rename %s to %s\n", + dentry->d_name.name, silly); + + sdentry = lookup_one_len(silly, dentry->d_parent, slen); + /* + * N.B. Better to return EBUSY here ... it could be + * dangerous to delete the file while it's in use. + */ + if (IS_ERR(sdentry)) + goto out; + } while (sdentry->d_inode != NULL); /* need negative lookup */ + + /* queue unlink first. Can't do this from rpc_release as it + * has to allocate memory + */ + error = nfs_async_unlink(dir, dentry); + if (error) + goto out_dput; + + /* run the rename task, undo unlink if it fails */ + task = nfs_async_rename(dir, dir, dentry, sdentry); + if (IS_ERR(task)) { + error = -EBUSY; + nfs_cancel_async_unlink(dentry); + goto out_dput; + } + + /* wait for the RPC task to complete, unless a SIGKILL intervenes */ + error = rpc_wait_for_completion_task(task); + if (error == 0) + error = task->tk_status; + rpc_put_task(task); +out_dput: + dput(sdentry); +out: + return error; +} diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 874972d9427c..4c14c17a5276 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -55,7 +55,6 @@ struct nfs_write_data *nfs_commitdata_alloc(void) if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); - p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; } return p; } @@ -75,7 +74,6 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); p->npages = pagecount; - p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; if (pagecount <= ARRAY_SIZE(p->page_array)) p->pagevec = p->page_array; else { @@ -292,9 +290,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); nfs_pageio_cond_complete(pgio, page->index); - ret = nfs_page_async_flush(pgio, page, - wbc->sync_mode == WB_SYNC_NONE || - wbc->nonblocking != 0); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; @@ -1433,15 +1429,17 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr int flags = FLUSH_SYNC; int ret = 0; - /* Don't commit yet if this is a non-blocking flush and there are - * lots of outstanding writes for this mapping. - */ - if (wbc->sync_mode == WB_SYNC_NONE && - nfsi->ncommit <= (nfsi->npages >> 1)) - goto out_mark_dirty; + if (wbc->sync_mode == WB_SYNC_NONE) { + /* Don't commit yet if this is a non-blocking flush and there + * are a lot of outstanding writes for this mapping. + */ + if (nfsi->ncommit <= (nfsi->npages >> 1)) + goto out_mark_dirty; - if (wbc->nonblocking || wbc->for_background) + /* don't wait for the COMMIT response */ flags = 0; + } + ret = nfs_commit_inode(inode, flags); if (ret >= 0) { if (wbc->sync_mode == WB_SYNC_NONE) { diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 7cf4ddafb4ab..18b3e8975fe0 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -2,7 +2,6 @@ config NFSD tristate "NFS server support" depends on INET depends on FILE_LOCKING - depends on BKL # fix as soon as lockd is done select LOCKD select SUNRPC select EXPORTFS @@ -29,6 +28,18 @@ config NFSD If unsure, say N. +config NFSD_DEPRECATED + bool "Include support for deprecated syscall interface to NFSD" + depends on NFSD + default y + help + The syscall interface to nfsd was obsoleted in 2.6.0 by a new + filesystem based interface. The old interface is due for removal + in 2.6.40. If you wish to remove the interface before then + say N. + + In unsure, say Y. + config NFSD_V2_ACL bool depends on NFSD diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c2a4f71d87dd..c0fcb7ab7f6d 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -28,9 +28,6 @@ typedef struct auth_domain svc_client; typedef struct svc_export svc_export; -static void exp_do_unexport(svc_export *unexp); -static int exp_verify_string(char *cp, int max); - /* * We have two caches. * One maps client+vfsmnt+dentry to export options - the export map @@ -802,6 +799,7 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) return ek; } +#ifdef CONFIG_NFSD_DEPRECATED static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv, struct svc_export *exp) { @@ -852,6 +850,7 @@ exp_get_fsid_key(svc_client *clp, int fsid) return exp_find_key(clp, FSID_NUM, fsidv, NULL); } +#endif static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, struct cache_req *reqp) @@ -893,6 +892,7 @@ static struct svc_export *exp_parent(svc_client *clp, struct path *path) return exp; } +#ifdef CONFIG_NFSD_DEPRECATED /* * Hashtable locking. Write locks are placed only by user processes * wanting to modify export information. @@ -925,6 +925,19 @@ exp_writeunlock(void) { up_write(&hash_sem); } +#else + +/* hash_sem not needed once deprecated interface is removed */ +void exp_readlock(void) {} +static inline void exp_writelock(void){} +void exp_readunlock(void) {} +static inline void exp_writeunlock(void){} + +#endif + +#ifdef CONFIG_NFSD_DEPRECATED +static void exp_do_unexport(svc_export *unexp); +static int exp_verify_string(char *cp, int max); static void exp_fsid_unhash(struct svc_export *exp) { @@ -935,10 +948,9 @@ static void exp_fsid_unhash(struct svc_export *exp) ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid); if (!IS_ERR(ek)) { - ek->h.expiry_time = get_seconds()-1; + sunrpc_invalidate(&ek->h, &svc_expkey_cache); cache_put(&ek->h, &svc_expkey_cache); } - svc_expkey_cache.nextcheck = get_seconds(); } static int exp_fsid_hash(svc_client *clp, struct svc_export *exp) @@ -973,10 +985,9 @@ static void exp_unhash(struct svc_export *exp) ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino); if (!IS_ERR(ek)) { - ek->h.expiry_time = get_seconds()-1; + sunrpc_invalidate(&ek->h, &svc_expkey_cache); cache_put(&ek->h, &svc_expkey_cache); } - svc_expkey_cache.nextcheck = get_seconds(); } /* @@ -1097,8 +1108,7 @@ out: static void exp_do_unexport(svc_export *unexp) { - unexp->h.expiry_time = get_seconds()-1; - svc_export_cache.nextcheck = get_seconds(); + sunrpc_invalidate(&unexp->h, &svc_export_cache); exp_unhash(unexp); exp_fsid_unhash(unexp); } @@ -1150,6 +1160,7 @@ out_unlock: exp_writeunlock(); return err; } +#endif /* CONFIG_NFSD_DEPRECATED */ /* * Obtain the root fh on behalf of a client. @@ -1459,25 +1470,43 @@ static void show_secinfo_flags(struct seq_file *m, int flags) show_expflags(m, flags, NFSEXP_SECINFO_FLAGS); } +static bool secinfo_flags_equal(int f, int g) +{ + f &= NFSEXP_SECINFO_FLAGS; + g &= NFSEXP_SECINFO_FLAGS; + return f == g; +} + +static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end) +{ + int flags; + + flags = (*fp)->flags; + seq_printf(m, ",sec=%d", (*fp)->pseudoflavor); + (*fp)++; + while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) { + seq_printf(m, ":%d", (*fp)->pseudoflavor); + (*fp)++; + } + return flags; +} + static void show_secinfo(struct seq_file *m, struct svc_export *exp) { struct exp_flavor_info *f; struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; - int lastflags = 0, first = 0; + int flags; if (exp->ex_nflavors == 0) return; - for (f = exp->ex_flavors; f < end; f++) { - if (first || f->flags != lastflags) { - if (!first) - show_secinfo_flags(m, lastflags); - seq_printf(m, ",sec=%d", f->pseudoflavor); - lastflags = f->flags; - } else { - seq_printf(m, ":%d", f->pseudoflavor); - } + f = exp->ex_flavors; + flags = show_secinfo_run(m, &f, end); + if (!secinfo_flags_equal(flags, exp->ex_flags)) + show_secinfo_flags(m, flags); + while (f != end) { + flags = show_secinfo_run(m, &f, end); + show_secinfo_flags(m, flags); } - show_secinfo_flags(m, lastflags); } static void exp_flags(struct seq_file *m, int flag, int fsid, @@ -1532,6 +1561,7 @@ const struct seq_operations nfs_exports_op = { .show = e_show, }; +#ifdef CONFIG_NFSD_DEPRECATED /* * Add or modify a client. * Change requests may involve the list of host addresses. The list of @@ -1563,7 +1593,7 @@ exp_addclient(struct nfsctl_client *ncp) /* Insert client into hashtable. */ for (i = 0; i < ncp->cl_naddr; i++) { ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6); - auth_unix_add_addr(&addr6, dom); + auth_unix_add_addr(&init_net, &addr6, dom); } auth_unix_forget_old(dom); auth_domain_put(dom); @@ -1621,6 +1651,7 @@ exp_verify_string(char *cp, int max) printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp); return 0; } +#endif /* CONFIG_NFSD_DEPRECATED */ /* * Initialize the exports module. diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 988cbb3a19b6..143da2eecd7b 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -41,7 +41,6 @@ #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 -#define NFS4_STATEID_SIZE 16 /* Index of predefined Linux callback client operations */ @@ -248,10 +247,11 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp, } static void -encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args, +encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb, struct nfs4_cb_compound_hdr *hdr) { __be32 *p; + struct nfsd4_session *ses = cb->cb_clp->cl_cb_session; if (hdr->minorversion == 0) return; @@ -259,8 +259,8 @@ encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args, RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20); WRITE32(OP_CB_SEQUENCE); - WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(args->cbs_clp->cl_cb_seq_nr); + WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(ses->se_cb_seq_nr); WRITE32(0); /* slotid, always 0 */ WRITE32(0); /* highest slotid always 0 */ WRITE32(0); /* cachethis always 0 */ @@ -280,18 +280,18 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) static int nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, - struct nfs4_rpc_args *rpc_args) + struct nfsd4_callback *cb) { struct xdr_stream xdr; - struct nfs4_delegation *args = rpc_args->args_op; + struct nfs4_delegation *args = cb->cb_op; struct nfs4_cb_compound_hdr hdr = { - .ident = args->dl_ident, - .minorversion = rpc_args->args_seq.cbs_minorversion, + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_minorversion, }; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_cb_compound_hdr(&xdr, &hdr); - encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); + encode_cb_sequence(&xdr, cb, &hdr); encode_cb_recall(&xdr, args, &hdr); encode_cb_nops(&hdr); return 0; @@ -339,15 +339,16 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) * with a single slot. */ static int -decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res, +decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb, struct rpc_rqst *rqstp) { + struct nfsd4_session *ses = cb->cb_clp->cl_cb_session; struct nfs4_sessionid id; int status; u32 dummy; __be32 *p; - if (res->cbs_minorversion == 0) + if (cb->cb_minorversion == 0) return 0; status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE); @@ -363,13 +364,12 @@ decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res, READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); - if (memcmp(id.data, res->cbs_clp->cl_sessionid.data, - NFS4_MAX_SESSIONID_LEN)) { + if (memcmp(id.data, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("%s Invalid session id\n", __func__); goto out; } READ32(dummy); - if (dummy != res->cbs_clp->cl_cb_seq_nr) { + if (dummy != ses->se_cb_seq_nr) { dprintk("%s Invalid sequence number\n", __func__); goto out; } @@ -393,7 +393,7 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p) static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p, - struct nfsd4_cb_sequence *seq) + struct nfsd4_callback *cb) { struct xdr_stream xdr; struct nfs4_cb_compound_hdr hdr; @@ -403,8 +403,8 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p, status = decode_cb_compound_hdr(&xdr, &hdr); if (status) goto out; - if (seq) { - status = decode_cb_sequence(&xdr, seq, rqstp); + if (cb) { + status = decode_cb_sequence(&xdr, cb, rqstp); if (status) goto out; } @@ -473,30 +473,34 @@ static int max_cb_time(void) /* Reference counting, callback cleanup, etc., all look racy as heck. * And why is cl_cb_set an atomic? */ -int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb) +int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn) { struct rpc_timeout timeparms = { .to_initval = max_cb_time(), .to_retries = 0, }; struct rpc_create_args args = { - .protocol = XPRT_TRANSPORT_TCP, - .address = (struct sockaddr *) &cb->cb_addr, - .addrsize = cb->cb_addrlen, + .net = &init_net, + .address = (struct sockaddr *) &conn->cb_addr, + .addrsize = conn->cb_addrlen, .timeout = &timeparms, .program = &cb_program, - .prognumber = cb->cb_prog, .version = 0, .authflavor = clp->cl_flavor, .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), - .client_name = clp->cl_principal, }; struct rpc_clnt *client; - if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) - return -EINVAL; - if (cb->cb_minorversion) { - args.bc_xprt = cb->cb_xprt; + if (clp->cl_minorversion == 0) { + if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) + return -EINVAL; + args.client_name = clp->cl_principal; + args.prognumber = conn->cb_prog, + args.protocol = XPRT_TRANSPORT_TCP; + clp->cl_cb_ident = conn->cb_ident; + } else { + args.bc_xprt = conn->cb_xprt; + args.prognumber = clp->cl_cb_session->se_cb_prog; args.protocol = XPRT_TRANSPORT_BC_TCP; } /* Create RPC client */ @@ -506,7 +510,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb) PTR_ERR(client)); return PTR_ERR(client); } - nfsd4_set_callback_client(clp, client); + clp->cl_cb_client = client; return 0; } @@ -519,7 +523,7 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason) static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) { - struct nfs4_client *clp = calldata; + struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); if (task->tk_status) warn_no_callback_path(clp, task->tk_status); @@ -528,6 +532,8 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) } static const struct rpc_call_ops nfsd4_cb_probe_ops = { + /* XXX: release method to ensure we set the cb channel down if + * necessary on early failure? */ .rpc_call_done = nfsd4_cb_probe_done, }; @@ -543,38 +549,42 @@ int set_callback_cred(void) return 0; } +static struct workqueue_struct *callback_wq; -void do_probe_callback(struct nfs4_client *clp) +static void do_probe_callback(struct nfs4_client *clp) { - struct rpc_message msg = { - .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], - .rpc_argp = clp, - .rpc_cred = callback_cred - }; - int status; + struct nfsd4_callback *cb = &clp->cl_cb_null; - status = rpc_call_async(clp->cl_cb_client, &msg, - RPC_TASK_SOFT | RPC_TASK_SOFTCONN, - &nfsd4_cb_probe_ops, (void *)clp); - if (status) - warn_no_callback_path(clp, status); + cb->cb_op = NULL; + cb->cb_clp = clp; + + cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL]; + cb->cb_msg.rpc_argp = NULL; + cb->cb_msg.rpc_resp = NULL; + cb->cb_msg.rpc_cred = callback_cred; + + cb->cb_ops = &nfsd4_cb_probe_ops; + + queue_work(callback_wq, &cb->cb_work); } /* - * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... + * Poke the callback thread to process any updates to the callback + * parameters, and send a null probe. */ -void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb) +void nfsd4_probe_callback(struct nfs4_client *clp) { - int status; + set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); + do_probe_callback(clp); +} +void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) +{ BUG_ON(atomic_read(&clp->cl_cb_set)); - status = setup_callback_client(clp, cb); - if (status) { - warn_no_callback_path(clp, status); - return; - } - do_probe_callback(clp); + spin_lock(&clp->cl_lock); + memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); + spin_unlock(&clp->cl_lock); } /* @@ -585,8 +595,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb) static int nfsd41_cb_setup_sequence(struct nfs4_client *clp, struct rpc_task *task) { - struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; - u32 *ptr = (u32 *)clp->cl_sessionid.data; + u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data; int status = 0; dprintk("%s: %u:%u:%u:%u\n", __func__, @@ -598,14 +607,6 @@ static int nfsd41_cb_setup_sequence(struct nfs4_client *clp, status = -EAGAIN; goto out; } - - /* - * We'll need the clp during XDR encoding and decoding, - * and the sequence during decoding to verify the reply - */ - args->args_seq.cbs_clp = clp; - task->tk_msg.rpc_resp = &args->args_seq; - out: dprintk("%s status=%d\n", __func__, status); return status; @@ -617,13 +618,13 @@ out: */ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) { - struct nfs4_delegation *dp = calldata; + struct nfsd4_callback *cb = calldata; + struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); struct nfs4_client *clp = dp->dl_client; - struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; - u32 minorversion = clp->cl_cb_conn.cb_minorversion; + u32 minorversion = clp->cl_minorversion; int status = 0; - args->args_seq.cbs_minorversion = minorversion; + cb->cb_minorversion = minorversion; if (minorversion) { status = nfsd41_cb_setup_sequence(clp, task); if (status) { @@ -640,19 +641,20 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) static void nfsd4_cb_done(struct rpc_task *task, void *calldata) { - struct nfs4_delegation *dp = calldata; + struct nfsd4_callback *cb = calldata; + struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); struct nfs4_client *clp = dp->dl_client; dprintk("%s: minorversion=%d\n", __func__, - clp->cl_cb_conn.cb_minorversion); + clp->cl_minorversion); - if (clp->cl_cb_conn.cb_minorversion) { + if (clp->cl_minorversion) { /* No need for lock, access serialized in nfsd4_cb_prepare */ - ++clp->cl_cb_seq_nr; + ++clp->cl_cb_session->se_cb_seq_nr; clear_bit(0, &clp->cl_cb_slot_busy); rpc_wake_up_next(&clp->cl_cb_waitq); dprintk("%s: freed slot, new seqid=%d\n", __func__, - clp->cl_cb_seq_nr); + clp->cl_cb_session->se_cb_seq_nr); /* We're done looking into the sequence information */ task->tk_msg.rpc_resp = NULL; @@ -662,7 +664,8 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) { - struct nfs4_delegation *dp = calldata; + struct nfsd4_callback *cb = calldata; + struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); struct nfs4_client *clp = dp->dl_client; struct rpc_clnt *current_rpc_client = clp->cl_cb_client; @@ -707,7 +710,8 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) static void nfsd4_cb_recall_release(void *calldata) { - struct nfs4_delegation *dp = calldata; + struct nfsd4_callback *cb = calldata; + struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); nfs4_put_delegation(dp); } @@ -718,8 +722,6 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = { .rpc_release = nfsd4_cb_recall_release, }; -static struct workqueue_struct *callback_wq; - int nfsd4_create_callback_queue(void) { callback_wq = create_singlethread_workqueue("nfsd4_callbacks"); @@ -734,57 +736,88 @@ void nfsd4_destroy_callback_queue(void) } /* must be called under the state lock */ -void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new) +void nfsd4_shutdown_callback(struct nfs4_client *clp) { - struct rpc_clnt *old = clp->cl_cb_client; - - clp->cl_cb_client = new; + set_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags); /* - * After this, any work that saw the old value of cl_cb_client will - * be gone: + * Note this won't actually result in a null callback; + * instead, nfsd4_do_callback_rpc() will detect the killed + * client, destroy the rpc client, and stop: */ + do_probe_callback(clp); flush_workqueue(callback_wq); - /* So we can safely shut it down: */ - if (old) - rpc_shutdown_client(old); } -/* - * called with dp->dl_count inc'ed. - */ -static void _nfsd4_cb_recall(struct nfs4_delegation *dp) +void nfsd4_release_cb(struct nfsd4_callback *cb) { - struct nfs4_client *clp = dp->dl_client; - struct rpc_clnt *clnt = clp->cl_cb_client; - struct nfs4_rpc_args *args = &dp->dl_recall.cb_args; - struct rpc_message msg = { - .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], - .rpc_cred = callback_cred - }; + if (cb->cb_ops->rpc_release) + cb->cb_ops->rpc_release(cb); +} - if (clnt == NULL) { - nfs4_put_delegation(dp); - return; /* Client is shutting down; give up. */ +void nfsd4_process_cb_update(struct nfsd4_callback *cb) +{ + struct nfs4_cb_conn conn; + struct nfs4_client *clp = cb->cb_clp; + int err; + + /* + * This is either an update, or the client dying; in either case, + * kill the old client: + */ + if (clp->cl_cb_client) { + rpc_shutdown_client(clp->cl_cb_client); + clp->cl_cb_client = NULL; } + if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags)) + return; + spin_lock(&clp->cl_lock); + /* + * Only serialized callback code is allowed to clear these + * flags; main nfsd code can only set them: + */ + BUG_ON(!clp->cl_cb_flags); + clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); + memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); + spin_unlock(&clp->cl_lock); - args->args_op = dp; - msg.rpc_argp = args; - dp->dl_retries = 1; - rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp); + err = setup_callback_client(clp, &conn); + if (err) + warn_no_callback_path(clp, err); } void nfsd4_do_callback_rpc(struct work_struct *w) { - /* XXX: for now, just send off delegation recall. */ - /* In future, generalize to handle any sort of callback. */ - struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work); - struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall); + struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work); + struct nfs4_client *clp = cb->cb_clp; + struct rpc_clnt *clnt; - _nfsd4_cb_recall(dp); -} + if (clp->cl_cb_flags) + nfsd4_process_cb_update(cb); + clnt = clp->cl_cb_client; + if (!clnt) { + /* Callback channel broken, or client killed; give up: */ + nfsd4_release_cb(cb); + return; + } + rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + cb->cb_ops, cb); +} void nfsd4_cb_recall(struct nfs4_delegation *dp) { + struct nfsd4_callback *cb = &dp->dl_recall; + + dp->dl_retries = 1; + cb->cb_op = dp; + cb->cb_clp = dp->dl_client; + cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; + cb->cb_msg.rpc_argp = cb; + cb->cb_msg.rpc_resp = cb; + cb->cb_msg.rpc_cred = callback_cred; + + cb->cb_ops = &nfsd4_cb_recall_ops; + dp->dl_retries = 1; + queue_work(callback_wq, &dp->dl_recall.cb_work); } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index c78dbf493424..f0695e815f0e 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -482,109 +482,26 @@ nfsd_idmap_shutdown(void) cache_unregister(&nametoid_cache); } -/* - * Deferred request handling - */ - -struct idmap_defer_req { - struct cache_req req; - struct cache_deferred_req deferred_req; - wait_queue_head_t waitq; - atomic_t count; -}; - -static inline void -put_mdr(struct idmap_defer_req *mdr) -{ - if (atomic_dec_and_test(&mdr->count)) - kfree(mdr); -} - -static inline void -get_mdr(struct idmap_defer_req *mdr) -{ - atomic_inc(&mdr->count); -} - -static void -idmap_revisit(struct cache_deferred_req *dreq, int toomany) -{ - struct idmap_defer_req *mdr = - container_of(dreq, struct idmap_defer_req, deferred_req); - - wake_up(&mdr->waitq); - put_mdr(mdr); -} - -static struct cache_deferred_req * -idmap_defer(struct cache_req *req) -{ - struct idmap_defer_req *mdr = - container_of(req, struct idmap_defer_req, req); - - mdr->deferred_req.revisit = idmap_revisit; - get_mdr(mdr); - return (&mdr->deferred_req); -} - -static inline int -do_idmap_lookup(struct ent *(*lookup_fn)(struct ent *), struct ent *key, - struct cache_detail *detail, struct ent **item, - struct idmap_defer_req *mdr) -{ - *item = lookup_fn(key); - if (!*item) - return -ENOMEM; - return cache_check(detail, &(*item)->h, &mdr->req); -} - -static inline int -do_idmap_lookup_nowait(struct ent *(*lookup_fn)(struct ent *), - struct ent *key, struct cache_detail *detail, - struct ent **item) -{ - int ret = -ENOMEM; - - *item = lookup_fn(key); - if (!*item) - goto out_err; - ret = -ETIMEDOUT; - if (!test_bit(CACHE_VALID, &(*item)->h.flags) - || (*item)->h.expiry_time < get_seconds() - || detail->flush_time > (*item)->h.last_refresh) - goto out_put; - ret = -ENOENT; - if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags)) - goto out_put; - return 0; -out_put: - cache_put(&(*item)->h, detail); -out_err: - *item = NULL; - return ret; -} - static int idmap_lookup(struct svc_rqst *rqstp, struct ent *(*lookup_fn)(struct ent *), struct ent *key, struct cache_detail *detail, struct ent **item) { - struct idmap_defer_req *mdr; int ret; - mdr = kzalloc(sizeof(*mdr), GFP_KERNEL); - if (!mdr) + *item = lookup_fn(key); + if (!*item) return -ENOMEM; - atomic_set(&mdr->count, 1); - init_waitqueue_head(&mdr->waitq); - mdr->req.defer = idmap_defer; - ret = do_idmap_lookup(lookup_fn, key, detail, item, mdr); - if (ret == -EAGAIN) { - wait_event_interruptible_timeout(mdr->waitq, - test_bit(CACHE_VALID, &(*item)->h.flags), 1 * HZ); - ret = do_idmap_lookup_nowait(lookup_fn, key, detail, item); + retry: + ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle); + + if (ret == -ETIMEDOUT) { + struct ent *prev_item = *item; + *item = lookup_fn(key); + if (*item != prev_item) + goto retry; + cache_put(&(*item)->h, detail); } - put_mdr(mdr); return ret; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 59ec449b0c7f..0cdfd022bb7b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1031,8 +1031,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, resp->cstate.session = NULL; fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); - /* Use the deferral mechanism only for NFSv4.0 compounds */ - rqstp->rq_usedeferral = (args->minorversion == 0); + /* + * Don't use the deferral mechanism for NFSv4; compounds make it + * too hard to avoid non-idempotency problems. + */ + rqstp->rq_usedeferral = 0; /* * According to RFC3010, this takes precedence over all other errors. diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a7292fcf7718..56347e0ac88d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -207,7 +207,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f { struct nfs4_delegation *dp; struct nfs4_file *fp = stp->st_file; - struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn; dprintk("NFSD alloc_init_deleg\n"); /* @@ -234,7 +233,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f nfs4_file_get_access(fp, O_RDONLY); dp->dl_flock = NULL; dp->dl_type = type; - dp->dl_ident = cb->cb_ident; dp->dl_stateid.si_boot = boot_time; dp->dl_stateid.si_stateownerid = current_delegid++; dp->dl_stateid.si_fileid = 0; @@ -535,171 +533,258 @@ gen_sessionid(struct nfsd4_session *ses) */ #define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) +static void +free_session_slots(struct nfsd4_session *ses) +{ + int i; + + for (i = 0; i < ses->se_fchannel.maxreqs; i++) + kfree(ses->se_slots[i]); +} + /* - * Give the client the number of ca_maxresponsesize_cached slots it - * requests, of size bounded by NFSD_SLOT_CACHE_SIZE, - * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more - * than NFSD_MAX_SLOTS_PER_SESSION. - * - * If we run out of reserved DRC memory we should (up to a point) + * We don't actually need to cache the rpc and session headers, so we + * can allocate a little less for each slot: + */ +static inline int slot_bytes(struct nfsd4_channel_attrs *ca) +{ + return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; +} + +static int nfsd4_sanitize_slot_size(u32 size) +{ + size -= NFSD_MIN_HDR_SEQ_SZ; /* We don't cache the rpc header */ + size = min_t(u32, size, NFSD_SLOT_CACHE_SIZE); + + return size; +} + +/* + * XXX: If we run out of reserved DRC memory we could (up to a point) * re-negotiate active sessions and reduce their slot usage to make * rooom for new connections. For now we just fail the create session. */ -static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan) +static int nfsd4_get_drc_mem(int slotsize, u32 num) { - int mem, size = fchan->maxresp_cached; + int avail; - if (fchan->maxreqs < 1) - return nfserr_inval; + num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION); - if (size < NFSD_MIN_HDR_SEQ_SZ) - size = NFSD_MIN_HDR_SEQ_SZ; - size -= NFSD_MIN_HDR_SEQ_SZ; - if (size > NFSD_SLOT_CACHE_SIZE) - size = NFSD_SLOT_CACHE_SIZE; - - /* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */ - mem = fchan->maxreqs * size; - if (mem > NFSD_MAX_MEM_PER_SESSION) { - fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size; - if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION) - fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION; - mem = fchan->maxreqs * size; - } + spin_lock(&nfsd_drc_lock); + avail = min_t(int, NFSD_MAX_MEM_PER_SESSION, + nfsd_drc_max_mem - nfsd_drc_mem_used); + num = min_t(int, num, avail / slotsize); + nfsd_drc_mem_used += num * slotsize; + spin_unlock(&nfsd_drc_lock); + return num; +} + +static void nfsd4_put_drc_mem(int slotsize, int num) +{ spin_lock(&nfsd_drc_lock); - /* bound the total session drc memory ussage */ - if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) { - fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size; - mem = fchan->maxreqs * size; - } - nfsd_drc_mem_used += mem; + nfsd_drc_mem_used -= slotsize * num; spin_unlock(&nfsd_drc_lock); +} - if (fchan->maxreqs == 0) - return nfserr_jukebox; +static struct nfsd4_session *alloc_session(int slotsize, int numslots) +{ + struct nfsd4_session *new; + int mem, i; - fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ; - return 0; + BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *) + + sizeof(struct nfsd4_session) > PAGE_SIZE); + mem = numslots * sizeof(struct nfsd4_slot *); + + new = kzalloc(sizeof(*new) + mem, GFP_KERNEL); + if (!new) + return NULL; + /* allocate each struct nfsd4_slot and data cache in one piece */ + for (i = 0; i < numslots; i++) { + mem = sizeof(struct nfsd4_slot) + slotsize; + new->se_slots[i] = kzalloc(mem, GFP_KERNEL); + if (!new->se_slots[i]) + goto out_free; + } + return new; +out_free: + while (i--) + kfree(new->se_slots[i]); + kfree(new); + return NULL; } -/* - * fchan holds the client values on input, and the server values on output - * sv_max_mesg is the maximum payload plus one page for overhead. - */ -static int init_forechannel_attrs(struct svc_rqst *rqstp, - struct nfsd4_channel_attrs *session_fchan, - struct nfsd4_channel_attrs *fchan) +static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize) { - int status = 0; - __u32 maxcount = nfsd_serv->sv_max_mesg; + u32 maxrpc = nfsd_serv->sv_max_mesg; - /* headerpadsz set to zero in encode routine */ + new->maxreqs = numslots; + new->maxresp_cached = slotsize + NFSD_MIN_HDR_SEQ_SZ; + new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc); + new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc); + new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND); +} - /* Use the client's max request and max response size if possible */ - if (fchan->maxreq_sz > maxcount) - fchan->maxreq_sz = maxcount; - session_fchan->maxreq_sz = fchan->maxreq_sz; +static void free_conn(struct nfsd4_conn *c) +{ + svc_xprt_put(c->cn_xprt); + kfree(c); +} - if (fchan->maxresp_sz > maxcount) - fchan->maxresp_sz = maxcount; - session_fchan->maxresp_sz = fchan->maxresp_sz; +static void nfsd4_conn_lost(struct svc_xpt_user *u) +{ + struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user); + struct nfs4_client *clp = c->cn_session->se_client; - /* Use the client's maxops if possible */ - if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND) - fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND; - session_fchan->maxops = fchan->maxops; + spin_lock(&clp->cl_lock); + if (!list_empty(&c->cn_persession)) { + list_del(&c->cn_persession); + free_conn(c); + } + spin_unlock(&clp->cl_lock); +} - /* FIXME: Error means no more DRC pages so the server should - * recover pages from existing sessions. For now fail session - * creation. - */ - status = set_forechannel_drc_size(fchan); +static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags) +{ + struct nfsd4_conn *conn; - session_fchan->maxresp_cached = fchan->maxresp_cached; - session_fchan->maxreqs = fchan->maxreqs; + conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL); + if (!conn) + return NULL; + svc_xprt_get(rqstp->rq_xprt); + conn->cn_xprt = rqstp->rq_xprt; + conn->cn_flags = flags; + INIT_LIST_HEAD(&conn->cn_xpt_user.list); + return conn; +} - dprintk("%s status %d\n", __func__, status); - return status; +static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) +{ + conn->cn_session = ses; + list_add(&conn->cn_persession, &ses->se_conns); } -static void -free_session_slots(struct nfsd4_session *ses) +static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) { - int i; + struct nfs4_client *clp = ses->se_client; - for (i = 0; i < ses->se_fchannel.maxreqs; i++) - kfree(ses->se_slots[i]); + spin_lock(&clp->cl_lock); + __nfsd4_hash_conn(conn, ses); + spin_unlock(&clp->cl_lock); } -/* - * We don't actually need to cache the rpc and session headers, so we - * can allocate a little less for each slot: - */ -static inline int slot_bytes(struct nfsd4_channel_attrs *ca) +static void nfsd4_register_conn(struct nfsd4_conn *conn) { - return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; + conn->cn_xpt_user.callback = nfsd4_conn_lost; + register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); } -static int -alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, - struct nfsd4_create_session *cses) +static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) { - struct nfsd4_session *new, tmp; - struct nfsd4_slot *sp; - int idx, slotsize, cachesize, i; - int status; + struct nfsd4_conn *conn; + u32 flags = NFS4_CDFC4_FORE; - memset(&tmp, 0, sizeof(tmp)); + if (ses->se_flags & SESSION4_BACK_CHAN) + flags |= NFS4_CDFC4_BACK; + conn = alloc_conn(rqstp, flags); + if (!conn) + return nfserr_jukebox; + nfsd4_hash_conn(conn, ses); + nfsd4_register_conn(conn); + return nfs_ok; +} - /* FIXME: For now, we just accept the client back channel attributes. */ - tmp.se_bchannel = cses->back_channel; - status = init_forechannel_attrs(rqstp, &tmp.se_fchannel, - &cses->fore_channel); - if (status) - goto out; +static void nfsd4_del_conns(struct nfsd4_session *s) +{ + struct nfs4_client *clp = s->se_client; + struct nfsd4_conn *c; - BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot) - + sizeof(struct nfsd4_session) > PAGE_SIZE); + spin_lock(&clp->cl_lock); + while (!list_empty(&s->se_conns)) { + c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession); + list_del_init(&c->cn_persession); + spin_unlock(&clp->cl_lock); - status = nfserr_jukebox; - /* allocate struct nfsd4_session and slot table pointers in one piece */ - slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *); - new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL); - if (!new) - goto out; + unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user); + free_conn(c); - memcpy(new, &tmp, sizeof(*new)); + spin_lock(&clp->cl_lock); + } + spin_unlock(&clp->cl_lock); +} - /* allocate each struct nfsd4_slot and data cache in one piece */ - cachesize = slot_bytes(&new->se_fchannel); - for (i = 0; i < new->se_fchannel.maxreqs; i++) { - sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL); - if (!sp) - goto out_free; - new->se_slots[i] = sp; +void free_session(struct kref *kref) +{ + struct nfsd4_session *ses; + int mem; + + ses = container_of(kref, struct nfsd4_session, se_ref); + nfsd4_del_conns(ses); + spin_lock(&nfsd_drc_lock); + mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel); + nfsd_drc_mem_used -= mem; + spin_unlock(&nfsd_drc_lock); + free_session_slots(ses); + kfree(ses); +} + +static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses) +{ + struct nfsd4_session *new; + struct nfsd4_channel_attrs *fchan = &cses->fore_channel; + int numslots, slotsize; + int status; + int idx; + + /* + * Note decreasing slot size below client's request may + * make it difficult for client to function correctly, whereas + * decreasing the number of slots will (just?) affect + * performance. When short on memory we therefore prefer to + * decrease number of slots instead of their size. + */ + slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached); + numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs); + + new = alloc_session(slotsize, numslots); + if (!new) { + nfsd4_put_drc_mem(slotsize, fchan->maxreqs); + return NULL; } + init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize); new->se_client = clp; gen_sessionid(new); - idx = hash_sessionid(&new->se_sessionid); - memcpy(clp->cl_sessionid.data, new->se_sessionid.data, - NFS4_MAX_SESSIONID_LEN); + INIT_LIST_HEAD(&new->se_conns); + + new->se_cb_seq_nr = 1; new->se_flags = cses->flags; + new->se_cb_prog = cses->callback_prog; kref_init(&new->se_ref); + idx = hash_sessionid(&new->se_sessionid); spin_lock(&client_lock); list_add(&new->se_hash, &sessionid_hashtbl[idx]); list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&client_lock); - status = nfs_ok; -out: - return status; -out_free: - free_session_slots(new); - kfree(new); - goto out; + status = nfsd4_new_conn(rqstp, new); + /* whoops: benny points out, status is ignored! (err, or bogus) */ + if (status) { + free_session(&new->se_ref); + return NULL; + } + if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) { + struct sockaddr *sa = svc_addr(rqstp); + + clp->cl_cb_session = new; + clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt; + svc_xprt_get(rqstp->rq_xprt); + rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); + clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); + nfsd4_probe_callback(clp); + } + return new; } /* caller must hold client_lock */ @@ -731,21 +816,6 @@ unhash_session(struct nfsd4_session *ses) list_del(&ses->se_perclnt); } -void -free_session(struct kref *kref) -{ - struct nfsd4_session *ses; - int mem; - - ses = container_of(kref, struct nfsd4_session, se_ref); - spin_lock(&nfsd_drc_lock); - mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel); - nfsd_drc_mem_used -= mem; - spin_unlock(&nfsd_drc_lock); - free_session_slots(ses); - kfree(ses); -} - /* must be called under the client_lock */ static inline void renew_client_locked(struct nfs4_client *clp) @@ -812,6 +882,13 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) static inline void free_client(struct nfs4_client *clp) { + while (!list_empty(&clp->cl_sessions)) { + struct nfsd4_session *ses; + ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, + se_perclnt); + list_del(&ses->se_perclnt); + nfsd4_put_session(ses); + } if (clp->cl_cred.cr_group_info) put_group_info(clp->cl_cred.cr_group_info); kfree(clp->cl_principal); @@ -838,15 +915,12 @@ release_session_client(struct nfsd4_session *session) static inline void unhash_client_locked(struct nfs4_client *clp) { + struct nfsd4_session *ses; + mark_client_expired(clp); list_del(&clp->cl_lru); - while (!list_empty(&clp->cl_sessions)) { - struct nfsd4_session *ses; - ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, - se_perclnt); - unhash_session(ses); - nfsd4_put_session(ses); - } + list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) + list_del_init(&ses->se_hash); } static void @@ -875,7 +949,7 @@ expire_client(struct nfs4_client *clp) sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); release_openowner(sop); } - nfsd4_set_callback_client(clp, NULL); + nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); list_del(&clp->cl_idhash); @@ -960,6 +1034,8 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, if (clp == NULL) return NULL; + INIT_LIST_HEAD(&clp->cl_sessions); + princ = svc_gss_principal(rqstp); if (princ) { clp->cl_principal = kstrdup(princ, GFP_KERNEL); @@ -976,8 +1052,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, INIT_LIST_HEAD(&clp->cl_strhash); INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); - INIT_LIST_HEAD(&clp->cl_sessions); INIT_LIST_HEAD(&clp->cl_lru); + spin_lock_init(&clp->cl_lock); + INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc); clp->cl_time = get_seconds(); clear_bit(0, &clp->cl_cb_slot_busy); rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); @@ -986,7 +1063,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, clp->cl_flavor = rqstp->rq_flavor; copy_cred(&clp->cl_cred, &rqstp->rq_cred); gen_confirm(clp); - + clp->cl_cb_session = NULL; return clp; } @@ -1098,7 +1175,7 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, static void gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) { - struct nfs4_cb_conn *cb = &clp->cl_cb_conn; + struct nfs4_cb_conn *conn = &clp->cl_cb_conn; unsigned short expected_family; /* Currently, we only support tcp and tcp6 for the callback channel */ @@ -1111,24 +1188,23 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) else goto out_err; - cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val, + conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val, se->se_callback_addr_len, - (struct sockaddr *) &cb->cb_addr, - sizeof(cb->cb_addr)); + (struct sockaddr *)&conn->cb_addr, + sizeof(conn->cb_addr)); - if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family) + if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family) goto out_err; - if (cb->cb_addr.ss_family == AF_INET6) - ((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid; + if (conn->cb_addr.ss_family == AF_INET6) + ((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid; - cb->cb_minorversion = 0; - cb->cb_prog = se->se_callback_prog; - cb->cb_ident = se->se_callback_ident; + conn->cb_prog = se->se_callback_prog; + conn->cb_ident = se->se_callback_ident; return; out_err: - cb->cb_addr.ss_family = AF_UNSPEC; - cb->cb_addrlen = 0; + conn->cb_addr.ss_family = AF_UNSPEC; + conn->cb_addrlen = 0; dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " "will not receive delegations\n", clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); @@ -1415,7 +1491,9 @@ nfsd4_create_session(struct svc_rqst *rqstp, { struct sockaddr *sa = svc_addr(rqstp); struct nfs4_client *conf, *unconf; + struct nfsd4_session *new; struct nfsd4_clid_slot *cs_slot = NULL; + bool confirm_me = false; int status = 0; nfs4_lock_state(); @@ -1438,7 +1516,6 @@ nfsd4_create_session(struct svc_rqst *rqstp, cs_slot->sl_seqid, cr_ses->seqid); goto out; } - cs_slot->sl_seqid++; } else if (unconf) { if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { @@ -1451,25 +1528,10 @@ nfsd4_create_session(struct svc_rqst *rqstp, if (status) { /* an unconfirmed replay returns misordered */ status = nfserr_seq_misordered; - goto out_cache; + goto out; } - cs_slot->sl_seqid++; /* from 0 to 1 */ - move_to_confirmed(unconf); - - if (cr_ses->flags & SESSION4_BACK_CHAN) { - unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; - svc_xprt_get(rqstp->rq_xprt); - rpc_copy_addr( - (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, - sa); - unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); - unconf->cl_cb_conn.cb_minorversion = - cstate->minorversion; - unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; - unconf->cl_cb_seq_nr = 1; - nfsd4_probe_callback(unconf, &unconf->cl_cb_conn); - } + confirm_me = true; conf = unconf; } else { status = nfserr_stale_clientid; @@ -1477,22 +1539,30 @@ nfsd4_create_session(struct svc_rqst *rqstp, } /* + * XXX: we should probably set this at creation time, and check + * for consistent minorversion use throughout: + */ + conf->cl_minorversion = 1; + /* * We do not support RDMA or persistent sessions */ cr_ses->flags &= ~SESSION4_PERSIST; cr_ses->flags &= ~SESSION4_RDMA; - status = alloc_init_session(rqstp, conf, cr_ses); - if (status) + status = nfserr_jukebox; + new = alloc_init_session(rqstp, conf, cr_ses); + if (!new) goto out; - - memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data, + status = nfs_ok; + memcpy(cr_ses->sessionid.data, new->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); + cs_slot->sl_seqid++; cr_ses->seqid = cs_slot->sl_seqid; -out_cache: /* cache solo and embedded create sessions under the state lock */ nfsd4_cache_create_session(cr_ses, cs_slot, status); + if (confirm_me) + move_to_confirmed(conf); out: nfs4_unlock_state(); dprintk("%s returns %d\n", __func__, ntohl(status)); @@ -1546,8 +1616,11 @@ nfsd4_destroy_session(struct svc_rqst *r, nfs4_lock_state(); /* wait for callbacks */ - nfsd4_set_callback_client(ses->se_client, NULL); + nfsd4_shutdown_callback(ses->se_client); nfs4_unlock_state(); + + nfsd4_del_conns(ses); + nfsd4_put_session(ses); status = nfs_ok; out: @@ -1555,6 +1628,36 @@ out: return status; } +static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s) +{ + struct nfsd4_conn *c; + + list_for_each_entry(c, &s->se_conns, cn_persession) { + if (c->cn_xprt == xpt) { + return c; + } + } + return NULL; +} + +static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd4_conn *c; + + spin_lock(&clp->cl_lock); + c = __nfsd4_find_conn(new->cn_xprt, ses); + if (c) { + spin_unlock(&clp->cl_lock); + free_conn(new); + return; + } + __nfsd4_hash_conn(new, ses); + spin_unlock(&clp->cl_lock); + nfsd4_register_conn(new); + return; +} + __be32 nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -1563,11 +1666,20 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfsd4_session *session; struct nfsd4_slot *slot; + struct nfsd4_conn *conn; int status; if (resp->opcnt != 1) return nfserr_sequence_pos; + /* + * Will be either used or freed by nfsd4_sequence_check_conn + * below. + */ + conn = alloc_conn(rqstp, NFS4_CDFC4_FORE); + if (!conn) + return nfserr_jukebox; + spin_lock(&client_lock); status = nfserr_badsession; session = find_in_sessionid_hashtbl(&seq->sessionid); @@ -1599,6 +1711,9 @@ nfsd4_sequence(struct svc_rqst *rqstp, if (status) goto out; + nfsd4_sequence_check_conn(conn, session); + conn = NULL; + /* Success! bump slot seqid */ slot->sl_inuse = true; slot->sl_seqid = seq->seqid; @@ -1613,6 +1728,7 @@ out: nfsd4_get_session(cstate->session); atomic_inc(&session->se_client->cl_refcount); } + kfree(conn); spin_unlock(&client_lock); dprintk("%s: return %d\n", __func__, ntohl(status)); return status; @@ -1747,6 +1863,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; gen_clid(new); } + /* + * XXX: we should probably set this at creation time, and check + * for consistent minorversion use throughout: + */ + new->cl_minorversion = 0; gen_callback(new, setclid, rpc_get_scope_id(sa)); add_to_unconfirmed(new, strhashval); setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; @@ -1807,7 +1928,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, status = nfserr_clid_inuse; else { atomic_set(&conf->cl_cb_set, 0); - nfsd4_probe_callback(conf, &unconf->cl_cb_conn); + nfsd4_change_callback(conf, &unconf->cl_cb_conn); + nfsd4_probe_callback(conf); expire_client(unconf); status = nfs_ok; @@ -1841,7 +1963,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, } move_to_confirmed(unconf); conf = unconf; - nfsd4_probe_callback(conf, &conf->cl_cb_conn); + nfsd4_probe_callback(conf); status = nfs_ok; } } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) @@ -2492,7 +2614,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta struct nfs4_delegation *dp; struct nfs4_stateowner *sop = stp->st_stateowner; int cb_up = atomic_read(&sop->so_client->cl_cb_set); - struct file_lock fl, *flp = &fl; + struct file_lock *fl; int status, flag = 0; flag = NFS4_OPEN_DELEGATE_NONE; @@ -2526,20 +2648,24 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta flag = NFS4_OPEN_DELEGATE_NONE; goto out; } - locks_init_lock(&fl); - fl.fl_lmops = &nfsd_lease_mng_ops; - fl.fl_flags = FL_LEASE; - fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; - fl.fl_end = OFFSET_MAX; - fl.fl_owner = (fl_owner_t)dp; - fl.fl_file = find_readable_file(stp->st_file); - BUG_ON(!fl.fl_file); - fl.fl_pid = current->tgid; + status = -ENOMEM; + fl = locks_alloc_lock(); + if (!fl) + goto out; + locks_init_lock(fl); + fl->fl_lmops = &nfsd_lease_mng_ops; + fl->fl_flags = FL_LEASE; + fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; + fl->fl_end = OFFSET_MAX; + fl->fl_owner = (fl_owner_t)dp; + fl->fl_file = find_readable_file(stp->st_file); + BUG_ON(!fl->fl_file); + fl->fl_pid = current->tgid; /* vfs_setlease checks to see if delegation should be handed out. * the lock_manager callbacks fl_mylease and fl_change are used */ - if ((status = vfs_setlease(fl.fl_file, fl.fl_type, &flp))) { + if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) { dprintk("NFSD: setlease failed [%d], no delegation\n", status); unhash_delegation(dp); flag = NFS4_OPEN_DELEGATE_NONE; @@ -2944,7 +3070,11 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (STALE_STATEID(stateid)) goto out; - status = nfserr_bad_stateid; + /* + * We assume that any stateid that has the current boot time, + * but that we can't find, is expired: + */ + status = nfserr_expired; if (is_delegation_stateid(stateid)) { dp = find_delegation_stateid(ino, stateid); if (!dp) @@ -2964,6 +3094,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, stp = find_stateid(stateid, flags); if (!stp) goto out; + status = nfserr_bad_stateid; if (nfs4_check_fh(current_fh, stp)) goto out; if (!stp->st_stateowner->so_confirmed) @@ -3038,8 +3169,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, * a replayed close: */ sop = search_close_lru(stateid->si_stateownerid, flags); + /* It's not stale; let's assume it's expired: */ if (sop == NULL) - return nfserr_bad_stateid; + return nfserr_expired; *sopp = sop; goto check_replay; } @@ -3304,6 +3436,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_bad_stateid; if (!is_delegation_stateid(stateid)) goto out; + status = nfserr_expired; dp = find_delegation_stateid(inode, stateid); if (!dp) goto out; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 1a468bbd330f..f35a94a04026 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1805,19 +1805,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, goto out_nfserr; } } - if ((buflen -= 16) < 0) - goto out_resource; - if (unlikely(bmval2)) { + if (bmval2) { + if ((buflen -= 16) < 0) + goto out_resource; WRITE32(3); WRITE32(bmval0); WRITE32(bmval1); WRITE32(bmval2); - } else if (likely(bmval1)) { + } else if (bmval1) { + if ((buflen -= 12) < 0) + goto out_resource; WRITE32(2); WRITE32(bmval0); WRITE32(bmval1); } else { + if ((buflen -= 8) < 0) + goto out_resource; WRITE32(1); WRITE32(bmval0); } @@ -1828,15 +1832,17 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, u32 word1 = nfsd_suppattrs1(minorversion); u32 word2 = nfsd_suppattrs2(minorversion); - if ((buflen -= 12) < 0) - goto out_resource; if (!aclsupport) word0 &= ~FATTR4_WORD0_ACL; if (!word2) { + if ((buflen -= 12) < 0) + goto out_resource; WRITE32(2); WRITE32(word0); WRITE32(word1); } else { + if ((buflen -= 16) < 0) + goto out_resource; WRITE32(3); WRITE32(word0); WRITE32(word1); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 06fa87e52e82..d6dc3f61f8ba 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -22,6 +22,7 @@ */ enum { NFSD_Root = 1, +#ifdef CONFIG_NFSD_DEPRECATED NFSD_Svc, NFSD_Add, NFSD_Del, @@ -29,6 +30,7 @@ enum { NFSD_Unexport, NFSD_Getfd, NFSD_Getfs, +#endif NFSD_List, NFSD_Export_features, NFSD_Fh, @@ -54,6 +56,7 @@ enum { /* * write() for these nodes. */ +#ifdef CONFIG_NFSD_DEPRECATED static ssize_t write_svc(struct file *file, char *buf, size_t size); static ssize_t write_add(struct file *file, char *buf, size_t size); static ssize_t write_del(struct file *file, char *buf, size_t size); @@ -61,6 +64,7 @@ static ssize_t write_export(struct file *file, char *buf, size_t size); static ssize_t write_unexport(struct file *file, char *buf, size_t size); static ssize_t write_getfd(struct file *file, char *buf, size_t size); static ssize_t write_getfs(struct file *file, char *buf, size_t size); +#endif static ssize_t write_filehandle(struct file *file, char *buf, size_t size); static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); @@ -76,6 +80,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); #endif static ssize_t (*write_op[])(struct file *, char *, size_t) = { +#ifdef CONFIG_NFSD_DEPRECATED [NFSD_Svc] = write_svc, [NFSD_Add] = write_add, [NFSD_Del] = write_del, @@ -83,6 +88,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = { [NFSD_Unexport] = write_unexport, [NFSD_Getfd] = write_getfd, [NFSD_Getfs] = write_getfs, +#endif [NFSD_Fh] = write_filehandle, [NFSD_FO_UnlockIP] = write_unlock_ip, [NFSD_FO_UnlockFS] = write_unlock_fs, @@ -121,6 +127,14 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) { + static int warned; + if (file->f_dentry->d_name.name[0] == '.' && !warned) { + printk(KERN_INFO + "Warning: \"%s\" uses deprecated NFSD interface: %s." + " This will be removed in 2.6.40\n", + current->comm, file->f_dentry->d_name.name); + warned = 1; + } if (! file->private_data) { /* An attempt to read a transaction file without writing * causes a 0-byte write so that the file can return @@ -187,6 +201,7 @@ static const struct file_operations pool_stats_operations = { * payload - write methods */ +#ifdef CONFIG_NFSD_DEPRECATED /** * write_svc - Start kernel's NFSD server * @@ -402,7 +417,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size) ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); - clp = auth_unix_lookup(&in6); + clp = auth_unix_lookup(&init_net, &in6); if (!clp) err = -EPERM; else { @@ -465,7 +480,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); - clp = auth_unix_lookup(&in6); + clp = auth_unix_lookup(&init_net, &in6); if (!clp) err = -EPERM; else { @@ -482,6 +497,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) out: return err; } +#endif /* CONFIG_NFSD_DEPRECATED */ /** * write_unlock_ip - Release all locks used by a client @@ -1000,12 +1016,12 @@ static ssize_t __write_ports_addxprt(char *buf) if (err != 0) return err; - err = svc_create_xprt(nfsd_serv, transport, + err = svc_create_xprt(nfsd_serv, transport, &init_net, PF_INET, port, SVC_SOCK_ANONYMOUS); if (err < 0) goto out_err; - err = svc_create_xprt(nfsd_serv, transport, + err = svc_create_xprt(nfsd_serv, transport, &init_net, PF_INET6, port, SVC_SOCK_ANONYMOUS); if (err < 0 && err != -EAFNOSUPPORT) goto out_close; @@ -1356,6 +1372,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) static int nfsd_fill_super(struct super_block * sb, void * data, int silent) { static struct tree_descr nfsd_files[] = { +#ifdef CONFIG_NFSD_DEPRECATED [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR}, [NFSD_Add] = {".add", &transaction_ops, S_IWUSR}, [NFSD_Del] = {".del", &transaction_ops, S_IWUSR}, @@ -1363,6 +1380,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR}, [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}, +#endif [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", &export_features_operations, S_IRUGO}, diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index b76ac3a82e39..6b641cf2c19a 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -249,7 +249,7 @@ extern time_t nfsd4_grace; #define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ #define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ -#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ +#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */ /* * The following attributes are currently not supported by the NFSv4 server: diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index e2c43464f237..2bae1d86f5f2 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -16,6 +16,7 @@ #include <linux/lockd/bind.h> #include <linux/nfsacl.h> #include <linux/seq_file.h> +#include <net/net_namespace.h> #include "nfsd.h" #include "cache.h" #include "vfs.h" @@ -186,12 +187,12 @@ static int nfsd_init_socks(int port) if (!list_empty(&nfsd_serv->sv_permsocks)) return 0; - error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port, + error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, port, SVC_SOCK_DEFAULTS); if (error < 0) return error; - error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port, + error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, port, SVC_SOCK_DEFAULTS); if (error < 0) return error; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 322518c88e4b..39adc27b0685 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -35,6 +35,7 @@ #ifndef _NFSD4_STATE_H #define _NFSD4_STATE_H +#include <linux/sunrpc/svc_xprt.h> #include <linux/nfsd/nfsfh.h> #include "nfsfh.h" @@ -64,19 +65,12 @@ typedef struct { (s)->si_fileid, \ (s)->si_generation -struct nfsd4_cb_sequence { - /* args/res */ - u32 cbs_minorversion; - struct nfs4_client *cbs_clp; -}; - -struct nfs4_rpc_args { - void *args_op; - struct nfsd4_cb_sequence args_seq; -}; - struct nfsd4_callback { - struct nfs4_rpc_args cb_args; + void *cb_op; + struct nfs4_client *cb_clp; + u32 cb_minorversion; + struct rpc_message cb_msg; + const struct rpc_call_ops *cb_ops; struct work_struct cb_work; }; @@ -91,7 +85,6 @@ struct nfs4_delegation { u32 dl_type; time_t dl_time; /* For recall: */ - u32 dl_ident; stateid_t dl_stateid; struct knfsd_fh dl_fh; int dl_retries; @@ -103,8 +96,8 @@ struct nfs4_cb_conn { /* SETCLIENTID info */ struct sockaddr_storage cb_addr; size_t cb_addrlen; - u32 cb_prog; - u32 cb_minorversion; + u32 cb_prog; /* used only in 4.0 case; + per-session otherwise */ u32 cb_ident; /* minorversion 0 only */ struct svc_xprt *cb_xprt; /* minorversion 1 only */ }; @@ -160,6 +153,15 @@ struct nfsd4_clid_slot { struct nfsd4_create_session sl_cr_ses; }; +struct nfsd4_conn { + struct list_head cn_persession; + struct svc_xprt *cn_xprt; + struct svc_xpt_user cn_xpt_user; + struct nfsd4_session *cn_session; +/* CDFC4_FORE, CDFC4_BACK: */ + unsigned char cn_flags; +}; + struct nfsd4_session { struct kref se_ref; struct list_head se_hash; /* hash by sessionid */ @@ -169,6 +171,9 @@ struct nfsd4_session { struct nfs4_sessionid se_sessionid; struct nfsd4_channel_attrs se_fchannel; struct nfsd4_channel_attrs se_bchannel; + struct list_head se_conns; + u32 se_cb_prog; + u32 se_cb_seq_nr; struct nfsd4_slot *se_slots[]; /* forward channel slots */ }; @@ -221,24 +226,32 @@ struct nfs4_client { clientid_t cl_clientid; /* generated by server */ nfs4_verifier cl_confirm; /* generated by server */ u32 cl_firststate; /* recovery dir creation */ + u32 cl_minorversion; /* for v4.0 and v4.1 callbacks: */ struct nfs4_cb_conn cl_cb_conn; +#define NFSD4_CLIENT_CB_UPDATE 1 +#define NFSD4_CLIENT_KILL 2 + unsigned long cl_cb_flags; struct rpc_clnt *cl_cb_client; + u32 cl_cb_ident; atomic_t cl_cb_set; + struct nfsd4_callback cl_cb_null; + struct nfsd4_session *cl_cb_session; + + /* for all client information that callback code might need: */ + spinlock_t cl_lock; /* for nfs41 */ struct list_head cl_sessions; struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ u32 cl_exchange_flags; - struct nfs4_sessionid cl_sessionid; /* number of rpc's in progress over an associated session: */ atomic_t cl_refcount; /* for nfs41 callbacks */ /* We currently support a single back channel with a single slot */ unsigned long cl_cb_slot_busy; - u32 cl_cb_seq_nr; struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ /* wait here for slots */ }; @@ -440,12 +453,13 @@ extern int nfs4_in_grace(void); extern __be32 nfs4_check_open_reclaim(clientid_t *clid); extern void nfs4_free_stateowner(struct kref *kref); extern int set_callback_cred(void); -extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); +extern void nfsd4_probe_callback(struct nfs4_client *clp); +extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); extern void nfsd4_do_callback_rpc(struct work_struct *); extern void nfsd4_cb_recall(struct nfs4_delegation *dp); extern int nfsd4_create_callback_queue(void); extern void nfsd4_destroy_callback_queue(void); -extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *); +extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfs4_put_delegation(struct nfs4_delegation *dp); extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); extern void nfsd4_init_recdir(char *recdir_name); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 661a6cf8e826..184938fcff04 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -281,23 +281,13 @@ commit_metadata(struct svc_fh *fhp) { struct inode *inode = fhp->fh_dentry->d_inode; const struct export_operations *export_ops = inode->i_sb->s_export_op; - int error = 0; if (!EX_ISSYNC(fhp->fh_export)) return 0; - if (export_ops->commit_metadata) { - error = export_ops->commit_metadata(inode); - } else { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* metadata only */ - }; - - error = sync_inode(inode, &wbc); - } - - return error; + if (export_ops->commit_metadata) + return export_ops->commit_metadata(inode); + return sync_inode_metadata(inode, 1); } /* diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 185d1607cb00..6e9557ecf161 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -207,7 +207,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, inode->i_ctime = CURRENT_TIME; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + ihold(inode); err = nilfs_add_nondir(dentry, inode); if (!err) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index d926af626177..687d090cea34 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1609,7 +1609,7 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) kunmap_atomic(kaddr, KM_USER0); if (!TestSetPageWriteback(clone_page)) - inc_zone_page_state(clone_page, NR_WRITEBACK); + account_page_writeback(clone_page); unlock_page(clone_page); return 0; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 36802420d69a..4498a208df94 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -88,8 +88,6 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) { struct dentry *parent; struct inode *p_inode; - bool send = false; - bool should_update_children = false; if (!dentry) dentry = path->dentry; @@ -97,29 +95,12 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) return; - spin_lock(&dentry->d_lock); - parent = dentry->d_parent; + parent = dget_parent(dentry); p_inode = parent->d_inode; - if (fsnotify_inode_watches_children(p_inode)) { - if (p_inode->i_fsnotify_mask & mask) { - dget(parent); - send = true; - } - } else { - /* - * The parent doesn't care about events on it's children but - * at least one child thought it did. We need to run all the - * children and update their d_flags to let them know p_inode - * doesn't care about them any more. - */ - dget(parent); - should_update_children = true; - } - - spin_unlock(&dentry->d_lock); - - if (send) { + if (unlikely(!fsnotify_inode_watches_children(p_inode))) + __fsnotify_update_child_dentry_flags(p_inode); + else if (p_inode->i_fsnotify_mask & mask) { /* we are notifying a parent so come up with the new mask which * specifies these are events which came from a child. */ mask |= FS_EVENT_ON_CHILD; @@ -130,13 +111,9 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) else fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0); - dput(parent); } - if (unlikely(should_update_children)) { - __fsnotify_update_child_dentry_flags(p_inode); - dput(parent); - } + dput(parent); } EXPORT_SYMBOL_GPL(__fsnotify_parent); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 33297c005060..21ed10660b80 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -240,6 +240,7 @@ void fsnotify_unmount_inodes(struct list_head *list) { struct inode *inode, *next_i, *need_iput = NULL; + spin_lock(&inode_lock); list_for_each_entry_safe(inode, next_i, list, i_sb_list) { struct inode *need_iput_tmp; @@ -297,4 +298,5 @@ void fsnotify_unmount_inodes(struct list_head *list) spin_lock(&inode_lock); } + spin_unlock(&inode_lock); } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 19c5180f8a28..d3fbe5730bfc 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2911,8 +2911,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) goto unl_upcase_iput_tmp_ino_err_out_now; } if ((sb->s_root = d_alloc_root(vol->root_ino))) { - /* We increment i_count simulating an ntfs_iget(). */ - atomic_inc(&vol->root_ino->i_count); + /* We grab a reference, simulating an ntfs_iget(). */ + ihold(vol->root_ino); ntfs_debug("Exiting, status successful."); /* Release the default upcase if it has no users. */ mutex_lock(&ntfs_lock); @@ -3021,21 +3021,6 @@ iput_tmp_ino_err_out_now: if (vol->mft_ino && vol->mft_ino != tmp_ino) iput(vol->mft_ino); vol->mft_ino = NULL; - /* - * This is needed to get ntfs_clear_extent_inode() called for each - * inode we have ever called ntfs_iget()/iput() on, otherwise we A) - * leak resources and B) a subsequent mount fails automatically due to - * ntfs_iget() never calling down into our ntfs_read_locked_inode() - * method again... FIXME: Do we need to do this twice now because of - * attribute inodes? I think not, so leave as is for now... (AIA) - */ - if (invalidate_inodes(sb)) { - ntfs_error(sb, "Busy inodes left. This is most likely a NTFS " - "driver bug."); - /* Copied from fs/super.c. I just love this message. (-; */ - printk("NTFS: Busy inodes after umount. Self-destruct in 5 " - "seconds. Have a nice day...\n"); - } /* Errors at this stage are irrelevant. */ err_out_now: sb->s_fs_info = NULL; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 5cfeee118158..f1e962cb3b73 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -165,7 +165,7 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock, * ocfs2 never allocates in this function - the only time we * need to use BH_New is when we're extending i_size on a file * system which doesn't support holes, in which case BH_New - * allows block_prepare_write() to zero. + * allows __block_write_begin() to zero. * * If we see this on a sparse file system, then a truncate has * raced us and removed the cluster. In this case, we clear @@ -407,21 +407,6 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) return ret; } -/* - * This is called from ocfs2_write_zero_page() which has handled it's - * own cluster locking and has ensured allocation exists for those - * blocks to be written. - */ -int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, - unsigned from, unsigned to) -{ - int ret; - - ret = block_prepare_write(page, from, to, ocfs2_get_block); - - return ret; -} - /* Taken from ext3. We don't necessarily need the full blown * functionality yet, but IMHO it's better to cut and paste the whole * thing so we can avoid introducing our own bugs (and easily pick up @@ -732,7 +717,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page, } /* - * Some of this taken from block_prepare_write(). We already have our + * Some of this taken from __block_write_begin(). We already have our * mapping by now though, and the entire write will be allocating or * it won't, so not much need to use BH_New. * diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 7606f663da6d..76bfdfda691a 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -22,9 +22,6 @@ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H -int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, - unsigned from, unsigned to); - handle_t *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, unsigned from, diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index a7ebd9d42dc8..75e115f1bd73 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -400,6 +400,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { ip = DLMFS_I(inode); + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); @@ -425,6 +426,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, if (!inode) return NULL; + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 1ca6867935bb..77b4c04a2809 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -796,13 +796,12 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, block_end = block_start + (1 << inode->i_blkbits); /* - * block_start is block-aligned. Bump it by one to - * force ocfs2_{prepare,commit}_write() to zero the + * block_start is block-aligned. Bump it by one to force + * __block_write_begin and block_commit_write to zero the * whole block. */ - ret = ocfs2_prepare_write_nolock(inode, page, - block_start + 1, - block_start + 1); + ret = __block_write_begin(page, block_start + 1, 0, + ocfs2_get_block); if (ret < 0) { mlog_errno(ret); goto out_unlock; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index e7bde21149ae..ff5744e1e36f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -742,7 +742,7 @@ static int ocfs2_link(struct dentry *old_dentry, goto out_commit; } - atomic_inc(&inode->i_count); + ihold(inode); dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index b81bfc016a05..0a8b0ad0c7e2 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -365,25 +365,17 @@ struct device_type part_type = { static void delete_partition_rcu_cb(struct rcu_head *head) { struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); - struct gendisk *disk = part_to_disk(part); - struct request_queue *q = disk->queue; - unsigned long flags; part->start_sect = 0; part->nr_sects = 0; part_stat_set_all(part, 0); put_device(part_to_dev(part)); - - spin_lock_irqsave(q->queue_lock, flags); - elv_quiesce_end(q); - spin_unlock_irqrestore(q->queue_lock, flags); } void delete_partition(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = disk->part_tbl; struct hd_struct *part; - struct request_queue *q = disk->queue; if (partno >= ptbl->len) return; @@ -398,10 +390,6 @@ void delete_partition(struct gendisk *disk, int partno) kobject_put(part->holder_dir); device_del(part_to_dev(part)); - spin_lock_irq(q->queue_lock); - elv_quiesce_start(q); - spin_unlock_irq(q->queue_lock); - call_rcu(&part->rcu_head, delete_partition_rcu_cb); } diff --git a/fs/pipe.c b/fs/pipe.c index 37eb1ebeaa90..d2d7566ce68e 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -954,6 +954,8 @@ static struct inode * get_pipe_inode(void) if (!inode) goto fail_inode; + inode->i_ino = get_next_ino(); + pipe = alloc_pipe_info(inode); if (!pipe) goto fail_iput; diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 50f8f0600f06..6a0068841d96 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -33,8 +33,8 @@ config PROC_KCORE depends on PROC_FS && MMU config PROC_VMCORE - bool "/proc/vmcore support (EXPERIMENTAL)" - depends on PROC_FS && CRASH_DUMP + bool "/proc/vmcore support" + depends on PROC_FS && CRASH_DUMP default y help Exports the dump image of crashed kernel in ELF format. diff --git a/fs/proc/base.c b/fs/proc/base.c index dc5d5f51f3fe..9b094c1c8465 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -771,6 +771,8 @@ static const struct file_operations proc_single_file_operations = { static int mem_open(struct inode* inode, struct file* file) { file->private_data = (void*)((long)current->self_exec_id); + /* OK to pass negative loff_t, we can catch out-of-range */ + file->f_mode |= FMODE_UNSIGNED_OFFSET; return 0; } @@ -1023,28 +1025,47 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } err = strict_strtol(strstrip(buffer), 0, &oom_adjust); if (err) - return -EINVAL; + goto out; if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && - oom_adjust != OOM_DISABLE) - return -EINVAL; + oom_adjust != OOM_DISABLE) { + err = -EINVAL; + goto out; + } task = get_proc_task(file->f_path.dentry->d_inode); - if (!task) - return -ESRCH; + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + if (!lock_task_sighand(task, &flags)) { - put_task_struct(task); - return -ESRCH; + err = -ESRCH; + goto err_task_lock; } if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { - unlock_task_sighand(task, &flags); - put_task_struct(task); - return -EACCES; + err = -EACCES; + goto err_sighand; + } + + if (oom_adjust != task->signal->oom_adj) { + if (oom_adjust == OOM_DISABLE) + atomic_inc(&task->mm->oom_disable_count); + if (task->signal->oom_adj == OOM_DISABLE) + atomic_dec(&task->mm->oom_disable_count); } /* @@ -1065,10 +1086,13 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; +err_sighand: unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); put_task_struct(task); - - return count; +out: + return err < 0 ? err : count; } static const struct file_operations proc_oom_adjust_operations = { @@ -1109,30 +1133,49 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); if (err) - return -EINVAL; + goto out; if (oom_score_adj < OOM_SCORE_ADJ_MIN || - oom_score_adj > OOM_SCORE_ADJ_MAX) - return -EINVAL; + oom_score_adj > OOM_SCORE_ADJ_MAX) { + err = -EINVAL; + goto out; + } task = get_proc_task(file->f_path.dentry->d_inode); - if (!task) - return -ESRCH; + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + if (!lock_task_sighand(task, &flags)) { - put_task_struct(task); - return -ESRCH; + err = -ESRCH; + goto err_task_lock; } + if (oom_score_adj < task->signal->oom_score_adj && !capable(CAP_SYS_RESOURCE)) { - unlock_task_sighand(task, &flags); - put_task_struct(task); - return -EACCES; + err = -EACCES; + goto err_sighand; } + if (oom_score_adj != task->signal->oom_score_adj) { + if (oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_inc(&task->mm->oom_disable_count); + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&task->mm->oom_disable_count); + } task->signal->oom_score_adj = oom_score_adj; /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is @@ -1143,9 +1186,13 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, else task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / OOM_SCORE_ADJ_MAX; +err_sighand: unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); put_task_struct(task); - return count; +out: + return err < 0 ? err : count; } static const struct file_operations proc_oom_score_adj_operations = { @@ -1601,6 +1648,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st /* Common stuff */ ei = PROC_I(inode); + inode->i_ino = get_next_ino(); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_op = &proc_def_inode_operations; @@ -2547,6 +2595,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir, /* Initialize the inode */ ei = PROC_I(inode); + inode->i_ino = get_next_ino(); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; /* diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 2fc52552271d..b652cb00906b 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -23,6 +23,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (!inode) goto out; + inode->i_ino = get_next_ino(); + sysctl_head_get(head); ei = PROC_I(inode); ei->sysctl = head; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index a5ebae70dc6d..67fadb1ad2c1 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -58,6 +58,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, struct inode * inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_mapping->a_ops = &ramfs_aops; inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; diff --git a/fs/read_write.c b/fs/read_write.c index e757ef26e4ce..9cd9d148105d 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -31,6 +31,20 @@ const struct file_operations generic_ro_fops = { EXPORT_SYMBOL(generic_ro_fops); +static int +__negative_fpos_check(struct file *file, loff_t pos, size_t count) +{ + /* + * pos or pos+count is negative here, check overflow. + * too big "count" will be caught in rw_verify_area(). + */ + if ((pos < 0) && (pos + count < pos)) + return -EOVERFLOW; + if (file->f_mode & FMODE_UNSIGNED_OFFSET) + return 0; + return -EINVAL; +} + /** * generic_file_llseek_unlocked - lockless generic llseek implementation * @file: file structure to seek on @@ -62,7 +76,9 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) break; } - if (offset < 0 || offset > inode->i_sb->s_maxbytes) + if (offset < 0 && __negative_fpos_check(file, offset, 0)) + return -EINVAL; + if (offset > inode->i_sb->s_maxbytes) return -EINVAL; /* Special lock needed here? */ @@ -137,7 +153,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) offset += file->f_pos; } retval = -EINVAL; - if (offset >= 0) { + if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; @@ -221,6 +237,7 @@ bad: } #endif + /* * rw_verify_area doesn't like huge counts. We limit * them to something that fits in "int" so that others @@ -238,8 +255,11 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count if (unlikely((ssize_t) count < 0)) return retval; pos = *ppos; - if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) - return retval; + if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) { + retval = __negative_fpos_check(file, pos, count); + if (retval) + return retval; + } if (unlikely(inode->i_flock && mandatory_lock(inode))) { retval = locks_mandatory_area( diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index caa758377d66..41656d40dc5c 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -22,8 +22,6 @@ int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); -int reiserfs_prepare_write(struct file *f, struct page *page, - unsigned from, unsigned to); void reiserfs_evict_inode(struct inode *inode) { @@ -165,7 +163,7 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, ** but tail is still sitting in a direct item, and we can't write to ** it. So, look through this page, and check all the mapped buffers ** to make sure they have valid block numbers. Any that don't need -** to be unmapped, so that block_prepare_write will correctly call +** to be unmapped, so that __block_write_begin will correctly call ** reiserfs_get_block to convert the tail into an unformatted node */ static inline void fix_tail_page_for_writing(struct page *page) @@ -439,13 +437,13 @@ static int reiserfs_bmap(struct inode *inode, sector_t block, } /* special version of get_block that is only used by grab_tail_page right -** now. It is sent to block_prepare_write, and when you try to get a +** now. It is sent to __block_write_begin, and when you try to get a ** block past the end of the file (or a block from a hole) it returns -** -ENOENT instead of a valid buffer. block_prepare_write expects to +** -ENOENT instead of a valid buffer. __block_write_begin expects to ** be able to do i/o on the buffers returned, unless an error value ** is also returned. ** -** So, this allows block_prepare_write to be used for reading a single block +** So, this allows __block_write_begin to be used for reading a single block ** in a page. Where it does not produce a valid page for holes, or past the ** end of the file. This turns out to be exactly what we need for reading ** tails for conversion. @@ -558,11 +556,12 @@ static int convert_tail_for_hole(struct inode *inode, ** ** We must fix the tail page for writing because it might have buffers ** that are mapped, but have a block number of 0. This indicates tail - ** data that has been read directly into the page, and block_prepare_write - ** won't trigger a get_block in this case. + ** data that has been read directly into the page, and + ** __block_write_begin won't trigger a get_block in this case. */ fix_tail_page_for_writing(tail_page); - retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end); + retval = __reiserfs_write_begin(tail_page, tail_start, + tail_end - tail_start); if (retval) goto unlock; @@ -2033,7 +2032,7 @@ static int grab_tail_page(struct inode *inode, /* start within the page of the last block in the file */ start = (offset / blocksize) * blocksize; - error = block_prepare_write(page, start, offset, + error = __block_write_begin(page, start, offset - start, reiserfs_get_block_create_0); if (error) goto unlock; @@ -2438,7 +2437,7 @@ static int reiserfs_write_full_page(struct page *page, /* from this point on, we know the buffer is mapped to a * real block and not a direct item */ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else { if (!trylock_buffer(bh)) { @@ -2628,8 +2627,7 @@ static int reiserfs_write_begin(struct file *file, return ret; } -int reiserfs_prepare_write(struct file *f, struct page *page, - unsigned from, unsigned to) +int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len) { struct inode *inode = page->mapping->host; int ret; @@ -2650,7 +2648,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page, th->t_refcount++; } - ret = block_prepare_write(page, from, to, reiserfs_get_block); + ret = __block_write_begin(page, from, len, reiserfs_get_block); if (ret && reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th = current->journal_info; /* this gets a little ugly. If reiserfs_get_block returned an diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 5cbb81e134ac..adf22b485cea 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -160,8 +160,6 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); -int reiserfs_prepare_write(struct file *f, struct page *page, - unsigned from, unsigned to); /* ** reiserfs_unpack ** Function try to convert tail from direct item into indirect. @@ -200,7 +198,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) } /* we unpack by finding the page with the tail, and calling - ** reiserfs_prepare_write on that page. This will force a + ** __reiserfs_write_begin on that page. This will force a ** reiserfs_get_block to unpack the tail for us. */ index = inode->i_size >> PAGE_CACHE_SHIFT; @@ -210,7 +208,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) if (!page) { goto out; } - retval = reiserfs_prepare_write(NULL, page, write_from, write_from); + retval = __reiserfs_write_begin(page, write_from, 0); if (retval) goto out_unlock; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index ee78d4a0086a..ba5f51ec3458 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1156,7 +1156,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, inode->i_ctime = CURRENT_TIME_SEC; reiserfs_update_sd(&th, inode); - atomic_inc(&inode->i_count); + ihold(inode); d_instantiate(dentry, inode); retval = journal_end(&th, dir->i_sb, jbegin_count); reiserfs_write_unlock(dir->i_sb); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8c4cf273c672..5d04a7828e7a 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -418,13 +418,11 @@ static inline __u32 xattr_hash(const char *msg, int len) int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); -int reiserfs_prepare_write(struct file *f, struct page *page, - unsigned from, unsigned to); static void update_ctime(struct inode *inode) { struct timespec now = current_fs_time(inode->i_sb); - if (hlist_unhashed(&inode->i_hash) || !inode->i_nlink || + if (inode_unhashed(inode) || !inode->i_nlink || timespec_equal(&inode->i_ctime, &now)) return; @@ -532,8 +530,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, rxh->h_hash = cpu_to_le32(xahash); } - err = reiserfs_prepare_write(NULL, page, page_offset, - page_offset + chunk + skip); + err = __reiserfs_write_begin(page, page_offset, chunk + skip); if (!err) { if (buffer) memcpy(data + skip, buffer + buffer_pos, chunk); diff --git a/fs/seq_file.c b/fs/seq_file.c index 0e7cb1395a94..05d6b0e78c95 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -462,9 +462,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, if (size) { char *p; - spin_lock(&dcache_lock); p = __d_path(path, root, buf, size); - spin_unlock(&dcache_lock); res = PTR_ERR(p); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); diff --git a/fs/signalfd.c b/fs/signalfd.c index 74047304b01a..492465b451dd 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -99,6 +99,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, #ifdef __ARCH_SI_TRAPNO err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); #endif +#ifdef BUS_MCEERR_AO + /* + * Other callers might not initialize the si_lsb field, + * so check explicitly for the right codes here. + */ + if (kinfo->si_code == BUS_MCEERR_AR || + kinfo->si_code == BUS_MCEERR_AO) + err |= __put_user((short) kinfo->si_addr_lsb, + &uinfo->ssi_addr_lsb); +#endif break; case __SI_CHLD: err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index 00a70cab1f36..f678d421e541 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -406,21 +406,15 @@ void smb_renew_times(struct dentry * dentry) { dget(dentry); - spin_lock(&dentry->d_lock); - for (;;) { - struct dentry *parent; + dentry->d_time = jiffies; - dentry->d_time = jiffies; - if (IS_ROOT(dentry)) - break; - parent = dentry->d_parent; - dget(parent); - spin_unlock(&dentry->d_lock); + while (!IS_ROOT(dentry)) { + struct dentry *parent = dget_parent(dentry); dput(dentry); dentry = parent; - spin_lock(&dentry->d_lock); + + dentry->d_time = jiffies; } - spin_unlock(&dentry->d_lock); dput(dentry); } diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 8fc5e50e142f..f6e9ee59757e 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -229,7 +229,6 @@ smb_invalidate_inodes(struct smb_sb_info *server) { VERBOSE("\n"); shrink_dcache_sb(SB_of(server)); - invalidate_inodes(SB_of(server)); } /* diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c index 71c29b6670b4..3dcf638d4d3a 100644 --- a/fs/smbfs/proc.c +++ b/fs/smbfs/proc.c @@ -332,16 +332,15 @@ static int smb_build_path(struct smb_sb_info *server, unsigned char *buf, * and store it in reversed order [see reverse_string()] */ dget(entry); - spin_lock(&entry->d_lock); while (!IS_ROOT(entry)) { struct dentry *parent; if (maxlen < (3<<unicode)) { - spin_unlock(&entry->d_lock); dput(entry); return -ENAMETOOLONG; } + spin_lock(&entry->d_lock); len = server->ops->convert(path, maxlen-2, entry->d_name.name, entry->d_name.len, server->local_nls, server->remote_nls); @@ -359,15 +358,12 @@ static int smb_build_path(struct smb_sb_info *server, unsigned char *buf, } *path++ = '\\'; maxlen -= len+1; - - parent = entry->d_parent; - dget(parent); spin_unlock(&entry->d_lock); + + parent = dget_parent(entry); dput(entry); entry = parent; - spin_lock(&entry->d_lock); } - spin_unlock(&entry->d_lock); dput(entry); reverse_string(buf, path-buf); diff --git a/fs/super.c b/fs/super.c index 8819e3a7ff20..b9c9869165db 100644 --- a/fs/super.c +++ b/fs/super.c @@ -273,14 +273,14 @@ void generic_shutdown_super(struct super_block *sb) get_fs_excl(); sb->s_flags &= ~MS_ACTIVE; - /* bad name - it should be evict_inodes() */ - invalidate_inodes(sb); + fsnotify_unmount_inodes(&sb->s_inodes); + + evict_inodes(sb); if (sop->put_super) sop->put_super(sb); - /* Forget any remaining inodes */ - if (invalidate_inodes(sb)) { + if (!list_empty(&sb->s_inodes)) { printk("VFS: Busy inodes after unmount of %s. " "Self-destruct in 5 seconds. Have a nice day...\n", sb->s_id); diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 33e047b59b8d..11e7f7d11cd0 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -126,7 +126,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir, inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + ihold(inode); return add_nondir(dentry, inode); } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 87ebcce72213..14f64b689d7f 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -550,7 +550,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, lock_2_inodes(dir, inode); inc_nlink(inode); - atomic_inc(&inode->i_count); + ihold(inode); inode->i_ctime = ubifs_current_time(inode); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index bf5fc674193c..6d8dc02baebb 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1101,7 +1101,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, inc_nlink(inode); inode->i_ctime = current_fs_time(inode->i_sb); mark_inode_dirty(inode); - atomic_inc(&inode->i_count); + ihold(inode); d_instantiate(dentry, inode); unlock_kernel(); diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index b056f02b1fb3..12f39b9e4437 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -180,7 +180,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + ihold(inode); error = ufs_add_nondir(dentry, inode); unlock_kernel(); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index b552f816de15..c9af48fffcd7 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1139,8 +1139,7 @@ xfs_vm_writepage( type = IO_DELAY; flags = BMAPI_ALLOCATE; - if (wbc->sync_mode == WB_SYNC_NONE && - wbc->nonblocking) + if (wbc->sync_mode == WB_SYNC_NONE) flags |= BMAPI_TRYLOCK; } diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index ba5312802aa9..63fd2c07cb57 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1580,6 +1580,7 @@ xfs_mapping_buftarg( XFS_BUFTARG_NAME(btp)); return ENOMEM; } + inode->i_ino = get_next_ino(); inode->i_mode = S_IFBLK; inode->i_bdev = bdev; inode->i_rdev = bdev->bd_dev; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index ec858e09d546..96107efc0c61 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -317,7 +317,7 @@ xfs_vn_link( if (unlikely(error)) return -error; - atomic_inc(&inode->i_count); + ihold(inode); d_instantiate(dentry, inode); return 0; } @@ -760,7 +760,9 @@ xfs_setup_inode( inode->i_ino = ip->i_ino; inode->i_state = I_NEW; - inode_add_to_lists(ip->i_mount->m_super, inode); + + inode_sb_list_add(inode); + insert_inode_hash(inode); inode->i_mode = ip->i_d.di_mode; inode->i_nlink = ip->i_d.di_nlink; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index ab31ce5aeaf9..cf808782c065 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -576,7 +576,7 @@ xfs_max_file_offset( /* Figure out maximum filesize, on Linux this can depend on * the filesystem blocksize (on 32 bit platforms). - * __block_prepare_write does this in an [unsigned] long... + * __block_write_begin does this in an [unsigned] long... * page->index << (PAGE_CACHE_SHIFT - bbits) * So, for page sized blocks (4K on 32 bit platforms), * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index fac52290de90..fb2ca2e4cdc9 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -500,7 +500,7 @@ void xfs_mark_inode_dirty_sync(xfs_inode_t *); #define IHOLD(ip) \ do { \ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ - atomic_inc(&(VFS_I(ip)->i_count)); \ + ihold(VFS_I(ip)); \ trace_xfs_ihold(ip, _THIS_IP_); \ } while (0) |
