From fd8c37eccdda21153298997417144b38b1623196 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 15 Dec 2010 20:26:48 -0500 Subject: [PATCH 01/44] ext4: Simplify the usage of clear_opt() and set_opt() macros Change clear_opt() and set_opt() to take a superblock pointer instead of a pointer to EXT4_SB(sb)->s_mount_opt. This makes it easier for us to support a second mount option field. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 6 +- fs/ext4/mballoc.c | 2 +- fs/ext4/super.c | 162 +++++++++++++++++++++++----------------------- 3 files changed, 86 insertions(+), 84 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 94ce3d7a1c4b..2d93620d092e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -917,8 +917,10 @@ struct ext4_inode_info { #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ -#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt -#define set_opt(o, opt) o |= EXT4_MOUNT_##opt +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt #define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ EXT4_MOUNT_##opt) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5b4d4e3a4d58..731b6f738a03 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2617,7 +2617,7 @@ static inline int ext4_issue_discard(struct super_block *sb, ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); if (ret == -EOPNOTSUPP) { ext4_warning(sb, "discard not supported, disabling"); - clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); + clear_opt(sb, DISCARD); } return ret; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fb15c9c0be74..cf7d9131d785 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1386,7 +1386,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) sbi->s_qf_names[qtype] = NULL; return 0; } - set_opt(sbi->s_mount_opt, QUOTA); + set_opt(sb, QUOTA); return 1; } @@ -1441,21 +1441,21 @@ static int parse_options(char *options, struct super_block *sb, switch (token) { case Opt_bsd_df: ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - clear_opt(sbi->s_mount_opt, MINIX_DF); + clear_opt(sb, MINIX_DF); break; case Opt_minix_df: ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - set_opt(sbi->s_mount_opt, MINIX_DF); + set_opt(sb, MINIX_DF); break; case Opt_grpid: ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - set_opt(sbi->s_mount_opt, GRPID); + set_opt(sb, GRPID); break; case Opt_nogrpid: ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - clear_opt(sbi->s_mount_opt, GRPID); + clear_opt(sb, GRPID); break; case Opt_resuid: @@ -1473,38 +1473,38 @@ static int parse_options(char *options, struct super_block *sb, /* *sb_block = match_int(&args[0]); */ break; case Opt_err_panic: - clear_opt(sbi->s_mount_opt, ERRORS_CONT); - clear_opt(sbi->s_mount_opt, ERRORS_RO); - set_opt(sbi->s_mount_opt, ERRORS_PANIC); + clear_opt(sb, ERRORS_CONT); + clear_opt(sb, ERRORS_RO); + set_opt(sb, ERRORS_PANIC); break; case Opt_err_ro: - clear_opt(sbi->s_mount_opt, ERRORS_CONT); - clear_opt(sbi->s_mount_opt, ERRORS_PANIC); - set_opt(sbi->s_mount_opt, ERRORS_RO); + clear_opt(sb, ERRORS_CONT); + clear_opt(sb, ERRORS_PANIC); + set_opt(sb, ERRORS_RO); break; case Opt_err_cont: - clear_opt(sbi->s_mount_opt, ERRORS_RO); - clear_opt(sbi->s_mount_opt, ERRORS_PANIC); - set_opt(sbi->s_mount_opt, ERRORS_CONT); + clear_opt(sb, ERRORS_RO); + clear_opt(sb, ERRORS_PANIC); + set_opt(sb, ERRORS_CONT); break; case Opt_nouid32: - set_opt(sbi->s_mount_opt, NO_UID32); + set_opt(sb, NO_UID32); break; case Opt_debug: - set_opt(sbi->s_mount_opt, DEBUG); + set_opt(sb, DEBUG); break; case Opt_oldalloc: - set_opt(sbi->s_mount_opt, OLDALLOC); + set_opt(sb, OLDALLOC); break; case Opt_orlov: - clear_opt(sbi->s_mount_opt, OLDALLOC); + clear_opt(sb, OLDALLOC); break; #ifdef CONFIG_EXT4_FS_XATTR case Opt_user_xattr: - set_opt(sbi->s_mount_opt, XATTR_USER); + set_opt(sb, XATTR_USER); break; case Opt_nouser_xattr: - clear_opt(sbi->s_mount_opt, XATTR_USER); + clear_opt(sb, XATTR_USER); break; #else case Opt_user_xattr: @@ -1514,10 +1514,10 @@ static int parse_options(char *options, struct super_block *sb, #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL case Opt_acl: - set_opt(sbi->s_mount_opt, POSIX_ACL); + set_opt(sb, POSIX_ACL); break; case Opt_noacl: - clear_opt(sbi->s_mount_opt, POSIX_ACL); + clear_opt(sb, POSIX_ACL); break; #else case Opt_acl: @@ -1536,7 +1536,7 @@ static int parse_options(char *options, struct super_block *sb, "Cannot specify journal on remount"); return 0; } - set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); + set_opt(sb, UPDATE_JOURNAL); break; case Opt_journal_dev: if (is_remount) { @@ -1549,14 +1549,14 @@ static int parse_options(char *options, struct super_block *sb, *journal_devnum = option; break; case Opt_journal_checksum: - set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); + set_opt(sb, JOURNAL_CHECKSUM); break; case Opt_journal_async_commit: - set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); - set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); + set_opt(sb, JOURNAL_ASYNC_COMMIT); + set_opt(sb, JOURNAL_CHECKSUM); break; case Opt_noload: - set_opt(sbi->s_mount_opt, NOLOAD); + set_opt(sb, NOLOAD); break; case Opt_commit: if (match_int(&args[0], &option)) @@ -1599,15 +1599,15 @@ static int parse_options(char *options, struct super_block *sb, return 0; } } else { - clear_opt(sbi->s_mount_opt, DATA_FLAGS); + clear_opt(sb, DATA_FLAGS); sbi->s_mount_opt |= data_opt; } break; case Opt_data_err_abort: - set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + set_opt(sb, DATA_ERR_ABORT); break; case Opt_data_err_ignore: - clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + clear_opt(sb, DATA_ERR_ABORT); break; #ifdef CONFIG_QUOTA case Opt_usrjquota: @@ -1647,12 +1647,12 @@ set_qf_format: break; case Opt_quota: case Opt_usrquota: - set_opt(sbi->s_mount_opt, QUOTA); - set_opt(sbi->s_mount_opt, USRQUOTA); + set_opt(sb, QUOTA); + set_opt(sb, USRQUOTA); break; case Opt_grpquota: - set_opt(sbi->s_mount_opt, QUOTA); - set_opt(sbi->s_mount_opt, GRPQUOTA); + set_opt(sb, QUOTA); + set_opt(sb, GRPQUOTA); break; case Opt_noquota: if (sb_any_quota_loaded(sb)) { @@ -1660,9 +1660,9 @@ set_qf_format: "options when quota turned on"); return 0; } - clear_opt(sbi->s_mount_opt, QUOTA); - clear_opt(sbi->s_mount_opt, USRQUOTA); - clear_opt(sbi->s_mount_opt, GRPQUOTA); + clear_opt(sb, QUOTA); + clear_opt(sb, USRQUOTA); + clear_opt(sb, GRPQUOTA); break; #else case Opt_quota: @@ -1688,7 +1688,7 @@ set_qf_format: sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; break; case Opt_nobarrier: - clear_opt(sbi->s_mount_opt, BARRIER); + clear_opt(sb, BARRIER); break; case Opt_barrier: if (args[0].from) { @@ -1697,9 +1697,9 @@ set_qf_format: } else option = 1; /* No argument, default to 1 */ if (option) - set_opt(sbi->s_mount_opt, BARRIER); + set_opt(sb, BARRIER); else - clear_opt(sbi->s_mount_opt, BARRIER); + clear_opt(sb, BARRIER); break; case Opt_ignore: break; @@ -1723,17 +1723,17 @@ set_qf_format: "Ignoring deprecated bh option"); break; case Opt_i_version: - set_opt(sbi->s_mount_opt, I_VERSION); + set_opt(sb, I_VERSION); sb->s_flags |= MS_I_VERSION; break; case Opt_nodelalloc: - clear_opt(sbi->s_mount_opt, DELALLOC); + clear_opt(sb, DELALLOC); break; case Opt_mblk_io_submit: - set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); + set_opt(sb, MBLK_IO_SUBMIT); break; case Opt_nomblk_io_submit: - clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); + clear_opt(sb, MBLK_IO_SUBMIT); break; case Opt_stripe: if (match_int(&args[0], &option)) @@ -1743,13 +1743,13 @@ set_qf_format: sbi->s_stripe = option; break; case Opt_delalloc: - set_opt(sbi->s_mount_opt, DELALLOC); + set_opt(sb, DELALLOC); break; case Opt_block_validity: - set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + set_opt(sb, BLOCK_VALIDITY); break; case Opt_noblock_validity: - clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + clear_opt(sb, BLOCK_VALIDITY); break; case Opt_inode_readahead_blks: if (match_int(&args[0], &option)) @@ -1773,7 +1773,7 @@ set_qf_format: option); break; case Opt_noauto_da_alloc: - set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); + set_opt(sb, NO_AUTO_DA_ALLOC); break; case Opt_auto_da_alloc: if (args[0].from) { @@ -1782,24 +1782,24 @@ set_qf_format: } else option = 1; /* No argument, default to 1 */ if (option) - clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); + clear_opt(sb, NO_AUTO_DA_ALLOC); else - set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); + set_opt(sb,NO_AUTO_DA_ALLOC); break; case Opt_discard: - set_opt(sbi->s_mount_opt, DISCARD); + set_opt(sb, DISCARD); break; case Opt_nodiscard: - clear_opt(sbi->s_mount_opt, DISCARD); + clear_opt(sb, DISCARD); break; case Opt_dioread_nolock: - set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + set_opt(sb, DIOREAD_NOLOCK); break; case Opt_dioread_lock: - clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + clear_opt(sb, DIOREAD_NOLOCK); break; case Opt_init_inode_table: - set_opt(sbi->s_mount_opt, INIT_INODE_TABLE); + set_opt(sb, INIT_INODE_TABLE); if (args[0].from) { if (match_int(&args[0], &option)) return 0; @@ -1810,7 +1810,7 @@ set_qf_format: sbi->s_li_wait_mult = option; break; case Opt_noinit_inode_table: - clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE); + clear_opt(sb, INIT_INODE_TABLE); break; default: ext4_msg(sb, KERN_ERR, @@ -1822,10 +1822,10 @@ set_qf_format: #ifdef CONFIG_QUOTA if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) - clear_opt(sbi->s_mount_opt, USRQUOTA); + clear_opt(sb, USRQUOTA); if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) - clear_opt(sbi->s_mount_opt, GRPQUOTA); + clear_opt(sb, GRPQUOTA); if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { ext4_msg(sb, KERN_ERR, "old and new quota " @@ -3071,41 +3071,41 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); - set_opt(sbi->s_mount_opt, INIT_INODE_TABLE); + set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) - set_opt(sbi->s_mount_opt, DEBUG); + set_opt(sb, DEBUG); if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", "2.6.38"); - set_opt(sbi->s_mount_opt, GRPID); + set_opt(sb, GRPID); } if (def_mount_opts & EXT4_DEFM_UID16) - set_opt(sbi->s_mount_opt, NO_UID32); + set_opt(sb, NO_UID32); #ifdef CONFIG_EXT4_FS_XATTR if (def_mount_opts & EXT4_DEFM_XATTR_USER) - set_opt(sbi->s_mount_opt, XATTR_USER); + set_opt(sb, XATTR_USER); #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL if (def_mount_opts & EXT4_DEFM_ACL) - set_opt(sbi->s_mount_opt, POSIX_ACL); + set_opt(sb, POSIX_ACL); #endif if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) - set_opt(sbi->s_mount_opt, JOURNAL_DATA); + set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) - set_opt(sbi->s_mount_opt, ORDERED_DATA); + set_opt(sb, ORDERED_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) - set_opt(sbi->s_mount_opt, WRITEBACK_DATA); + set_opt(sb, WRITEBACK_DATA); if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) - set_opt(sbi->s_mount_opt, ERRORS_PANIC); + set_opt(sb, ERRORS_PANIC); else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) - set_opt(sbi->s_mount_opt, ERRORS_CONT); + set_opt(sb, ERRORS_CONT); else - set_opt(sbi->s_mount_opt, ERRORS_RO); + set_opt(sb, ERRORS_RO); if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) - set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + set_opt(sb, BLOCK_VALIDITY); if (def_mount_opts & EXT4_DEFM_DISCARD) - set_opt(sbi->s_mount_opt, DISCARD); + set_opt(sb, DISCARD); sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid); @@ -3114,7 +3114,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) - set_opt(sbi->s_mount_opt, BARRIER); + set_opt(sb, BARRIER); /* * enable delayed allocation by default @@ -3122,7 +3122,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ if (!IS_EXT3_SB(sb) && ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) - set_opt(sbi->s_mount_opt, DELALLOC); + set_opt(sb, DELALLOC); if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, &journal_devnum, &journal_ioprio, NULL, 0)) { @@ -3425,8 +3425,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "suppressed and not mounted read-only"); goto failed_mount_wq; } else { - clear_opt(sbi->s_mount_opt, DATA_FLAGS); - set_opt(sbi->s_mount_opt, WRITEBACK_DATA); + clear_opt(sb, DATA_FLAGS); + set_opt(sb, WRITEBACK_DATA); sbi->s_journal = NULL; needs_recovery = 0; goto no_journal; @@ -3464,9 +3464,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ if (jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) - set_opt(sbi->s_mount_opt, ORDERED_DATA); + set_opt(sb, ORDERED_DATA); else - set_opt(sbi->s_mount_opt, JOURNAL_DATA); + set_opt(sb, JOURNAL_DATA); break; case EXT4_MOUNT_ORDERED_DATA: @@ -3556,18 +3556,18 @@ no_journal: (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " "requested data journaling mode"); - clear_opt(sbi->s_mount_opt, DELALLOC); + clear_opt(sb, DELALLOC); } if (test_opt(sb, DIOREAD_NOLOCK)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " "option - requested data journaling mode"); - clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + clear_opt(sb, DIOREAD_NOLOCK); } if (sb->s_blocksize < PAGE_SIZE) { ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " "option - block size is too small"); - clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + clear_opt(sb, DIOREAD_NOLOCK); } } From 673c610033a8202c037ecd068c7a235495acda17 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 15 Dec 2010 20:28:48 -0500 Subject: [PATCH 02/44] ext4: Move struct ext4_mount_options from ext4.h to super.c Move the ext4_mount_options structure definition from ext4.h, since it is only used in super.c. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 16 ---------------- fs/ext4/super.c | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2d93620d092e..ddae3c435138 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -561,22 +561,6 @@ struct ext4_new_group_data { #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif - -/* - * Mount options - */ -struct ext4_mount_options { - unsigned long s_mount_opt; - uid_t s_resuid; - gid_t s_resgid; - unsigned long s_commit_interval; - u32 s_min_batch_time, s_max_batch_time; -#ifdef CONFIG_QUOTA - int s_jquota_fmt; - char *s_qf_names[MAXQUOTAS]; -#endif -}; - /* Max physical block we can addres w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cf7d9131d785..7aa3a790363a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4166,6 +4166,21 @@ static int ext4_unfreeze(struct super_block *sb) return 0; } +/* + * Structure to save mount options for ext4_remount's benefit + */ +struct ext4_mount_options { + unsigned long s_mount_opt; + uid_t s_resuid; + gid_t s_resgid; + unsigned long s_commit_interval; + u32 s_min_batch_time, s_max_batch_time; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; +#endif +}; + static int ext4_remount(struct super_block *sb, int *flags, char *data) { struct ext4_super_block *es; From a2595b8aa67011419dae26b47e474f46df902989 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 15 Dec 2010 20:30:48 -0500 Subject: [PATCH 03/44] ext4: Add second mount options field since the s_mount_opt is full up Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 8 ++++++++ fs/ext4/super.c | 7 +++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ddae3c435138..17baecbf8cda 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -908,6 +908,13 @@ struct ext4_inode_info { #define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ EXT4_MOUNT_##opt) +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + #define ext4_set_bit ext2_set_bit #define ext4_set_bit_atomic ext2_set_bit_atomic #define ext4_clear_bit ext2_clear_bit @@ -1073,6 +1080,7 @@ struct ext4_sb_info { struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ struct buffer_head **s_group_desc; unsigned int s_mount_opt; + unsigned int s_mount_opt2; unsigned int s_mount_flags; ext4_fsblk_t s_sb_block; uid_t s_resuid; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7aa3a790363a..072ff973ff2b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1895,12 +1895,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, ext4_commit_super(sb, 1); if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " - "bpg=%lu, ipg=%lu, mo=%04x]\n", + "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", sb->s_blocksize, sbi->s_groups_count, EXT4_BLOCKS_PER_GROUP(sb), EXT4_INODES_PER_GROUP(sb), - sbi->s_mount_opt); + sbi->s_mount_opt, sbi->s_mount_opt2); return res; } @@ -4171,6 +4171,7 @@ static int ext4_unfreeze(struct super_block *sb) */ struct ext4_mount_options { unsigned long s_mount_opt; + unsigned long s_mount_opt2; uid_t s_resuid; gid_t s_resgid; unsigned long s_commit_interval; @@ -4201,6 +4202,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) lock_super(sb); old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_mount_opt2 = sbi->s_mount_opt2; old_opts.s_resuid = sbi->s_resuid; old_opts.s_resgid = sbi->s_resgid; old_opts.s_commit_interval = sbi->s_commit_interval; @@ -4354,6 +4356,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) restore_opts: sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_mount_opt2 = old_opts.s_mount_opt2; sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sbi->s_commit_interval = old_opts.s_commit_interval; From 225db7d35c33f076115a583abec238a696f4467e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 16 Dec 2010 16:38:26 -0500 Subject: [PATCH 04/44] ext4: Fix up comments in inode.c This fixes up some broken argument descriptions that Namhyung Kim had originally submitted for ext3. This fixes the comments that were still applicable in ext4. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e659597b690b..db3cc913ee8f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -552,7 +552,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, } /** - * ext4_blks_to_allocate: Look up the block map and count the number + * ext4_blks_to_allocate - Look up the block map and count the number * of direct blocks need to be allocated for the given branch. * * @branch: chain of indirect blocks @@ -591,13 +591,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, /** * ext4_alloc_blocks: multiple allocate blocks needed for a branch + * @handle: handle for this transaction + * @inode: inode which needs allocated blocks + * @iblock: the logical block to start allocated at + * @goal: preferred physical block of allocation * @indirect_blks: the number of blocks need to allocate for indirect * blocks - * + * @blks: number of desired blocks * @new_blocks: on return it will store the new block numbers for * the indirect blocks(if needed) and the first direct block, - * @blks: on return it will store the total number of allocated - * direct blocks + * @err: on return it will store the error code + * + * This function will return the number of blocks allocated as + * requested by the passed-in parameters. */ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, ext4_fsblk_t goal, @@ -711,9 +717,11 @@ failed_out: /** * ext4_alloc_branch - allocate and set up a chain of blocks. + * @handle: handle for this transaction * @inode: owner * @indirect_blks: number of allocated indirect blocks * @blks: number of allocated direct blocks + * @goal: preferred place for allocation * @offsets: offsets (in the blocks) to store the pointers to next. * @branch: place to store the chain in. * @@ -826,6 +834,7 @@ failed: /** * ext4_splice_branch - splice the allocated branch onto inode. + * @handle: handle for this transaction * @inode: owner * @block: (logical) number of block we are adding * @chain: chain of indirect blocks (with a missing link - see From a8901d34872dafcafa23efa0865dcecfd4fddf8c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Dec 2010 10:40:47 -0500 Subject: [PATCH 05/44] ext4: Use pr_warning_ratelimited() instead of printk_ratelimit() printk_ratelimit() is deprecated since it is a global instead of a per-printk ratelimit. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index db3cc913ee8f..c0fe426d444a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "ext4_jbd2.h" #include "xattr.h" @@ -3729,8 +3730,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) retry: io_end = ext4_init_io_end(inode, GFP_ATOMIC); if (!io_end) { - if (printk_ratelimit()) - printk(KERN_WARNING "%s: allocation fail\n", __func__); + pr_warning_ratelimited("%s: allocation fail\n", __func__); schedule(); goto retry; } From 670be5a78ac7c80f0d6009d648c84c65a03f373a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Dec 2010 10:44:16 -0500 Subject: [PATCH 06/44] jbd2: Use pr_notice_ratelimited() in journal_alloc_journal_head() We had an open-coded version of printk_ratelimited(); use the provided abstraction to make the code cleaner and easier to understand. Based on a similar patch for fs/jbd from Namhyung Kim Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f837ba953529..06dfd778cae5 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -43,6 +43,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -1982,7 +1983,6 @@ static void jbd2_journal_destroy_jbd2_journal_head_cache(void) static struct journal_head *journal_alloc_journal_head(void) { struct journal_head *ret; - static unsigned long last_warning; #ifdef CONFIG_JBD2_DEBUG atomic_inc(&nr_journal_heads); @@ -1990,11 +1990,7 @@ static struct journal_head *journal_alloc_journal_head(void) ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); if (!ret) { jbd_debug(1, "out of memory for journal_head\n"); - if (time_after(jiffies, last_warning + 5*HZ)) { - printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", - __func__); - last_warning = jiffies; - } + pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); while (!ret) { yield(); ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); From cfef2c6a559b1e37cbc7e7c1b51f82d26abf24ec Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 18 Dec 2010 13:07:34 -0500 Subject: [PATCH 07/44] jbd2: Fix a debug message in do_get_write_access() 'buffer_head' should be 'journal_head' This is a port of a patch which Namhyung Kim made to fs/jbd to jbd2. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6bf0a242613e..10b5e3b1ca8b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -589,7 +589,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, transaction = handle->h_transaction; journal = transaction->t_journal; - jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); + jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); JBUFFER_TRACE(jh, "entry"); repeat: From a1dd53318409ed6a27a8ce4fecf52e1326a100c0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 18 Dec 2010 13:13:40 -0500 Subject: [PATCH 08/44] jbd2: use offset_in_page() instead of manual calculation This is a port to jbd2 of a patch which Namhyung Kim originally made to fs/jbd. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 10b5e3b1ca8b..80f9b2a3880b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -774,7 +774,7 @@ done: J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), "Possible IO failure.\n"); page = jh2bh(jh)->b_page; - offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; + offset = offset_in_page(jh2bh(jh)->b_data); source = kmap_atomic(page, KM_USER0); /* Fire data frozen trigger just before we copy the data */ jbd2_buffer_frozen_trigger(jh, source + offset, From ae00b267f3827ba88309fb74bdf7527396f0acf9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 18 Dec 2010 13:34:20 -0500 Subject: [PATCH 09/44] jbd2: remove unnecessary goto statement This is a port to jbd2 of a patch which Namhyung Kim originally made to fs/jbd. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 80f9b2a3880b..394893242ae3 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -340,9 +340,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) jbd2_free_handle(handle); current->journal_info = NULL; handle = ERR_PTR(err); - goto out; } -out: return handle; } EXPORT_SYMBOL(jbd2__journal_start); From 9a4f6271b68b9693290963b97b320d2e6e6f3446 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 18 Dec 2010 13:36:33 -0500 Subject: [PATCH 10/44] jbd2: move debug message into debug #ifdef This is a port to jbd2 of a patch which Namhyung Kim originally made to fs/jbd. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 2bc4d5f116f1..1cad869494f0 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -299,10 +299,10 @@ int jbd2_journal_skip_recovery(journal_t *journal) #ifdef CONFIG_JBD2_DEBUG int dropped = info.end_transaction - be32_to_cpu(journal->j_superblock->s_sequence); -#endif jbd_debug(1, "JBD: ignoring %d transaction%s from the journal.\n", dropped, (dropped == 1) ? "" : "s"); +#endif journal->j_transaction_sequence = ++info.end_transaction; } From b7271b0a39947f757d7969f6150dcb16c1976b91 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 18 Dec 2010 13:39:38 -0500 Subject: [PATCH 11/44] jbd2: simplify return path of journal_init_common Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 06dfd778cae5..2447bd86f801 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -828,7 +828,7 @@ static journal_t * journal_init_common (void) journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) - goto fail; + return NULL; init_waitqueue_head(&journal->j_wait_transaction_locked); init_waitqueue_head(&journal->j_wait_logspace); @@ -853,14 +853,12 @@ static journal_t * journal_init_common (void) err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); if (err) { kfree(journal); - goto fail; + return NULL; } spin_lock_init(&journal->j_history_lock); return journal; -fail: - return NULL; } /* jbd2_journal_init_dev and jbd2_journal_init_inode: From 6ca7b13dea385484e2fcc89790b8030697c5014a Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Sun, 19 Dec 2010 21:38:46 -0500 Subject: [PATCH 12/44] ext4: Remove redundant unlikely() IS_ERR() already implies unlikely(), so it can be omitted here. Signed-off-by: Tobias Klauser Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index dc40e75cba88..203086498caa 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1036,7 +1036,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); - if (unlikely(IS_ERR(inode))) { + if (IS_ERR(inode)) { if (PTR_ERR(inode) == -ESTALE) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", From b17b35ec13adfeb0346d4b329110b14adc509327 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sun, 19 Dec 2010 21:41:55 -0500 Subject: [PATCH 13/44] ext4: use kmem_cache_zalloc() in ext4_init_io_end() Use advantage of kmem_cache_zalloc() to remove a memset() call in ext4_init_io_end() and save a few bytes. Before: [jj@dragon linux-2.6]$ size fs/ext4/page-io.o text data bss dec hex filename 3016 0 624 3640 e38 fs/ext4/page-io.o After: [jj@dragon linux-2.6]$ size fs/ext4/page-io.o text data bss dec hex filename 3000 0 624 3624 e28 fs/ext4/page-io.o Signed-off-by: Jesper Juhl Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index beacce11ac50..0f5dfe0e83e7 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -158,11 +158,8 @@ static void ext4_end_io_work(struct work_struct *work) ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { - ext4_io_end_t *io = NULL; - - io = kmem_cache_alloc(io_end_cachep, flags); + ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); if (io) { - memset(io, 0, sizeof(*io)); atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; INIT_WORK(&io->work, ext4_end_io_work); From cad3f00763dcf9dfc62cbddf4bd714ab5a71a0eb Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 19 Dec 2010 22:07:02 -0500 Subject: [PATCH 14/44] ext4: optimize ext4_check_dir_entry() with unlikely() annotations This function gets called a lot for large directories, and the answer is almost always "no, no, there's no problem". This means using unlikely() is a good thing. Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 40 +++++++++++++++++++++++----------------- fs/ext4/ext4.h | 3 ++- fs/ext4/namei.c | 14 +++++++------- 3 files changed, 32 insertions(+), 25 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ece76fb6a40c..bd5d74d06399 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -60,7 +60,11 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) return (ext4_filetype_table[filetype]); } - +/* + * Return 0 if the directory entry is OK, and 1 if there is a problem + * + * Note: this is the opposite of what ext2 and ext3 historically returned... + */ int __ext4_check_dir_entry(const char *function, unsigned int line, struct inode *dir, struct ext4_dir_entry_2 *de, @@ -71,26 +75,28 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); - if (rlen < EXT4_DIR_REC_LEN(1)) + if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; - else if (rlen % 4 != 0) + else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; - else if (rlen < EXT4_DIR_REC_LEN(de->name_len)) + else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + else if (unlikely(((char *) de - bh->b_data) + rlen > + dir->i_sb->s_blocksize)) error_msg = "directory entry across blocks"; - else if (le32_to_cpu(de->inode) > - le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)) + else if (unlikely(le32_to_cpu(de->inode) > + le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; + else + return 0; - if (error_msg != NULL) - ext4_error_inode(dir, function, line, bh->b_blocknr, - "bad entry in directory: %s - " - "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset%bh->b_size), offset, - le32_to_cpu(de->inode), - rlen, de->name_len); - return error_msg == NULL ? 1 : 0; + ext4_error_inode(dir, function, line, bh->b_blocknr, + "bad entry in directory: %s - " + "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset%bh->b_size), offset, + le32_to_cpu(de->inode), + rlen, de->name_len); + return 1; } static int ext4_readdir(struct file *filp, @@ -194,8 +200,8 @@ revalidate: while (!error && filp->f_pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (!ext4_check_dir_entry(inode, de, - bh, offset)) { + if (ext4_check_dir_entry(inode, de, + bh, offset)) { /* * On error, skip the f_pos to the next block */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 17baecbf8cda..49f1ceaac57d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1639,7 +1639,8 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct ext4_dir_entry_2 *, struct buffer_head *, unsigned int); #define ext4_check_dir_entry(dir, de, bh, offset) \ - __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset)) + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (de), \ + (bh), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 203086498caa..e275464f7754 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -581,9 +581,9 @@ static int htree_dirblock_to_tree(struct file *dir_file, dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { - if (!ext4_check_dir_entry(dir, de, bh, - (block<i_sb)) - +((char *)de - bh->b_data))) { + if (ext4_check_dir_entry(dir, de, bh, + (block<i_sb)) + + ((char *)de - bh->b_data))) { /* On error, skip the f_pos to the next block. */ dir_file->f_pos = (dir_file->f_pos | (dir->i_sb->s_blocksize - 1)) + 1; @@ -820,7 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh, if ((char *) de + namelen <= dlimit && ext4_match (namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (!ext4_check_dir_entry(dir, de, bh, offset)) + if (ext4_check_dir_entry(dir, de, bh, offset)) return -1; *res_dir = de; return 1; @@ -1269,7 +1269,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *)bh->b_data; top = bh->b_data + blocksize - reclen; while ((char *) de <= top) { - if (!ext4_check_dir_entry(dir, de, bh, offset)) + if (ext4_check_dir_entry(dir, de, bh, offset)) return -EIO; if (ext4_match(namelen, name, de)) return -EEXIST; @@ -1636,7 +1636,7 @@ static int ext4_delete_entry(handle_t *handle, pde = NULL; de = (struct ext4_dir_entry_2 *) bh->b_data; while (i < bh->b_size) { - if (!ext4_check_dir_entry(dir, de, bh, i)) + if (ext4_check_dir_entry(dir, de, bh, i)) return -EIO; if (de == de_del) { BUFFER_TRACE(bh, "get_write_access"); @@ -1919,7 +1919,7 @@ static int empty_dir(struct inode *inode) } de = (struct ext4_dir_entry_2 *) bh->b_data; } - if (!ext4_check_dir_entry(inode, de, bh, offset)) { + if (ext4_check_dir_entry(inode, de, bh, offset)) { de = (struct ext4_dir_entry_2 *)(bh->b_data + sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; From af0b44a1970fed1cda31d2969c99c46ffc515160 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sun, 19 Dec 2010 22:10:31 -0500 Subject: [PATCH 15/44] ext4: zero out nanosecond timestamps for small inodes When nanosecond timestamp resolution isn't supported on an ext4 partition (inode size = 128), stat() appears to be returning uninitialized garbage in the nanosecond component of timestamps. EXT4_INODE_GET_XTIME should zero out tv_nsec when EXT4_FITS_IN_INODE evaluates to false. Reported-by: Jordan Russell Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 49f1ceaac57d..8104ab7eb7d4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -693,6 +693,8 @@ do { \ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ ext4_decode_extra_time(&(inode)->xtime, \ raw_inode->xtime ## _extra); \ + else \ + (inode)->xtime.tv_nsec = 0; \ } while (0) #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ @@ -703,6 +705,8 @@ do { \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ ext4_decode_extra_time(&(einode)->xtime, \ raw_inode->xtime ## _extra); \ + else \ + (einode)->xtime.tv_nsec = 0; \ } while (0) #define i_disk_version osd1.linux1.l_i_version From 94de56ab2062be59d80e2efb7c0dc60ecf616075 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 19 Dec 2010 22:21:02 -0500 Subject: [PATCH 16/44] ext4: Use vzalloc in ext4_fill_flex_info() Signed-off-by: Joe Perches Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 072ff973ff2b..10290f8f5922 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1930,14 +1930,13 @@ static int ext4_fill_flex_info(struct super_block *sb) size = flex_group_count * sizeof(struct flex_groups); sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); if (sbi->s_flex_groups == NULL) { - sbi->s_flex_groups = vmalloc(size); - if (sbi->s_flex_groups) - memset(sbi->s_flex_groups, 0, size); - } - if (sbi->s_flex_groups == NULL) { - ext4_msg(sb, KERN_ERR, "not enough memory for " - "%u flex groups", flex_group_count); - goto failed; + sbi->s_flex_groups = vzalloc(size); + if (sbi->s_flex_groups == NULL) { + ext4_msg(sb, KERN_ERR, + "not enough memory for %u flex groups", + flex_group_count); + goto failed; + } } for (i = 0; i < sbi->s_groups_count; i++) { From 0ff2ea7d84e31176a046a1eabea59d6e4eecd998 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 19 Dec 2010 22:43:19 -0500 Subject: [PATCH 17/44] ext4: Use printf extension %pV Using %pV reduces the number of printk calls and eliminates any possible message interleaving from other printk calls. In function __ext4_grp_locked_error also added KERN_CONT to some printks. Signed-off-by: Joe Perches Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 10290f8f5922..c228da112de0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -388,13 +388,14 @@ static void ext4_handle_error(struct super_block *sb) void __ext4_error(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ", - sb->s_id, function, line, current->comm); - vprintk(fmt, args); - printk("\n"); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", + sb->s_id, function, line, current->comm, &vaf); va_end(args); ext4_handle_error(sb); @@ -543,28 +544,29 @@ void __ext4_abort(struct super_block *sb, const char *function, panic("EXT4-fs panic from previous error\n"); } -void ext4_msg (struct super_block * sb, const char *prefix, - const char *fmt, ...) +void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk("%sEXT4-fs (%s): ", prefix, sb->s_id); - vprintk(fmt, args); - printk("\n"); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); va_end(args); } void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ", - sb->s_id, function, line); - vprintk(fmt, args); - printk("\n"); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", + sb->s_id, function, line, &vaf); va_end(args); } @@ -575,21 +577,25 @@ void __ext4_grp_locked_error(const char *function, unsigned int line, __releases(bitlock) __acquires(bitlock) { + struct va_format vaf; va_list args; struct ext4_super_block *es = EXT4_SB(sb)->s_es; es->s_last_error_ino = cpu_to_le32(ino); es->s_last_error_block = cpu_to_le64(block); __save_error_info(sb, function, line); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", sb->s_id, function, line, grp); if (ino) - printk("inode %lu: ", ino); + printk(KERN_CONT "inode %lu: ", ino); if (block) - printk("block %llu:", (unsigned long long) block); - vprintk(fmt, args); - printk("\n"); + printk(KERN_CONT "block %llu:", (unsigned long long) block); + printk(KERN_CONT "%pV\n", &vaf); va_end(args); if (test_opt(sb, ERRORS_CONT)) { From b72143ab3ed566a12560fa4411a1f02c276dcc39 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 20 Dec 2010 07:26:59 -0500 Subject: [PATCH 18/44] ext4: Add error checking to kmem_cache_alloc() call in ext4_free_blocks() Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 731b6f738a03..46d5414f59c1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4626,7 +4626,11 @@ do_more: * blocks being freed are metadata. these blocks shouldn't * be used until this transaction is committed */ - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + if (!new_entry) { + err = -ENOMEM; + goto error_return; + } new_entry->start_blk = bit; new_entry->group = block_group; new_entry->count = count; From 4f531501e44206862735e81ddf2b70d0dcf6acf6 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 10 Jan 2011 12:04:55 -0500 Subject: [PATCH 19/44] ext4: fix possible overflow in ext4_trim_fs() When determining last group through ext4_get_group_no_and_offset() the result may be wrong in cases when range->start and range-len are too big, because it may overflow when summing up those two numbers. Fix that by checking range->len and limit its value to ext4_blocks_count(). This commit was tested by myself with expected result. Signed-off-by: Lukas Czerner --- fs/ext4/mballoc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 46d5414f59c1..7c603a02633e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4819,6 +4819,7 @@ ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { struct ext4_buddy e4b; + ext4_fsblk_t blocks_count = ext4_blocks_count(EXT4_SB(sb)->s_es); ext4_group_t first_group, last_group; ext4_group_t group, ngroups = ext4_get_groups_count(sb); ext4_grpblk_t cnt = 0, first_block, last_block; @@ -4830,6 +4831,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) minlen = range->minlen >> sb->s_blocksize_bits; trimmed = 0; + if (start >= blocks_count) + return -EINVAL; + if (start + len > blocks_count) + len = blocks_count - start; + if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) return -EINVAL; From 932596366760e3f0dac9998665af1c49afcc4285 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 10 Jan 2011 12:09:59 -0500 Subject: [PATCH 20/44] ext4: remove warning message from ext4_issue_discard helper ext4_issue_discard is supposed to be helper for calling discard, however in case that underlying device does not support discard it prints out the warning message and clears the DISCARD t_mount_opt flag. Since it can be (and is) used by others, it should not do anything and let the caller to handle the error case. This commit removes warning message and flag setting from ext4_issue_discard and use it just in place where it is really needed (release_blocks_on_commit). FITRIM ioctl should not set any flags nor it should print out warning messages, so get rid of the warning as well. Signed-off-by: Lukas Czerner --- fs/ext4/mballoc.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7c603a02633e..12b604abc2fe 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2608,18 +2608,12 @@ int ext4_mb_release(struct super_block *sb) static inline int ext4_issue_discard(struct super_block *sb, ext4_group_t block_group, ext4_grpblk_t block, int count) { - int ret; ext4_fsblk_t discard_block; discard_block = block + ext4_group_first_block_no(sb, block_group); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); - ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); - if (ret == -EOPNOTSUPP) { - ext4_warning(sb, "discard not supported, disabling"); - clear_opt(sb, DISCARD); - } - return ret; + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } /* @@ -2631,7 +2625,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) struct super_block *sb = journal->j_private; struct ext4_buddy e4b; struct ext4_group_info *db; - int err, count = 0, count2 = 0; + int err, ret, count = 0, count2 = 0; struct ext4_free_data *entry; struct list_head *l, *ltmp; @@ -2641,9 +2635,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, entry->group, + if (test_opt(sb, DISCARD)) { + ret = ext4_issue_discard(sb, entry->group, entry->start_blk, entry->count); + if (unlikely(ret == -EOPNOTSUPP)) { + ext4_warning(sb, "discard not supported, " + "disabling"); + clear_opt(sb, DISCARD); + } + } err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -4722,8 +4722,6 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count, ext4_unlock_group(sb, group); ret = ext4_issue_discard(sb, group, start, count); - if (ret) - ext4_std_error(sb, ret); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); From eaeef86718249f5c75b1370f77a9bc11f196a01c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 10 Jan 2011 12:10:07 -0500 Subject: [PATCH 21/44] ext4: clean up ext4_xattr_list()'s error code checking and return strategy Any time you see code that tries to add error codes together, you should want to claw your eyes out... Signed-off-by: "Theodore Ts'o" --- fs/ext4/xattr.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index fa4b899da4b3..ca6ca14a827d 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -427,23 +427,23 @@ cleanup: static int ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - int i_error, b_error; + int ret, ret2; down_read(&EXT4_I(dentry->d_inode)->xattr_sem); - i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size); - if (i_error < 0) { - b_error = 0; - } else { - if (buffer) { - buffer += i_error; - buffer_size -= i_error; - } - b_error = ext4_xattr_block_list(dentry, buffer, buffer_size); - if (b_error < 0) - i_error = 0; + ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + if (buffer) { + buffer += ret; + buffer_size -= ret; } + ret = ext4_xattr_block_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + ret += ret2; +errout: up_read(&EXT4_I(dentry->d_inode)->xattr_sem); - return i_error + b_error; + return ret; } /* From 6e9510b0e0de657ca7c7bfb10ced80b4d237dd58 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Mon, 10 Jan 2011 12:10:30 -0500 Subject: [PATCH 22/44] ext2,ext3,ext4: clarify comment for extN_xattr_set_handle Signed-off-by: Wang Sheng-Hui Signed-off-by: "Theodore Ts'o" --- fs/ext2/xattr.c | 2 +- fs/ext3/xattr.c | 2 +- fs/ext4/xattr.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index f84700be3274..f3ffbf1cc8d0 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -355,7 +355,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb) /* * ext2_xattr_set() * - * Create, replace or remove an extended attribute for this inode. Buffer + * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. The flags XATTR_REPLACE and XATTR_CREATE diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index e69dc6dfaa89..32e6cc23bd9a 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -925,7 +925,7 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, /* * ext3_xattr_set_handle() * - * Create, replace or remove an extended attribute for this inode. Buffer + * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. The flags XATTR_REPLACE and XATTR_CREATE diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index ca6ca14a827d..fc32176eee39 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -947,7 +947,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, /* * ext4_xattr_set_handle() * - * Create, replace or remove an extended attribute for this inode. Buffer + * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. The flags XATTR_REPLACE and XATTR_CREATE From 1f605b302724120777a1c38743cb20e2c8807333 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Mon, 10 Jan 2011 12:10:37 -0500 Subject: [PATCH 23/44] ext2: remove dead code in ext2_xattr_get Reviewed-by: Dan Carpenter Signed-off-by: Wang Sheng-Hui Signed-off-by: "Theodore Ts'o" --- fs/ext2/xattr.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index f3ffbf1cc8d0..c2e4dce984d2 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -199,14 +199,6 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", goto found; entry = next; } - /* Check the remaining name entries */ - while (!IS_LAST_ENTRY(entry)) { - struct ext2_xattr_entry *next = - EXT2_XATTR_NEXT(entry); - if ((char *)next >= end) - goto bad_block; - entry = next; - } if (ext2_xattr_cache_insert(bh)) ea_idebug(inode, "cache insert failed"); error = -ENODATA; From 13195184a8bc119dbd2f905db325a453047971cb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 10 Jan 2011 12:10:44 -0500 Subject: [PATCH 24/44] ext4: test the correct variable in ext4_init_pageio() This is a copy and paste error. The intent was to check "io_page_cachep". We tested "io_page_cachep" earlier. Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 0f5dfe0e83e7..7270dcfca92a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -44,7 +44,7 @@ int __init ext4_init_pageio(void) if (io_page_cachep == NULL) return -ENOMEM; io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); - if (io_page_cachep == NULL) { + if (io_end_cachep == NULL) { kmem_cache_destroy(io_page_cachep); return -ENOMEM; } From f9a62d090cf47fae2fe6f6bd8eb9f24482573fd8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 10 Jan 2011 12:10:50 -0500 Subject: [PATCH 25/44] ext4: use IS_ERR() to check for errors in ext4_error_file d_path() returns an ERR_PTR and it doesn't return NULL. This is in ext4_error_file() and no one actually calls ext4_error_file(). Signed-off-by: Dan Carpenter --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c228da112de0..d49e3b1ec41e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -437,7 +437,7 @@ void ext4_error_file(struct file *file, const char *function, save_error_info(inode->i_sb, function, line); va_start(args, fmt); path = d_path(&(file->f_path), pathname, sizeof(pathname)); - if (!path) + if (IS_ERR(path)) path = "(unknown)"; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu " From f7c21177af0b32a2cd9ee36189637f0c1f0e1e17 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 10 Jan 2011 12:10:55 -0500 Subject: [PATCH 26/44] ext4: Use ext4_error_file() to print the pathname to the corrupted inode Where the file pointer is available, use ext4_error_file() instead of ext4_error_inode(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 30 ++++++++++++++++++++---------- fs/ext4/ext4.h | 15 ++++++++------- fs/ext4/namei.c | 10 +++++----- fs/ext4/super.c | 28 ++++++++++++++++------------ 4 files changed, 49 insertions(+), 34 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index bd5d74d06399..164c56092e58 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -66,7 +66,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) * Note: this is the opposite of what ext2 and ext3 historically returned... */ int __ext4_check_dir_entry(const char *function, unsigned int line, - struct inode *dir, + struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, struct buffer_head *bh, unsigned int offset) @@ -90,12 +90,21 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, else return 0; - ext4_error_inode(dir, function, line, bh->b_blocknr, - "bad entry in directory: %s - " - "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset%bh->b_size), offset, - le32_to_cpu(de->inode), - rlen, de->name_len); + if (filp) + ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0, + "bad entry in directory: %s - offset=%u(%u), " + "inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset%bh->b_size), + offset, le32_to_cpu(de->inode), + rlen, de->name_len); + else + ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0, + "bad entry in directory: %s - offset=%u(%u), " + "inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset%bh->b_size), + offset, le32_to_cpu(de->inode), + rlen, de->name_len); + return 1; } @@ -158,8 +167,9 @@ static int ext4_readdir(struct file *filp, */ if (!bh) { if (!dir_has_error) { - EXT4_ERROR_INODE(inode, "directory " - "contains a hole at offset %Lu", + EXT4_ERROR_FILE(filp, 0, + "directory contains a " + "hole at offset %llu", (unsigned long long) filp->f_pos); dir_has_error = 1; } @@ -200,7 +210,7 @@ revalidate: while (!error && filp->f_pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (ext4_check_dir_entry(inode, de, + if (ext4_check_dir_entry(inode, filp, de, bh, offset)) { /* * On error, skip the f_pos to the next block diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8104ab7eb7d4..2a739255ee05 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -62,8 +62,8 @@ #define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) -#define EXT4_ERROR_FILE(file, fmt, a...) \ - ext4_error_file(__func__, __LINE__, (file), (fmt), ## a) +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) /* data type for block offset of block group */ typedef int ext4_grpblk_t; @@ -1640,11 +1640,12 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb, /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, struct ext4_dir_entry_2 *, struct buffer_head *, unsigned int); -#define ext4_check_dir_entry(dir, de, bh, offset) \ - unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (de), \ - (bh), (offset))) +#define ext4_check_dir_entry(dir, filp, de, bh, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); @@ -1751,8 +1752,8 @@ extern void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, const char *, ...) __attribute__ ((format (printf, 5, 6))); extern void ext4_error_file(struct file *, const char *, unsigned int, - const char *, ...) - __attribute__ ((format (printf, 4, 5))); + ext4_fsblk_t, const char *, ...) + __attribute__ ((format (printf, 5, 6))); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); extern void __ext4_abort(struct super_block *, const char *, unsigned int, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e275464f7754..96a594d86a19 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -581,7 +581,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { - if (ext4_check_dir_entry(dir, de, bh, + if (ext4_check_dir_entry(dir, NULL, de, bh, (block<i_sb)) + ((char *)de - bh->b_data))) { /* On error, skip the f_pos to the next block. */ @@ -820,7 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh, if ((char *) de + namelen <= dlimit && ext4_match (namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (ext4_check_dir_entry(dir, de, bh, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) return -1; *res_dir = de; return 1; @@ -1269,7 +1269,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *)bh->b_data; top = bh->b_data + blocksize - reclen; while ((char *) de <= top) { - if (ext4_check_dir_entry(dir, de, bh, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) return -EIO; if (ext4_match(namelen, name, de)) return -EEXIST; @@ -1636,7 +1636,7 @@ static int ext4_delete_entry(handle_t *handle, pde = NULL; de = (struct ext4_dir_entry_2 *) bh->b_data; while (i < bh->b_size) { - if (ext4_check_dir_entry(dir, de, bh, i)) + if (ext4_check_dir_entry(dir, NULL, de, bh, i)) return -EIO; if (de == de_del) { BUFFER_TRACE(bh, "get_write_access"); @@ -1919,7 +1919,7 @@ static int empty_dir(struct inode *inode) } de = (struct ext4_dir_entry_2 *) bh->b_data; } - if (ext4_check_dir_entry(inode, de, bh, offset)) { + if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { de = (struct ext4_dir_entry_2 *)(bh->b_data + sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d49e3b1ec41e..7728a4ca3d6c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -406,28 +406,31 @@ void ext4_error_inode(struct inode *inode, const char *function, const char *fmt, ...) { va_list args; + struct va_format vaf; struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); es->s_last_error_block = cpu_to_le64(block); save_error_info(inode->i_sb, function, line); va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", inode->i_sb->s_id, function, line, inode->i_ino); if (block) - printk("block %llu: ", block); - printk("comm %s: ", current->comm); - vprintk(fmt, args); - printk("\n"); + printk(KERN_CONT "block %llu: ", block); + printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf); va_end(args); ext4_handle_error(inode->i_sb); } void ext4_error_file(struct file *file, const char *function, - unsigned int line, const char *fmt, ...) + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) { va_list args; + struct va_format vaf; struct ext4_super_block *es; struct inode *inode = file->f_dentry->d_inode; char pathname[80], *path; @@ -435,17 +438,18 @@ void ext4_error_file(struct file *file, const char *function, es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); save_error_info(inode->i_sb, function, line); - va_start(args, fmt); path = d_path(&(file->f_path), pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; printk(KERN_CRIT - "EXT4-fs error (device %s): %s:%d: inode #%lu " - "(comm %s path %s): ", - inode->i_sb->s_id, function, line, inode->i_ino, - current->comm, path); - vprintk(fmt, args); - printk("\n"); + "EXT4-fs error (device %s): %s:%d: inode #%lu: ", + inode->i_sb->s_id, function, line, inode->i_ino); + if (block) + printk(KERN_CONT "block %llu: ", block); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf); va_end(args); ext4_handle_error(inode->i_sb); From f1dffc4c5431c6bd8972489636573c5cd09ab672 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 10 Jan 2011 12:11:00 -0500 Subject: [PATCH 27/44] ext4: ext4_ext_migrate should use NULL not 0 ext4_ext_migrate() calls ext4_new_inode() and passes 0 instead of a pointer to a struct qstr. This patch uses NULL, to make it obvious to the caller that this was a pointer. Signed-off-by: Eric Paris Signed-off-by: "Theodore Ts'o" --- fs/ext4/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 25f3a974b725..b0a126f23c20 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -496,7 +496,7 @@ int ext4_ext_migrate(struct inode *inode) goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, - S_IFREG, 0, goal); + S_IFREG, NULL, goal); if (IS_ERR(tmp_inode)) { retval = -ENOMEM; ext4_journal_stop(handle); From dabd991f9d8e3232bb4531c920daddac8d10d313 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 10 Jan 2011 12:11:16 -0500 Subject: [PATCH 28/44] ext4: add more error checks to ext4_mkdir() Check return value of ext4_journal_get_write_access, ext4_journal_dirty_metadata and ext4_mark_inode_dirty. Move brelse() under 'out_stop' to release bh properly in case of journal error. Signed-off-by: Namhyung Kim Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 96a594d86a19..6dfc5b9de3e6 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1789,7 +1789,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) { handle_t *handle; struct inode *inode; - struct buffer_head *dir_block; + struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; unsigned int blocksize = dir->i_sb->s_blocksize; int err, retries = 0; @@ -1822,7 +1822,9 @@ retry: if (!dir_block) goto out_clear_inode; BUFFER_TRACE(dir_block, "get_write_access"); - ext4_journal_get_write_access(handle, dir_block); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) + goto out_clear_inode; de = (struct ext4_dir_entry_2 *) dir_block->b_data; de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; @@ -1839,10 +1841,12 @@ retry: ext4_set_de_type(dir->i_sb, de, S_IFDIR); inode->i_nlink = 2; BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, dir, dir_block); - brelse(dir_block); - ext4_mark_inode_dirty(handle, inode); - err = ext4_add_entry(handle, dentry, inode); + err = ext4_handle_dirty_metadata(handle, dir, dir_block); + if (err) + goto out_clear_inode; + err = ext4_mark_inode_dirty(handle, inode); + if (!err) + err = ext4_add_entry(handle, dentry, inode); if (err) { out_clear_inode: clear_nlink(inode); @@ -1853,10 +1857,13 @@ out_clear_inode: } ext4_inc_count(handle, dir); ext4_update_dx_flag(dir); - ext4_mark_inode_dirty(handle, dir); + err = ext4_mark_inode_dirty(handle, dir); + if (err) + goto out_clear_inode; d_instantiate(dentry, inode); unlock_new_inode(inode); out_stop: + brelse(dir_block); ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; From ad4fb9cafe100a4a9de6e0529015e584d94ac8dc Mon Sep 17 00:00:00 2001 From: Kazuya Mio Date: Mon, 10 Jan 2011 12:12:28 -0500 Subject: [PATCH 29/44] ext4: fix 32bit overflow in ext4_ext_find_goal() ext4_ext_find_goal() returns an ideal physical block number that the block allocator tries to allocate first. However, if a required file offset is smaller than the existing extent's one, ext4_ext_find_goal() returns a wrong block number because it may overflow at "block - le32_to_cpu(ex->ee_block)". This patch fixes the problem. ext4_ext_find_goal() will also return a wrong block number in case a file offset of the existing extent is too big. In this case, the ideal physical block number is fixed in ext4_mb_initialize_context(), so it's no problem. reproduce: # dd if=/dev/zero of=/mnt/mp1/tmp bs=127M count=1 oflag=sync # dd if=/dev/zero of=/mnt/mp1/file bs=512K count=1 seek=1 oflag=sync # filefrag -v /mnt/mp1/file Filesystem type is: ef53 File size of /mnt/mp1/file is 1048576 (256 blocks, blocksize 4096) ext logical physical expected length flags 0 128 67456 128 eof /mnt/mp1/file: 2 extents found # rm -rf /mnt/mp1/tmp # echo $((512*4096)) > /sys/fs/ext4/loop0/mb_stream_req # dd if=/dev/zero of=/mnt/mp1/file bs=512K count=1 oflag=sync conv=notrunc result (linux-2.6.37-rc2 + ext4 patch queue): # filefrag -v /mnt/mp1/file Filesystem type is: ef53 File size of /mnt/mp1/file is 1048576 (256 blocks, blocksize 4096) ext logical physical expected length flags 0 0 33280 128 1 128 67456 33407 128 eof /mnt/mp1/file: 2 extents found result(apply this patch): # filefrag -v /mnt/mp1/file Filesystem type is: ef53 File size of /mnt/mp1/file is 1048576 (256 blocks, blocksize 4096) ext logical physical expected length flags 0 0 66560 128 1 128 67456 66687 128 eof /mnt/mp1/file: 2 extents found Signed-off-by: Kazuya Mio Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0554c48cb1fd..d53e20f53103 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -117,11 +117,33 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_extent *ex; depth = path->p_depth; - /* try to predict block placement */ + /* + * Try to predict block placement assuming that we are + * filling in a file which will eventually be + * non-sparse --- i.e., in the case of libbfd writing + * an ELF object sections out-of-order but in a way + * the eventually results in a contiguous object or + * executable file, or some database extending a table + * space file. However, this is actually somewhat + * non-ideal if we are writing a sparse file such as + * qemu or KVM writing a raw image file that is going + * to stay fairly sparse, since it will end up + * fragmenting the file system's free space. Maybe we + * should have some hueristics or some way to allow + * userspace to pass a hint to file system, + * especiially if the latter case turns out to be + * common. + */ ex = path[depth].p_ext; - if (ex) - return (ext4_ext_pblock(ex) + - (block - le32_to_cpu(ex->ee_block))); + if (ex) { + ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); + ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); + + if (block > ext_block) + return ext_pblk + (block - ext_block); + else + return ext_pblk - (ext_block - block); + } /* it looks like index is empty; * try to find starting block from index itself */ From f232109773ff5b0c840a6761d74940b9cf0d66ec Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 10 Jan 2011 12:12:36 -0500 Subject: [PATCH 30/44] ext4: replace i_delalloc_reserved_flag with EXT4_STATE_DELALLOC_RESERVED Remove the short element i_delalloc_reserved_flag from the ext4_inode_info structure and replace it a new bit in i_state_flags. Since we have an ext4_inode_info for every ext4 inode cached in the inode cache, any savings we can produce here is a very good thing from a memory utilization perspective. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 3 ++- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 6 +++--- fs/ext4/mballoc.c | 5 +++-- fs/ext4/super.c | 1 - 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 14c3af26c671..adf96b822781 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -592,7 +592,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, * Account for the allocated meta blocks. We will never * fail EDQUOT for metdata, but we do account for it. */ - if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { + if (!(*errp) && + ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { spin_lock(&EXT4_I(inode)->i_block_reservation_lock); EXT4_I(inode)->i_allocated_meta_blocks += ar.len; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2a739255ee05..b7ee66ff9962 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -828,7 +828,6 @@ struct ext4_inode_info { unsigned int i_reserved_data_blocks; unsigned int i_reserved_meta_blocks; unsigned int i_allocated_meta_blocks; - unsigned short i_delalloc_reserved_flag; sector_t i_da_metadata_calc_last_lblock; int i_da_metadata_calc_len; @@ -1235,6 +1234,7 @@ enum { EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ }; #define EXT4_INODE_BIT_FNS(name, field) \ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c0fe426d444a..ac08460921aa 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1330,7 +1330,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * avoid double accounting */ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - EXT4_I(inode)->i_delalloc_reserved_flag = 1; + ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); /* * We need to check for EXT4 here because migrate * could have changed the inode type in between @@ -1360,7 +1360,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_da_update_reserve_space(inode, retval, 1); } if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - EXT4_I(inode)->i_delalloc_reserved_flag = 0; + ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { @@ -2249,7 +2249,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) * affects functions in many different parts of the allocation * call path. This flag exists primarily because we don't * want to change *many* call functions, so ext4_map_blocks() - * will set the magic i_delalloc_reserved_flag once the + * will set the EXT4_STATE_DELALLOC_RESERVED flag once the * inode's allocation semaphore is taken. * * If the blocks in questions were delalloc blocks, set diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 12b604abc2fe..d47a80ec231d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4283,7 +4283,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, * EDQUOT check, as blocks and quotas have been already * reserved when data being copied into pagecache. */ - if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) + if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) ar->flags |= EXT4_MB_DELALLOC_RESERVED; else { /* Without delayed allocation we need to verify @@ -4380,7 +4380,8 @@ out: if (inquota && ar->len < inquota) dquot_free_block(ar->inode, inquota - ar->len); if (!ar->len) { - if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) + if (!ext4_test_inode_state(ar->inode, + EXT4_STATE_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7728a4ca3d6c..f5960d673e4e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -828,7 +828,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; - ei->i_delalloc_reserved_flag = 0; spin_lock_init(&(ei->i_block_reservation_lock)); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; From 01f49d0b9d0209dc1194255b11601e4b94447b36 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 10 Jan 2011 12:13:03 -0500 Subject: [PATCH 31/44] ext4: use ext4_lblk_t instead of sector_t for logical blocks This fixes a number of places where we used sector_t instead of ext4_lblk_t for logical blocks, which for ext4 are still 32-bit data types. No point wasting space in the ext4_inode_info structure, and requiring 64-bit arithmetic on 32-bit systems, when it isn't necessary. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 +- fs/ext4/ext4_extents.h | 2 +- fs/ext4/extents.c | 2 +- fs/ext4/inode.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b7ee66ff9962..746a59853a07 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -828,7 +828,7 @@ struct ext4_inode_info { unsigned int i_reserved_data_blocks; unsigned int i_reserved_meta_blocks; unsigned int i_allocated_meta_blocks; - sector_t i_da_metadata_calc_last_lblock; + ext4_lblk_t i_da_metadata_calc_last_lblock; int i_da_metadata_calc_len; /* on-disk additional length */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 28ce70fd9cd0..dfdda1766927 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -278,7 +278,7 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, } extern int ext4_ext_calc_metadata_amount(struct inode *inode, - sector_t lblocks); + ext4_lblk_t lblocks); extern int ext4_extent_tree_init(handle_t *, struct inode *); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d53e20f53103..f1a4354ea3cf 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -266,7 +266,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) * to allocate @blocks * Worse case is one block per extent */ -int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock) +int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) { struct ext4_inode_info *ei = EXT4_I(inode); int idxs, num = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ac08460921aa..3ae83137cf34 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1091,7 +1091,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode, * Calculate the number of metadata blocks need to reserve * to allocate a block located at @lblock */ -static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) +static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return ext4_ext_calc_metadata_amount(inode, lblock); @@ -1888,7 +1888,7 @@ static int ext4_journalled_write_end(struct file *file, /* * Reserve a single block located at lblock */ -static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) +static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) { int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); From b05e6ae58a13b56e3e11882c1fc71948c9b29760 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 10 Jan 2011 12:13:26 -0500 Subject: [PATCH 32/44] ext4: drop ec_type from the ext4_ext_cache structure We can encode the ec_type information by using ee_len == 0 to denote EXT4_EXT_CACHE_NO, ee_start == 0 to denote EXT4_EXT_CACHE_GAP, and if neither is true, then the cache type must be EXT4_EXT_CACHE_EXTENT. This allows us to reduce the size of ext4_ext_inode by another 8 bytes. (ec_type is 4 bytes, plus another 4 bytes of padding) Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 ++- fs/ext4/ext4_extents.h | 6 +----- fs/ext4/extents.c | 37 +++++++++++++++---------------------- 3 files changed, 18 insertions(+), 28 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 746a59853a07..de937fc10503 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -738,12 +738,13 @@ do { \ /* * storage for cached extent + * If ec_len == 0, then the cache is invalid. + * If ec_start == 0, then the cache represents a gap (null mapping) */ struct ext4_ext_cache { ext4_fsblk_t ec_start; ext4_lblk_t ec_block; __u32 ec_len; /* must be 32bit to return holes */ - __u32 ec_type; }; /* diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index dfdda1766927..2e29abb30f76 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -119,10 +119,6 @@ struct ext4_ext_path { * structure for external API */ -#define EXT4_EXT_CACHE_NO 0 -#define EXT4_EXT_CACHE_GAP 1 -#define EXT4_EXT_CACHE_EXTENT 2 - /* * to be called by ext4_ext_walk_space() * negative retcode - error @@ -197,7 +193,7 @@ static inline unsigned short ext_depth(struct inode *inode) static inline void ext4_ext_invalidate_cache(struct inode *inode) { - EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO; + EXT4_I(inode)->i_cached_extent.ec_len = 0; } static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f1a4354ea3cf..9081d1060a5f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1894,12 +1894,10 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, cbex.ec_block = start; cbex.ec_len = end - start; cbex.ec_start = 0; - cbex.ec_type = EXT4_EXT_CACHE_GAP; } else { cbex.ec_block = le32_to_cpu(ex->ee_block); cbex.ec_len = ext4_ext_get_actual_len(ex); cbex.ec_start = ext4_ext_pblock(ex); - cbex.ec_type = EXT4_EXT_CACHE_EXTENT; } if (unlikely(cbex.ec_len == 0)) { @@ -1939,13 +1937,12 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, static void ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, - __u32 len, ext4_fsblk_t start, int type) + __u32 len, ext4_fsblk_t start) { struct ext4_ext_cache *cex; BUG_ON(len == 0); spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; - cex->ec_type = type; cex->ec_block = block; cex->ec_len = len; cex->ec_start = start; @@ -1998,15 +1995,18 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, } ext_debug(" -> %u:%lu\n", lblock, len); - ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP); + ext4_ext_put_in_cache(inode, lblock, len, 0); } +/* + * Return 0 if cache is invalid; 1 if the cache is valid + */ static int ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_extent *ex) { struct ext4_ext_cache *cex; - int ret = EXT4_EXT_CACHE_NO; + int ret = 0; /* * We borrow i_block_reservation_lock to protect i_cached_extent @@ -2015,11 +2015,9 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, cex = &EXT4_I(inode)->i_cached_extent; /* has cache valid data? */ - if (cex->ec_type == EXT4_EXT_CACHE_NO) + if (cex->ec_len == 0) goto errout; - BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && - cex->ec_type != EXT4_EXT_CACHE_EXTENT); if (in_range(block, cex->ec_block, cex->ec_len)) { ex->ee_block = cpu_to_le32(cex->ec_block); ext4_ext_store_pblock(ex, cex->ec_start); @@ -2027,7 +2025,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, ext_debug("%u cached by %u:%u:%llu\n", block, cex->ec_block, cex->ec_len, cex->ec_start); - ret = cex->ec_type; + ret = 1; } errout: spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); @@ -3298,7 +3296,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent_header *eh; struct ext4_extent newex, *ex; ext4_fsblk_t newblock; - int err = 0, depth, ret, cache_type; + int err = 0, depth, ret; unsigned int allocated = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; @@ -3307,9 +3305,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_lblk, map->m_len, inode->i_ino); /* check in cache */ - cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex); - if (cache_type) { - if (cache_type == EXT4_EXT_CACHE_GAP) { + if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { + if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * block isn't allocated yet and @@ -3318,7 +3315,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out2; } /* we should allocate requested block */ - } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { + } else { /* block is already allocated */ newblock = map->m_lblk - le32_to_cpu(newex.ee_block) @@ -3327,8 +3324,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, allocated = ext4_ext_get_actual_len(&newex) - (map->m_lblk - le32_to_cpu(newex.ee_block)); goto out; - } else { - BUG(); } } @@ -3379,8 +3374,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* Do not put uninitialized extent in the cache */ if (!ext4_ext_is_uninitialized(ex)) { ext4_ext_put_in_cache(inode, ee_block, - ee_len, ee_start, - EXT4_EXT_CACHE_EXTENT); + ee_len, ee_start); goto out; } ret = ext4_ext_handle_uninitialized_extents(handle, @@ -3512,8 +3506,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * when it is _not_ an uninitialized extent. */ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { - ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock, - EXT4_EXT_CACHE_EXTENT); + ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock); ext4_update_inode_fsync_trans(handle, inode, 1); } else ext4_update_inode_fsync_trans(handle, inode, 0); @@ -3789,7 +3782,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, logical = (__u64)newex->ec_block << blksize_bits; - if (newex->ec_type == EXT4_EXT_CACHE_GAP) { + if (newex->ec_start == 0) { pgoff_t offset; struct page *page; struct buffer_head *bh = NULL; From 8a2005d3f84457b7d7d8646ab5195341d9e5f06a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 10 Jan 2011 12:13:42 -0500 Subject: [PATCH 33/44] ext4: reorder ext4_inode_info structure elements to remove unneeded padding By reordering the elements in the ext4_inode_info structure, we can reduce the padding needed on an x86_64 system by 16 bytes. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index de937fc10503..50e3d24483fb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -763,10 +763,10 @@ struct ext4_inode_info { * near to their parent directory's inode. */ ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; unsigned long i_state_flags; /* Dynamic state flags */ unsigned long i_flags; - ext4_lblk_t i_dir_start_lookup; #ifdef CONFIG_EXT4_FS_XATTR /* * Extended attributes can be read independently of the main file @@ -835,7 +835,6 @@ struct ext4_inode_info { /* on-disk additional length */ __u16 i_extra_isize; - spinlock_t i_block_reservation_lock; #ifdef CONFIG_QUOTA /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; @@ -844,9 +843,11 @@ struct ext4_inode_info { /* completed IOs that might need unwritten extents handling */ struct list_head i_completed_io_list; spinlock_t i_completed_io_lock; + atomic_t i_ioend_count; /* Number of outstanding io_end structs */ /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; - atomic_t i_ioend_count; /* Number of outstanding io_end structs */ + + spinlock_t i_block_reservation_lock; /* * Transactions that contain inode's metadata needed to complete From 353eb83c1422c6326eaab30ce044a179c6018169 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 10 Jan 2011 12:18:25 -0500 Subject: [PATCH 34/44] ext4: drop i_state_flags on architectures with 64-bit longs We can store the dynamic inode state flags in the high bits of EXT4_I(inode)->i_flags, and eliminate i_state_flags. This saves 8 bytes from the size of ext4_inode_info structure, which when multiplied by the number of the number of in the inode cache, can save a lot of memory. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 28 ++++++++++++++++++++++------ fs/ext4/ialloc.c | 2 +- fs/ext4/inode.c | 4 ++-- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 50e3d24483fb..2fb531cfd48b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -764,7 +764,9 @@ struct ext4_inode_info { */ ext4_group_t i_block_group; ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) unsigned long i_state_flags; /* Dynamic state flags */ +#endif unsigned long i_flags; #ifdef CONFIG_EXT4_FS_XATTR @@ -1239,22 +1241,36 @@ enum { EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ }; -#define EXT4_INODE_BIT_FNS(name, field) \ +#define EXT4_INODE_BIT_FNS(name, field, offset) \ static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ { \ - return test_bit(bit, &EXT4_I(inode)->i_##field); \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ { \ - set_bit(bit, &EXT4_I(inode)->i_##field); \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ { \ - clear_bit(bit, &EXT4_I(inode)->i_##field); \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } -EXT4_INODE_BIT_FNS(flag, flags) -EXT4_INODE_BIT_FNS(state, state_flags) +EXT4_INODE_BIT_FNS(flag, flags, 0) +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 1ce240a23ebb..eb9097aec6f0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1027,7 +1027,7 @@ got: inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); - ei->i_state_flags = 0; + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3ae83137cf34..0801ee6a173e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4868,7 +4868,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); - ei->i_state_flags = 0; + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. @@ -5127,7 +5127,7 @@ static int ext4_do_update_inode(handle_t *handle, if (ext4_inode_blocks_set(handle, raw_inode, ei)) goto out_brelse; raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = cpu_to_le32(ei->i_flags); + raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_HURD)) raw_inode->i_file_acl_high = From 8aefcd557d26d0023a36f9ec5afbf55e59f8f26b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 10 Jan 2011 12:29:43 -0500 Subject: [PATCH 35/44] ext4: dynamically allocate the jbd2_inode in ext4_inode_info as necessary Replace the jbd2_inode structure (which is 48 bytes) with a pointer and only allocate the jbd2_inode when it is needed --- that is, when the file system has a journal present and the inode has been opened for writing. This allows us to further slim down the ext4_inode_info structure. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 +- fs/ext4/ext4_jbd2.h | 2 +- fs/ext4/file.c | 22 ++++++++++++++++++++++ fs/ext4/inode.c | 17 ++++++++++++----- fs/ext4/super.c | 16 +++++++--------- fs/jbd2/journal.c | 20 +++++++++++++------- include/linux/jbd2.h | 20 ++++++++++++++++++-- 7 files changed, 74 insertions(+), 25 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2fb531cfd48b..32b7daa41a42 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -811,7 +811,7 @@ struct ext4_inode_info { */ struct rw_semaphore i_data_sem; struct inode vfs_inode; - struct jbd2_inode jinode; + struct jbd2_inode *jinode; struct ext4_ext_cache i_cached_extent; /* diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b0bd792c58c5..d8b992e658c1 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -253,7 +253,7 @@ static inline int ext4_journal_force_commit(journal_t *journal) static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) { if (ext4_handle_valid(handle)) - return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); + return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); return 0; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 5a5c55ddceef..bb003dc9ffff 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -104,6 +104,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); struct vfsmount *mnt = filp->f_path.mnt; struct path path; char buf[64], *cp; @@ -127,6 +128,27 @@ static int ext4_file_open(struct inode * inode, struct file * filp) ext4_mark_super_dirty(sb); } } + /* + * Set up the jbd2_inode if we are opening the inode for + * writing and the journal is present + */ + if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) { + struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL); + + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; + } + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; + } + spin_unlock(&inode->i_lock); + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + } return dquot_file_open(inode, filp); } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0801ee6a173e..2693fcda30d8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -55,10 +55,17 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { trace_ext4_begin_ordered_truncate(inode, new_size); - return jbd2_journal_begin_ordered_truncate( - EXT4_SB(inode->i_sb)->s_journal, - &EXT4_I(inode)->jinode, - new_size); + /* + * If jinode is zero, then we never opened the file for + * writing, so there's no need to call + * jbd2_journal_begin_ordered_truncate() since there's no + * outstanding writes we need to flush. + */ + if (!EXT4_I(inode)->jinode) + return 0; + return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode, + new_size); } static void ext4_invalidatepage(struct page *page, unsigned long offset); @@ -4054,7 +4061,7 @@ int ext4_block_truncate_page(handle_t *handle, if (ext4_should_journal_data(inode)) { err = ext4_handle_dirty_metadata(handle, inode, bh); } else { - if (ext4_should_order_data(inode)) + if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) err = ext4_jbd2_file_inode(handle, inode); mark_buffer_dirty(bh); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f5960d673e4e..1cd4326c530b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -818,12 +818,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); - /* - * Note: We can be called before EXT4_SB(sb)->s_journal is set, - * therefore it can be null here. Don't check it, just initialize - * jinode. - */ - jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; @@ -832,6 +826,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif + ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_completed_io_list); spin_lock_init(&ei->i_completed_io_lock); ei->cur_aio_dio = NULL; @@ -900,9 +895,12 @@ void ext4_clear_inode(struct inode *inode) end_writeback(inode); dquot_drop(inode); ext4_discard_preallocations(inode); - if (EXT4_JOURNAL(inode)) - jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, - &EXT4_I(inode)->jinode); + if (EXT4_I(inode)->jinode) { + jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode); + jbd2_free_inode(EXT4_I(inode)->jinode); + EXT4_I(inode)->jinode = NULL; + } } static inline void ext4_show_quota_options(struct seq_file *seq, diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2447bd86f801..9e4686900f18 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -94,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_file_inode); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); +EXPORT_SYMBOL(jbd2_inode_cache); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); @@ -2286,17 +2287,19 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void) #endif -struct kmem_cache *jbd2_handle_cache; +struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; static int __init journal_init_handle_cache(void) { - jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle", - sizeof(handle_t), - 0, /* offset */ - SLAB_TEMPORARY, /* flags */ - NULL); /* ctor */ + jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); if (jbd2_handle_cache == NULL) { - printk(KERN_EMERG "JBD: failed to create handle cache\n"); + printk(KERN_EMERG "JBD2: failed to create handle cache\n"); + return -ENOMEM; + } + jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0); + if (jbd2_inode_cache == NULL) { + printk(KERN_EMERG "JBD2: failed to create inode cache\n"); + kmem_cache_destroy(jbd2_handle_cache); return -ENOMEM; } return 0; @@ -2306,6 +2309,9 @@ static void jbd2_journal_destroy_handle_cache(void) { if (jbd2_handle_cache) kmem_cache_destroy(jbd2_handle_cache); + if (jbd2_inode_cache) + kmem_cache_destroy(jbd2_inode_cache); + } /* diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 2ae86aa21fce..27e79c27ba08 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -94,7 +94,7 @@ extern void jbd2_free(void *ptr, size_t size); * * This is an opaque datatype. **/ -typedef struct handle_s handle_t; /* Atomic operation type */ +typedef struct jbd2_journal_handle handle_t; /* Atomic operation type */ /** @@ -416,7 +416,7 @@ struct jbd2_revoke_table_s; * in so it can be fixed later. */ -struct handle_s +struct jbd2_journal_handle { /* Which compound transaction is this update a part of? */ transaction_t *h_transaction; @@ -1158,6 +1158,22 @@ static inline void jbd2_free_handle(handle_t *handle) kmem_cache_free(jbd2_handle_cache, handle); } +/* + * jbd2_inode management (optional, for those file systems that want to use + * dynamically allocated jbd2_inode structures) + */ +extern struct kmem_cache *jbd2_inode_cache; + +static inline struct jbd2_inode *jbd2_alloc_inode(gfp_t gfp_flags) +{ + return kmem_cache_alloc(jbd2_inode_cache, gfp_flags); +} + +static inline void jbd2_free_inode(struct jbd2_inode *jinode) +{ + kmem_cache_free(jbd2_inode_cache, jinode); +} + /* Primary revoke support */ #define JOURNAL_REVOKE_DEFAULT_HASH 256 extern int jbd2_journal_init_revoke(journal_t *, int); From 6c5a6cb998854f3c579ecb2bc1423d302bcb1b76 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 10 Jan 2011 12:30:17 -0500 Subject: [PATCH 36/44] ext4: fix uninitialized variable in ext4_register_li_request fs/ext4/super.c: In function 'ext4_register_li_request': fs/ext4/super.c:2936: warning: 'ret' may be used uninitialized in this function It looks buggy to me, too. Cc: Lukas Czerner Cc: stable@kernel.org Signed-off-by: Andrew Morton Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1cd4326c530b..757cb24c0256 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2922,7 +2922,7 @@ static int ext4_register_li_request(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; - int ret; + int ret = 0; if (sbi->s_li_request != NULL) return 0; From ca6e909f9bebe709bc65a3ee605ce32969db0452 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 10 Jan 2011 12:30:39 -0500 Subject: [PATCH 37/44] ext4: fix trimming of a single group When ext4_trim_fs() is called to trim a part of a single group, the logic will wrongly set last block of the interval to 'len' instead of 'first_block + len'. Thus a shorter interval is possibly trimmed. Fix it. CC: Lukas Czerner Cc: stable@kernel.org Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d47a80ec231d..21ee30b86de5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4860,7 +4860,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (len >= EXT4_BLOCKS_PER_GROUP(sb)) len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block); else - last_block = len; + last_block = first_block + len; if (e4b.bd_info->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, &e4b, first_block, From b40971426a837e9dc9c66e1b6bbcb3874eafe4e0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 10 Jan 2011 12:46:59 -0500 Subject: [PATCH 38/44] ext4: add error checking to calls to ext4_handle_dirty_metadata() Call ext4_std_error() in various places when we can't bail out cleanly, so the file system can be marked as in error. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 21 +++++++++++++--- fs/ext4/namei.c | 32 +++++++++++++++++++----- fs/ext4/resize.c | 64 ++++++++++++++++++++++++++++++++++-------------- 3 files changed, 89 insertions(+), 28 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2693fcda30d8..84b616269265 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4185,6 +4185,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, { __le32 *p; int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; + int err; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) flags |= EXT4_FREE_BLOCKS_METADATA; @@ -4200,11 +4201,23 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, inode, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) { + ext4_std_error(inode->i_sb, err); + return 1; + } + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) { + ext4_std_error(inode->i_sb, err); + return 1; + } + err = ext4_truncate_restart_trans(handle, inode, + blocks_for_truncate(inode)); + if (unlikely(err)) { + ext4_std_error(inode->i_sb, err); + return 1; } - ext4_mark_inode_dirty(handle, inode); - ext4_truncate_restart_trans(handle, inode, - blocks_for_truncate(inode)); if (bh) { BUFFER_TRACE(bh, "retaking write access"); ext4_journal_get_write_access(handle, bh); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6dfc5b9de3e6..5485390d32c5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1602,7 +1602,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - ext4_handle_dirty_metadata(handle, inode, frames[0].bh); + err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh); + if (err) { + ext4_std_error(inode->i_sb, err); + goto cleanup; + } } de = do_split(handle, dir, &bh, frame, &hinfo, &err); if (!de) @@ -1630,7 +1634,7 @@ static int ext4_delete_entry(handle_t *handle, { struct ext4_dir_entry_2 *de, *pde; unsigned int blocksize = dir->i_sb->s_blocksize; - int i; + int i, err; i = 0; pde = NULL; @@ -1640,7 +1644,11 @@ static int ext4_delete_entry(handle_t *handle, return -EIO; if (de == de_del) { BUFFER_TRACE(bh, "get_write_access"); - ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) { + ext4_std_error(dir->i_sb, err); + return err; + } if (pde) pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, @@ -1652,7 +1660,11 @@ static int ext4_delete_entry(handle_t *handle, de->inode = 0; dir->i_version++; BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, dir, bh); + err = ext4_handle_dirty_metadata(handle, dir, bh); + if (unlikely(err)) { + ext4_std_error(dir->i_sb, err); + return err; + } return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); @@ -2414,7 +2426,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_current_time(new_dir); ext4_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, new_dir, new_bh); + retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh); + if (unlikely(retval)) { + ext4_std_error(new_dir->i_sb, retval); + goto end_rename; + } brelse(new_bh); new_bh = NULL; } @@ -2466,7 +2482,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, old_dir, dir_bh); + retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh); + if (retval) { + ext4_std_error(old_dir->i_sb, retval); + goto end_rename; + } ext4_dec_count(handle, old_dir); if (new_inode) { /* checked empty_dir above, can't have another parent, diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index dc963929de65..7faf47dde7fb 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -220,7 +220,11 @@ static int setup_new_group_blocks(struct super_block *sb, memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); set_buffer_uptodate(gdb); unlock_buffer(gdb); - ext4_handle_dirty_metadata(handle, NULL, gdb); + err = ext4_handle_dirty_metadata(handle, NULL, gdb); + if (unlikely(err)) { + brelse(gdb); + goto exit_bh; + } ext4_set_bit(bit, bh->b_data); brelse(gdb); } @@ -253,7 +257,11 @@ static int setup_new_group_blocks(struct super_block *sb, ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); - ext4_handle_dirty_metadata(handle, NULL, bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_bh; + } brelse(bh); /* Mark unused entries in inode bitmap used */ ext4_debug("clear inode bitmap %#04llx (+%llu)\n", @@ -265,7 +273,9 @@ static int setup_new_group_blocks(struct super_block *sb, ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); - ext4_handle_dirty_metadata(handle, NULL, bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + ext4_std_error(sb, err); exit_bh: brelse(bh); @@ -417,17 +427,21 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, goto exit_dind; } - if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (unlikely(err)) goto exit_dind; - if ((err = ext4_journal_get_write_access(handle, *primary))) + err = ext4_journal_get_write_access(handle, *primary); + if (unlikely(err)) goto exit_sbh; - if ((err = ext4_journal_get_write_access(handle, dind))) - goto exit_primary; + err = ext4_journal_get_write_access(handle, dind); + if (unlikely(err)) + ext4_std_error(sb, err); /* ext4_reserve_inode_write() gets a reference on the iloc */ - if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (unlikely(err)) goto exit_dindj; n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), @@ -449,12 +463,20 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, * reserved inode, and will become GDT blocks (primary and backup). */ data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; - ext4_handle_dirty_metadata(handle, NULL, dind); - brelse(dind); + err = ext4_handle_dirty_metadata(handle, NULL, dind); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; ext4_mark_iloc_dirty(handle, inode, &iloc); memset((*primary)->b_data, 0, sb->s_blocksize); - ext4_handle_dirty_metadata(handle, NULL, *primary); + err = ext4_handle_dirty_metadata(handle, NULL, *primary); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } + brelse(dind); o_group_desc = EXT4_SB(sb)->s_group_desc; memcpy(n_group_desc, o_group_desc, @@ -465,19 +487,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, kfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + if (err) + ext4_std_error(sb, err); - return 0; + return err; exit_inode: /* ext4_journal_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); exit_dindj: /* ext4_journal_release_buffer(handle, dind); */ -exit_primary: - /* ext4_journal_release_buffer(handle, *primary); */ exit_sbh: - /* ext4_journal_release_buffer(handle, *primary); */ + /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ exit_dind: brelse(dind); exit_bh: @@ -660,7 +682,9 @@ static void update_backups(struct super_block *sb, memset(bh->b_data + size, 0, rest); set_buffer_uptodate(bh); unlock_buffer(bh); - ext4_handle_dirty_metadata(handle, NULL, bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + ext4_std_error(sb, err); brelse(bh); } if ((err2 = ext4_journal_stop(handle)) && !err) @@ -878,7 +902,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) /* Update the global fs size fields */ sbi->s_groups_count++; - ext4_handle_dirty_metadata(handle, NULL, primary); + err = ext4_handle_dirty_metadata(handle, NULL, primary); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_journal; + } /* Update the reserved block counts only once the new group is * active. */ From 3889fd57ea3c58209354862523275774fca9db03 Mon Sep 17 00:00:00 2001 From: Jiaying Zhang Date: Mon, 10 Jan 2011 12:47:05 -0500 Subject: [PATCH 39/44] ext4: flush the i_completed_io_list during ext4_truncate Ted first found the bug when running 2.6.36 kernel with dioread_nolock mount option that xfstests #13 complained about wrong file size during fsck. However, the bug exists in the older kernels as well although it is somehow harder to trigger. The problem is that ext4_end_io_work() can happen after we have truncated an inode to a smaller size. Then when ext4_end_io_work() calls ext4_convert_unwritten_extents(), we may reallocate some blocks that have been truncated, so the inode size becomes inconsistent with the allocated blocks. The following patch flushes the i_completed_io_list during truncate to reduce the risk that some pending end_io requests are executed later and convert already truncated blocks to initialized. Note that although the fix helps reduce the problem a lot there may still be a race window between vmtruncate() and ext4_end_io_work(). The fundamental problem is that if vmtruncate() is called without either i_mutex or i_alloc_sem held, it can race with an ongoing write request so that the io_end request is processed later when the corresponding blocks have been truncated. Ted and I have discussed the problem offline and we saw a few ways to fix the race completely: a) We guarantee that i_mutex lock and i_alloc_sem write lock are both hold whenever vmtruncate() is called. The i_mutex lock prevents any new write requests from entering writeback and the i_alloc_sem prevents the race from ext4_page_mkwrite(). Currently we hold both locks if vmtruncate() is called from do_truncate(), which is probably the most common case. However, there are places where we may call vmtruncate() without holding either i_mutex or i_alloc_sem. I would like to ask for other people's opinions on what locks are expected to be held before calling vmtruncate(). There seems a disagreement among the callers of that function. b) We change the ext4 write path so that we change the extent tree to contain the newly allocated blocks and update i_size both at the same time --- when the write of the data blocks is completed. c) We add some additional locking to synchronize vmtruncate() and ext4_end_io_work(). This approach may have performance implications so we need to be careful. All of the above proposals may require more substantial changes, so we may consider to take the following patch as a bandaid. Signed-off-by: Jiaying Zhang Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 6 ++++++ fs/ext4/fsync.c | 4 ++-- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 32b7daa41a42..bab2387fba43 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1671,6 +1671,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p); /* fsync.c */ extern int ext4_sync_file(struct file *, int); +extern int ext4_flush_completed_IO(struct inode *); /* hash.c */ extern int ext4fs_dirhash(const char *name, int len, struct diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9081d1060a5f..627f7ae94ae5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3533,6 +3533,12 @@ void ext4_ext_truncate(struct inode *inode) handle_t *handle; int err = 0; + /* + * finish any pending end_io work so we won't run the risk of + * converting any truncated blocks to initialized later + */ + ext4_flush_completed_IO(inode); + /* * probably first extent we're gonna free will be last in block */ diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index c1a7bc923cf6..7829b287822a 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode) * to written. * The function return the number of pending IOs on success. */ -static int flush_completed_IO(struct inode *inode) +extern int ext4_flush_completed_IO(struct inode *inode) { ext4_io_end_t *io; struct ext4_inode_info *ei = EXT4_I(inode); @@ -169,7 +169,7 @@ int ext4_sync_file(struct file *file, int datasync) if (inode->i_sb->s_flags & MS_RDONLY) return 0; - ret = flush_completed_IO(inode); + ret = ext4_flush_completed_IO(inode); if (ret < 0) return ret; From a5196f8cdfbf6ccb20f093aaf48852d6d23b4e0b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 10 Jan 2011 12:47:07 -0500 Subject: [PATCH 40/44] ext4: remove ext4_mb_return_to_preallocation() This function was never implemented, except for a BUG_ON which was tripping when ext4 is run without a journal. The problem is that although the comment asserts that "truncate (which is the only way to free block) discards all preallocations", ext4_free_blocks() is also called in various error recovery paths when blocks have been allocated, but for various reasons, we were not able to use those data blocks (for example, because we ran out of memory while trying to manipulate the extent tree, or some other similar situation). In addition to the fact that this function isn't implemented except for the incorrect BUG_ON, the single caller of this function, ext4_free_blocks(), doesn't use it all if the journal is enabled. So remove the (stub) function entirely for now. If we decide it's better to add it back, it's only going to be useful with a relatively large number of code changes anyway. Google-Bug-Id: 3236408 Cc: Jiaying Zhang Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 21ee30b86de5..cd5214f75397 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3881,19 +3881,6 @@ repeat: } } -/* - * finds all preallocated spaces and return blocks being freed to them - * if preallocated space becomes full (no block is used from the space) - * then the function frees space in buddy - * XXX: at the moment, truncate (which is the only way to free blocks) - * discards all preallocations - */ -static void ext4_mb_return_to_preallocation(struct inode *inode, - struct ext4_buddy *e4b, - sector_t block, int count) -{ - BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); -} #ifdef CONFIG_EXT4_DEBUG static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { @@ -4648,7 +4635,6 @@ do_more: ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); - ext4_mb_return_to_preallocation(inode, &e4b, block, count); } ret = ext4_free_blks_count(sb, gdp) + count; From 1c5b9e9065567876c2d4a7a16d78f0fed154a5bf Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 10 Jan 2011 12:51:28 -0500 Subject: [PATCH 41/44] ext4: fix memory leak in ext4_free_branches Commit 40389687 moved a call to ext4_forget() out of ext4_free_branches and let ext4_free_blocks() handle calling bforget(). But that change unfortunately did not replace the call to ext4_forget() with brelse(), which was needed to drop the in-use count of the indirect block's buffer head, which lead to a memory leak when deleting files that used indirect blocks. Fix this. Thanks to Hugh Dickins for pointing this out. Cc: stable@kernel.org Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 84b616269265..e80fc513eacc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4378,6 +4378,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, (__le32 *) bh->b_data, (__le32 *) bh->b_data + addr_per_block, depth); + brelse(bh); /* * Everything below this this pointer has been From d002ebf1d8daa5a317645b1c4a3a0b7ea2abc9ac Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 10 Jan 2011 13:03:35 -0500 Subject: [PATCH 42/44] ext4: don't pass entire map to check_eofblocks_fl Since check_eofblocks_fl() only uses the m_lblk portion of the map structure, we may as well pass that directly, rather than passing the entire map, which IMHO obfuscates what parameters check_eofblocks_fl() cares about. Not a big deal, but seems tidier and less confusing, to me. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 627f7ae94ae5..e910720e8bb8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3102,7 +3102,7 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev, * Handle EOFBLOCKS_FL flag, clearing it if necessary */ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, + ext4_lblk_t lblk, struct ext4_ext_path *path, unsigned int len) { @@ -3132,7 +3132,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, * this turns out to be false, we can bail out from this * function immediately. */ - if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) + + if (lblk + len < le32_to_cpu(last_ex->ee_block) + ext4_ext_get_actual_len(last_ex)) return 0; /* @@ -3188,8 +3188,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, path); if (ret >= 0) { ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map, path, - map->m_len); + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, map->m_len); } else err = ret; goto out2; @@ -3219,7 +3219,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ret = ext4_ext_convert_to_initialized(handle, inode, map, path); if (ret >= 0) { ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map, path, map->m_len); + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, + map->m_len); if (err < 0) goto out2; } @@ -3472,7 +3473,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_flags |= EXT4_MAP_UNINIT; } - err = check_eofblocks_fl(handle, inode, map, path, ar.len); + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); if (err) goto out2; From 0a2179b169089f871e071c74316371ed43e6c8eb Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 11 Jan 2011 14:42:29 -0500 Subject: [PATCH 43/44] ext4: revert buggy trim overflow patch This reverts commit 4f531501e44: ext4: fix possible overflow in ext4_trim_fs() Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cd5214f75397..cc1297e15f1b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4804,7 +4804,6 @@ ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { struct ext4_buddy e4b; - ext4_fsblk_t blocks_count = ext4_blocks_count(EXT4_SB(sb)->s_es); ext4_group_t first_group, last_group; ext4_group_t group, ngroups = ext4_get_groups_count(sb); ext4_grpblk_t cnt = 0, first_block, last_block; @@ -4816,11 +4815,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) minlen = range->minlen >> sb->s_blocksize_bits; trimmed = 0; - if (start >= blocks_count) - return -EINVAL; - if (start + len > blocks_count) - len = blocks_count - start; - if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) return -EINVAL; From 0f0a25bf516843adae479636dc1cf75fd0bd003c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 11 Jan 2011 15:16:31 -0500 Subject: [PATCH 44/44] ext4: fix trimming starting with block 0 with small blocksize When s_first_data_block is not zero (which happens e.g. when block size is 1KB) and trim ioctl is called to start trimming from block 0, the math in ext4_get_group_no_and_offset() overflows. The overall result is that ioctl returns EINVAL which is kind of unexpected and we probably don't want userspace tools to bother with internal details of filesystem structure. So just silently increase starting offset (and shorten length) when starting block is below s_first_data_block. CC: Lukas Czerner Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cc1297e15f1b..851f49b2f9d2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4808,6 +4808,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) ext4_group_t group, ngroups = ext4_get_groups_count(sb); ext4_grpblk_t cnt = 0, first_block, last_block; uint64_t start, len, minlen, trimmed; + ext4_fsblk_t first_data_blk = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); int ret = 0; start = range->start >> sb->s_blocksize_bits; @@ -4817,6 +4819,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) return -EINVAL; + if (start < first_data_blk) { + len -= first_data_blk - start; + start = first_data_blk; + } /* Determine first and last group to examine based on start and len */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,