diff options
Diffstat (limited to 'fs')
49 files changed, 903 insertions, 809 deletions
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f34078d702d3..303983fabfd6 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -941,9 +941,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) current->mm->start_stack = bprm->p; #ifdef arch_randomize_brk - if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) + if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { current->mm->brk = current->mm->start_brk = arch_randomize_brk(current->mm); +#ifdef CONFIG_COMPAT_BRK + current->brk_randomized = 1; +#endif + } #endif if (current->personality & MMAP_PAGE_ZERO) { diff --git a/fs/cifs/README b/fs/cifs/README index fe1683590828..74ab165fc646 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -685,22 +685,6 @@ LinuxExtensionsEnabled If set to one then the client will attempt to support and want to map the uid and gid fields to values supplied at mount (rather than the actual values, then set this to zero. (default 1) -Experimental When set to 1 used to enable certain experimental - features (currently enables multipage writes - when signing is enabled, the multipage write - performance enhancement was disabled when - signing turned on in case buffer was modified - just before it was sent, also this flag will - be used to use the new experimental directory change - notification code). When set to 2 enables - an additional experimental feature, "raw ntlmssp" - session establishment support (which allows - specifying "sec=ntlmssp" on mount). The Linux cifs - module will use ntlmv2 authentication encapsulated - in "raw ntlmssp" (not using SPNEGO) when - "sec=ntlmssp" is specified on mount. - This support also requires building cifs with - the CONFIG_CIFS_EXPERIMENTAL configuration flag. These experimental features and tracing can be enabled by changing flags in /proc/fs/cifs (after the cifs module has been installed or built into the diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index e654dfd092c3..53d57a3fe427 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -50,7 +50,7 @@ void cifs_fscache_unregister(void) */ struct cifs_server_key { uint16_t family; /* address family */ - uint16_t port; /* IP port */ + __be16 port; /* IP port */ union { struct in_addr ipv4_addr; struct in6_addr ipv6_addr; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 65829d32128c..30d01bc90855 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -423,7 +423,6 @@ static const struct file_operations cifs_lookup_cache_proc_fops; static const struct file_operations traceSMB_proc_fops; static const struct file_operations cifs_multiuser_mount_proc_fops; static const struct file_operations cifs_security_flags_proc_fops; -static const struct file_operations cifs_experimental_proc_fops; static const struct file_operations cifs_linux_ext_proc_fops; void @@ -441,8 +440,6 @@ cifs_proc_init(void) proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops); proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops); proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops); - proc_create("Experimental", 0, proc_fs_cifs, - &cifs_experimental_proc_fops); proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs, &cifs_linux_ext_proc_fops); proc_create("MultiuserMount", 0, proc_fs_cifs, @@ -469,7 +466,6 @@ cifs_proc_clean(void) remove_proc_entry("OplockEnabled", proc_fs_cifs); remove_proc_entry("SecurityFlags", proc_fs_cifs); remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); - remove_proc_entry("Experimental", proc_fs_cifs); remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); remove_proc_entry("fs/cifs", NULL); } @@ -550,45 +546,6 @@ static const struct file_operations cifs_oplock_proc_fops = { .write = cifs_oplock_proc_write, }; -static int cifs_experimental_proc_show(struct seq_file *m, void *v) -{ - seq_printf(m, "%d\n", experimEnabled); - return 0; -} - -static int cifs_experimental_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, cifs_experimental_proc_show, NULL); -} - -static ssize_t cifs_experimental_proc_write(struct file *file, - const char __user *buffer, size_t count, loff_t *ppos) -{ - char c; - int rc; - - rc = get_user(c, buffer); - if (rc) - return rc; - if (c == '0' || c == 'n' || c == 'N') - experimEnabled = 0; - else if (c == '1' || c == 'y' || c == 'Y') - experimEnabled = 1; - else if (c == '2') - experimEnabled = 2; - - return count; -} - -static const struct file_operations cifs_experimental_proc_fops = { - .owner = THIS_MODULE, - .open = cifs_experimental_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_experimental_proc_write, -}; - static int cifs_linux_ext_proc_show(struct seq_file *m, void *v) { seq_printf(m, "%d\n", linuxExtEnabled); diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 4dfba8283165..33d221394aca 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -113,7 +113,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) MAX_MECH_STR_LEN + UID_KEY_LEN + (sizeof(uid_t) * 2) + CREDUID_KEY_LEN + (sizeof(uid_t) * 2) + - USER_KEY_LEN + strlen(sesInfo->userName) + + USER_KEY_LEN + strlen(sesInfo->user_name) + PID_KEY_LEN + (sizeof(pid_t) * 2) + 1; spnego_key = ERR_PTR(-ENOMEM); @@ -153,7 +153,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid); dp = description + strlen(description); - sprintf(dp, ";user=%s", sesInfo->userName); + sprintf(dp, ";user=%s", sesInfo->user_name); dp = description + strlen(description); sprintf(dp, ";pid=0x%x", current->pid); diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index fc0fd4fde306..23d43cde4306 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -90,7 +90,7 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, case UNI_COLON: *target = ':'; break; - case UNI_ASTERIK: + case UNI_ASTERISK: *target = '*'; break; case UNI_QUESTION: @@ -264,40 +264,40 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode, * names are little endian 16 bit Unicode on the wire */ int -cifsConvertToUCS(__le16 *target, const char *source, int maxlen, +cifsConvertToUCS(__le16 *target, const char *source, int srclen, const struct nls_table *cp, int mapChars) { int i, j, charlen; - int len_remaining = maxlen; char src_char; - __u16 temp; + __le16 dst_char; + wchar_t tmp; if (!mapChars) return cifs_strtoUCS(target, source, PATH_MAX, cp); - for (i = 0, j = 0; i < maxlen; j++) { + for (i = 0, j = 0; i < srclen; j++) { src_char = source[i]; switch (src_char) { case 0: - put_unaligned_le16(0, &target[j]); + put_unaligned(0, &target[j]); goto ctoUCS_out; case ':': - temp = UNI_COLON; + dst_char = cpu_to_le16(UNI_COLON); break; case '*': - temp = UNI_ASTERIK; + dst_char = cpu_to_le16(UNI_ASTERISK); break; case '?': - temp = UNI_QUESTION; + dst_char = cpu_to_le16(UNI_QUESTION); break; case '<': - temp = UNI_LESSTHAN; + dst_char = cpu_to_le16(UNI_LESSTHAN); break; case '>': - temp = UNI_GRTRTHAN; + dst_char = cpu_to_le16(UNI_GRTRTHAN); break; case '|': - temp = UNI_PIPE; + dst_char = cpu_to_le16(UNI_PIPE); break; /* * FIXME: We can not handle remapping backslash (UNI_SLASH) @@ -305,17 +305,17 @@ cifsConvertToUCS(__le16 *target, const char *source, int maxlen, * as they use backslash as separator. */ default: - charlen = cp->char2uni(source+i, len_remaining, - &temp); + charlen = cp->char2uni(source + i, srclen - i, &tmp); + dst_char = cpu_to_le16(tmp); + /* * if no match, use question mark, which at least in * some cases serves as wild card */ if (charlen < 1) { - temp = 0x003f; + dst_char = cpu_to_le16(0x003f); charlen = 1; } - len_remaining -= charlen; /* * character may take more than one byte in the source * string, but will take exactly two bytes in the @@ -324,9 +324,8 @@ cifsConvertToUCS(__le16 *target, const char *source, int maxlen, i += charlen; continue; } - put_unaligned_le16(temp, &target[j]); + put_unaligned(dst_char, &target[j]); i++; /* move to next char in source string */ - len_remaining--; } ctoUCS_out: diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index 7fe6b52df507..644dd882a560 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -44,7 +44,7 @@ * reserved symbols (along with \ and /), otherwise illegal to store * in filenames in NTFS */ -#define UNI_ASTERIK (__u16) ('*' + 0xF000) +#define UNI_ASTERISK (__u16) ('*' + 0xF000) #define UNI_QUESTION (__u16) ('?' + 0xF000) #define UNI_COLON (__u16) (':' + 0xF000) #define UNI_GRTRTHAN (__u16) ('>' + 0xF000) diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index a51585f9852b..d1a016be73ba 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -30,12 +30,13 @@ #include <linux/ctype.h> #include <linux/random.h> -/* Calculate and return the CIFS signature based on the mac key and SMB PDU */ -/* the 16 byte signature must be allocated by the caller */ -/* Note we only use the 1st eight bytes */ -/* Note that the smb header signature field on input contains the - sequence number before this function is called */ - +/* + * Calculate and return the CIFS signature based on the mac key and SMB PDU. + * The 16 byte signature must be allocated by the caller. Note we only use the + * 1st eight bytes and that the smb header signature field on input contains + * the sequence number before this function is called. Also, this function + * should be called with the server->srv_mutex held. + */ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, char *signature) { @@ -209,8 +210,10 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, cpu_to_le32(expected_sequence_number); cifs_pdu->Signature.Sequence.Reserved = 0; + mutex_lock(&server->srv_mutex); rc = cifs_calculate_signature(cifs_pdu, server, what_we_think_sig_should_be); + mutex_unlock(&server->srv_mutex); if (rc) return rc; @@ -469,15 +472,15 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash, return rc; } - /* convert ses->userName to unicode and uppercase */ - len = strlen(ses->userName); + /* convert ses->user_name to unicode and uppercase */ + len = strlen(ses->user_name); user = kmalloc(2 + (len * 2), GFP_KERNEL); if (user == NULL) { cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); rc = -ENOMEM; goto calc_exit_2; } - len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp); + len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp); UniStrupr(user); crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f2970136d17d..5c412b33cd7c 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -53,7 +53,6 @@ int cifsFYI = 0; int cifsERROR = 1; int traceSMB = 0; unsigned int oplockEnabled = 1; -unsigned int experimEnabled = 0; unsigned int linuxExtEnabled = 1; unsigned int lookupCacheEnabled = 1; unsigned int multiuser_mount = 0; @@ -127,6 +126,7 @@ cifs_read_super(struct super_block *sb, void *data, kfree(cifs_sb); return rc; } + cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; #ifdef CONFIG_CIFS_DFS_UPCALL /* copy mount params to sb for use in submounts */ @@ -409,8 +409,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) seq_printf(s, ",multiuser"); - else if (tcon->ses->userName) - seq_printf(s, ",username=%s", tcon->ses->userName); + else if (tcon->ses->user_name) + seq_printf(s, ",username=%s", tcon->ses->user_name); if (tcon->ses->domainName) seq_printf(s, ",domain=%s", tcon->ses->domainName); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 17afb0fbcaed..a5d1106fcbde 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -37,10 +37,9 @@ #define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1) #define MAX_SERVER_SIZE 15 -#define MAX_SHARE_SIZE 64 /* used to be 20, this should still be enough */ -#define MAX_USERNAME_SIZE 32 /* 32 is to allow for 15 char names + null - termination then *2 for unicode versions */ -#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */ +#define MAX_SHARE_SIZE 80 +#define MAX_USERNAME_SIZE 256 /* reasonable maximum for current servers */ +#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */ #define CIFS_MIN_RCV_POOL 4 @@ -92,7 +91,8 @@ enum statusEnum { CifsNew = 0, CifsGood, CifsExiting, - CifsNeedReconnect + CifsNeedReconnect, + CifsNeedNegotiate }; enum securityEnum { @@ -274,7 +274,7 @@ struct cifsSesInfo { int capabilities; char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for TCP names - will ipv6 and sctp addresses fit? */ - char userName[MAX_USERNAME_SIZE + 1]; + char *user_name; char *domainName; char *password; struct session_key auth_key; @@ -817,7 +817,6 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions have the uid/password or Kerberos credential or equivalent for current user */ GLOBAL_EXTERN unsigned int oplockEnabled; -GLOBAL_EXTERN unsigned int experimEnabled; GLOBAL_EXTERN unsigned int lookupCacheEnabled; GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent with more secure ntlmssp2 challenge/resp */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 2644a5d6cc67..df959bae6728 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -142,9 +142,9 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command) */ while (server->tcpStatus == CifsNeedReconnect) { wait_event_interruptible_timeout(server->response_q, - (server->tcpStatus == CifsGood), 10 * HZ); + (server->tcpStatus != CifsNeedReconnect), 10 * HZ); - /* is TCP session is reestablished now ?*/ + /* are we still trying to reconnect? */ if (server->tcpStatus != CifsNeedReconnect) break; @@ -729,7 +729,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server) return rc; /* set up echo request */ - smb->hdr.Tid = cpu_to_le16(0xffff); + smb->hdr.Tid = 0xffff; smb->hdr.WordCount = 1; put_unaligned_le16(1, &smb->EchoCount); put_bcc_le(1, &smb->hdr); @@ -1884,10 +1884,10 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, __constant_cpu_to_le16(CIFS_WRLCK)) pLockData->fl_type = F_WRLCK; - pLockData->fl_start = parm_data->start; - pLockData->fl_end = parm_data->start + - parm_data->length - 1; - pLockData->fl_pid = parm_data->pid; + pLockData->fl_start = le64_to_cpu(parm_data->start); + pLockData->fl_end = pLockData->fl_start + + le64_to_cpu(parm_data->length) - 1; + pLockData->fl_pid = le32_to_cpu(parm_data->pid); } } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6e2b2addfc78..db9d55b507d0 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -199,8 +199,7 @@ cifs_reconnect(struct TCP_Server_Info *server) } spin_unlock(&GlobalMid_Lock); - while ((server->tcpStatus != CifsExiting) && - (server->tcpStatus != CifsGood)) { + while (server->tcpStatus == CifsNeedReconnect) { try_to_freeze(); /* we should try only the port we connected to before */ @@ -212,7 +211,7 @@ cifs_reconnect(struct TCP_Server_Info *server) atomic_inc(&tcpSesReconnectCount); spin_lock(&GlobalMid_Lock); if (server->tcpStatus != CifsExiting) - server->tcpStatus = CifsGood; + server->tcpStatus = CifsNeedNegotiate; spin_unlock(&GlobalMid_Lock); } } @@ -248,24 +247,24 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); - remaining = total_data_size - data_in_this_rsp; - - if (remaining == 0) + if (total_data_size == data_in_this_rsp) return 0; - else if (remaining < 0) { + else if (total_data_size < data_in_this_rsp) { cFYI(1, "total data %d smaller than data in frame %d", total_data_size, data_in_this_rsp); return -EINVAL; - } else { - cFYI(1, "missing %d bytes from transact2, check next response", - remaining); - if (total_data_size > maxBufSize) { - cERROR(1, "TotalDataSize %d is over maximum buffer %d", - total_data_size, maxBufSize); - return -EINVAL; - } - return remaining; } + + remaining = total_data_size - data_in_this_rsp; + + cFYI(1, "missing %d bytes from transact2, check next response", + remaining); + if (total_data_size > maxBufSize) { + cERROR(1, "TotalDataSize %d is over maximum buffer %d", + total_data_size, maxBufSize); + return -EINVAL; + } + return remaining; } static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) @@ -421,7 +420,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) pdu_length = 4; /* enough to get RFC1001 header */ incomplete_rcv: - if (echo_retries > 0 && + if (echo_retries > 0 && server->tcpStatus == CifsGood && time_after(jiffies, server->lstrp + (echo_retries * SMB_ECHO_INTERVAL))) { cERROR(1, "Server %s has not responded in %d seconds. " @@ -881,7 +880,8 @@ cifs_parse_mount_options(char *options, const char *devname, /* null user, ie anonymous, authentication */ vol->nullauth = 1; } - if (strnlen(value, 200) < 200) { + if (strnlen(value, MAX_USERNAME_SIZE) < + MAX_USERNAME_SIZE) { vol->username = value; } else { printk(KERN_WARNING "CIFS: username too long\n"); @@ -1472,7 +1472,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs) static bool match_port(struct TCP_Server_Info *server, struct sockaddr *addr) { - unsigned short int port, *sport; + __be16 port, *sport; switch (addr->sa_family) { case AF_INET: @@ -1765,6 +1765,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) module_put(THIS_MODULE); goto out_err_crypto_release; } + tcp_ses->tcpStatus = CifsNeedNegotiate; /* thread spawned, put it on the list */ spin_lock(&cifs_tcp_ses_lock); @@ -1808,7 +1809,9 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) break; default: /* anything else takes username/password */ - if (strncmp(ses->userName, vol->username, + if (ses->user_name == NULL) + continue; + if (strncmp(ses->user_name, vol->username, MAX_USERNAME_SIZE)) continue; if (strlen(vol->username) != 0 && @@ -1851,6 +1854,8 @@ cifs_put_smb_ses(struct cifsSesInfo *ses) cifs_put_tcp_session(server); } +static bool warned_on_ntlm; /* globals init to false automatically */ + static struct cifsSesInfo * cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) { @@ -1906,9 +1911,11 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) else sprintf(ses->serverName, "%pI4", &addr->sin_addr); - if (volume_info->username) - strncpy(ses->userName, volume_info->username, - MAX_USERNAME_SIZE); + if (volume_info->username) { + ses->user_name = kstrdup(volume_info->username, GFP_KERNEL); + if (!ses->user_name) + goto get_ses_fail; + } /* volume_info->password freed at unmount */ if (volume_info->password) { @@ -1923,6 +1930,15 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) } ses->cred_uid = volume_info->cred_uid; ses->linux_uid = volume_info->linux_uid; + + /* ntlmv2 is much stronger than ntlm security, and has been broadly + supported for many years, time to update default security mechanism */ + if ((volume_info->secFlg == 0) && warned_on_ntlm == false) { + warned_on_ntlm = true; + cERROR(1, "default security mechanism requested. The default " + "security mechanism will be upgraded from ntlm to " + "ntlmv2 in kernel release 2.6.41"); + } ses->overrideSecFlg = volume_info->secFlg; mutex_lock(&ses->session_mutex); @@ -2276,7 +2292,7 @@ static int generic_ip_connect(struct TCP_Server_Info *server) { int rc = 0; - unsigned short int sport; + __be16 sport; int slen, sfamily; struct socket *socket = server->ssocket; struct sockaddr *saddr; @@ -2361,7 +2377,7 @@ generic_ip_connect(struct TCP_Server_Info *server) static int ip_connect(struct TCP_Server_Info *server) { - unsigned short int *sport; + __be16 *sport; struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; @@ -2826,7 +2842,7 @@ try_mount_again: remote_path_check: /* check if a whole path (including prepath) is not remote */ - if (!rc && cifs_sb->prepathlen && tcon) { + if (!rc && tcon) { /* build_path_to_root works only when we have a valid tcon */ full_path = cifs_build_path_to_root(cifs_sb, tcon); if (full_path == NULL) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c27d236738fc..faf59529e847 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -575,8 +575,10 @@ reopen_error_exit: int cifs_close(struct inode *inode, struct file *file) { - cifsFileInfo_put(file->private_data); - file->private_data = NULL; + if (file->private_data != NULL) { + cifsFileInfo_put(file->private_data); + file->private_data = NULL; + } /* return code from the ->release op is always ignored */ return 0; @@ -970,6 +972,9 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, total_written += bytes_written) { rc = -EAGAIN; while (rc == -EAGAIN) { + struct kvec iov[2]; + unsigned int len; + if (open_file->invalidHandle) { /* we could deadlock if we called filemap_fdatawait from here so tell @@ -979,31 +984,14 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, if (rc != 0) break; } - if (experimEnabled || (pTcon->ses->server && - ((pTcon->ses->server->secMode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - == 0))) { - struct kvec iov[2]; - unsigned int len; - - len = min((size_t)cifs_sb->wsize, - write_size - total_written); - /* iov[0] is reserved for smb header */ - iov[1].iov_base = (char *)write_data + - total_written; - iov[1].iov_len = len; - rc = CIFSSMBWrite2(xid, pTcon, - open_file->netfid, len, - *poffset, &bytes_written, - iov, 1, 0); - } else - rc = CIFSSMBWrite(xid, pTcon, - open_file->netfid, - min_t(const int, cifs_sb->wsize, - write_size - total_written), - *poffset, &bytes_written, - write_data + total_written, - NULL, 0); + + len = min((size_t)cifs_sb->wsize, + write_size - total_written); + /* iov[0] is reserved for smb header */ + iov[1].iov_base = (char *)write_data + total_written; + iov[1].iov_len = len; + rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid, len, + *poffset, &bytes_written, iov, 1, 0); } if (rc || (bytes_written == 0)) { if (total_written) @@ -1240,12 +1228,6 @@ static int cifs_writepages(struct address_space *mapping, } tcon = tlink_tcon(open_file->tlink); - if (!experimEnabled && tcon->ses->server->secMode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { - cifsFileInfo_put(open_file); - kfree(iov); - return generic_writepages(mapping, wbc); - } cifsFileInfo_put(open_file); xid = GetXid(); @@ -1980,6 +1962,24 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, return total_read; } +/* + * If the page is mmap'ed into a process' page tables, then we need to make + * sure that it doesn't change while being written back. + */ +static int +cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + + lock_page(page); + return VM_FAULT_LOCKED; +} + +static struct vm_operations_struct cifs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = cifs_page_mkwrite, +}; + int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) { int rc, xid; @@ -1991,6 +1991,8 @@ int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) cifs_invalidate_mapping(inode); rc = generic_file_mmap(file, vma); + if (rc == 0) + vma->vm_ops = &cifs_file_vm_ops; FreeXid(xid); return rc; } @@ -2007,6 +2009,8 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) return rc; } rc = generic_file_mmap(file, vma); + if (rc == 0) + vma->vm_ops = &cifs_file_vm_ops; FreeXid(xid); return rc; } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index e8804d373404..ce417a9764a3 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -239,7 +239,7 @@ CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon, if (rc != 0) return rc; - if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) { + if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { CIFSSMBClose(xid, tcon, netfid); /* it's not a symlink */ return -EINVAL; @@ -316,7 +316,7 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr, if (rc != 0) goto out; - if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) { + if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { CIFSSMBClose(xid, pTcon, netfid); /* it's not a symlink */ goto out; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 2a930a752a78..0c684ae4c071 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -100,6 +100,7 @@ sesInfoFree(struct cifsSesInfo *buf_to_free) memset(buf_to_free->password, 0, strlen(buf_to_free->password)); kfree(buf_to_free->password); } + kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); kfree(buf_to_free); } @@ -520,7 +521,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) (struct smb_com_transaction_change_notify_rsp *)buf; struct file_notify_information *pnotify; __u32 data_offset = 0; - if (pSMBr->ByteCount > sizeof(struct file_notify_information)) { + if (get_bcc_le(buf) > sizeof(struct file_notify_information)) { data_offset = le32_to_cpu(pSMBr->DataOffset); pnotify = (struct file_notify_information *) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 16765703131b..f6728eb6f4b9 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -219,12 +219,12 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses, bcc_ptr++; } */ /* copy user */ - if (ses->userName == NULL) { + if (ses->user_name == NULL) { /* null user mount */ *bcc_ptr = 0; *(bcc_ptr+1) = 0; } else { - bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName, + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->user_name, MAX_USERNAME_SIZE, nls_cp); } bcc_ptr += 2 * bytes_ret; @@ -244,12 +244,11 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses, /* copy user */ /* BB what about null user mounts - check that we do this BB */ /* copy user */ - if (ses->userName == NULL) { - /* BB what about null user mounts - check that we do this BB */ - } else { - strncpy(bcc_ptr, ses->userName, MAX_USERNAME_SIZE); - } - bcc_ptr += strnlen(ses->userName, MAX_USERNAME_SIZE); + if (ses->user_name != NULL) + strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE); + /* else null user mount */ + + bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE); *bcc_ptr = 0; bcc_ptr++; /* account for null termination */ @@ -405,8 +404,8 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, /* BB spec says that if AvId field of MsvAvTimestamp is populated then we must set the MIC field of the AUTHENTICATE_MESSAGE */ ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags); - tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset); - tilen = cpu_to_le16(pblob->TargetInfoArray.Length); + tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset); + tilen = le16_to_cpu(pblob->TargetInfoArray.Length); if (tilen) { ses->auth_key.response = kmalloc(tilen, GFP_KERNEL); if (!ses->auth_key.response) { @@ -523,14 +522,14 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, tmp += len; } - if (ses->userName == NULL) { + if (ses->user_name == NULL) { sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); sec_blob->UserName.Length = 0; sec_blob->UserName.MaximumLength = 0; tmp += 2; } else { int len; - len = cifs_strtoUCS((__le16 *)tmp, ses->userName, + len = cifs_strtoUCS((__le16 *)tmp, ses->user_name, MAX_USERNAME_SIZE, nls_cp); len *= 2; /* unicode is 2 bytes each */ sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); diff --git a/fs/dcache.c b/fs/dcache.c index ad25c4cec7d5..129a35730994 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2131,7 +2131,7 @@ EXPORT_SYMBOL(d_rehash); */ void dentry_update_name_case(struct dentry *dentry, struct qstr *name) { - BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex)); BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ spin_lock(&dentry->d_lock); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index e25e99bf7ee1..d0f53538a57f 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -86,8 +86,8 @@ #ifdef CONFIG_QUOTA /* Amount of blocks needed for quota update - we know that the structure was - * allocated so we need to update only inode+data */ -#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) + * allocated so we need to update only data block */ +#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0) /* Amount of blocks needed for quota insert/delete - we do some block writes * but inode, sb and group updates are done only once */ #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 4673bc05274f..e9473cbe80df 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -125,9 +125,11 @@ extern int ext4_flush_completed_IO(struct inode *inode) * the parent directory's parent as well, and so on recursively, if * they are also freshly created. */ -static void ext4_sync_parent(struct inode *inode) +static int ext4_sync_parent(struct inode *inode) { + struct writeback_control wbc; struct dentry *dentry = NULL; + int ret = 0; while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); @@ -136,8 +138,17 @@ static void ext4_sync_parent(struct inode *inode) if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) break; inode = dentry->d_parent->d_inode; - sync_mapping_buffers(inode->i_mapping); + ret = sync_mapping_buffers(inode->i_mapping); + if (ret) + break; + memset(&wbc, 0, sizeof(wbc)); + wbc.sync_mode = WB_SYNC_ALL; + wbc.nr_to_write = 0; /* only write out the inode */ + ret = sync_inode(inode, &wbc); + if (ret) + break; } + return ret; } /* @@ -176,7 +187,7 @@ int ext4_sync_file(struct file *file, int datasync) if (!journal) { ret = generic_file_fsync(file, datasync); if (!ret && !list_empty(&inode->i_dentry)) - ext4_sync_parent(inode); + ret = ext4_sync_parent(inode); goto out; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ad8e303c0d29..f2fa5e8a582c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2502,6 +2502,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * for partial write. */ set_buffer_new(bh); + set_buffer_mapped(bh); } return 0; } @@ -4429,8 +4430,8 @@ void ext4_truncate(struct inode *inode) Indirect chain[4]; Indirect *partial; __le32 nr = 0; - int n; - ext4_lblk_t last_block; + int n = 0; + ext4_lblk_t last_block, max_block; unsigned blocksize = inode->i_sb->s_blocksize; trace_ext4_truncate_enter(inode); @@ -4455,14 +4456,18 @@ void ext4_truncate(struct inode *inode) last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); if (inode->i_size & (blocksize - 1)) if (ext4_block_truncate_page(handle, mapping, inode->i_size)) goto out_stop; - n = ext4_block_to_path(inode, last_block, offsets, NULL); - if (n == 0) - goto out_stop; /* error */ + if (last_block != max_block) { + n = ext4_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + goto out_stop; /* error */ + } /* * OK. This truncate is going to happen. We add the inode to the @@ -4493,7 +4498,13 @@ void ext4_truncate(struct inode *inode) */ ei->i_disksize = inode->i_size; - if (n == 1) { /* direct blocks */ + if (last_block == max_block) { + /* + * It is unnecessary to free any data blocks if last_block is + * equal to the indirect block limit. + */ + goto out_unlock; + } else if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); goto do_indirects; @@ -4553,6 +4564,7 @@ do_indirects: ; } +out_unlock: up_write(&ei->i_data_sem); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); @@ -5398,13 +5410,12 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, /* if nrblocks are contiguous */ if (chunk) { /* - * With N contiguous data blocks, it need at most - * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks - * 2 dindirect blocks - * 1 tindirect block + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block */ - indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); - return indirects + 3; + return DIV_ROUND_UP(nrblocks, + EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; } /* * if nrblocks are not contiguous, worse case, each block touch diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 056474b7b8e0..8553dfb310af 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -242,27 +242,44 @@ static void ext4_put_nojournal(handle_t *handle) * journal_end calls result in the superblock being marked dirty, so * that sync() will call the filesystem's write_super callback if * appropriate. + * + * To avoid j_barrier hold in userspace when a user calls freeze(), + * ext4 prevents a new handle from being started by s_frozen, which + * is in an upper layer. */ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) { journal_t *journal; + handle_t *handle; if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); - vfs_check_frozen(sb, SB_FREEZE_TRANS); - /* Special case here: if the journal has aborted behind our - * backs (eg. EIO in the commit thread), then we still need to - * take the FS itself readonly cleanly. */ journal = EXT4_SB(sb)->s_journal; - if (journal) { - if (is_journal_aborted(journal)) { - ext4_abort(sb, "Detected aborted journal"); - return ERR_PTR(-EROFS); - } - return jbd2_journal_start(journal, nblocks); + handle = ext4_journal_current_handle(); + + /* + * If a handle has been started, it should be allowed to + * finish, otherwise deadlock could happen between freeze + * and others(e.g. truncate) due to the restart of the + * journal handle if the filesystem is forzen and active + * handles are not stopped. + */ + if (!handle) + vfs_check_frozen(sb, SB_FREEZE_TRANS); + + if (!journal) + return ext4_get_nojournal(); + /* + * Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. + */ + if (is_journal_aborted(journal)) { + ext4_abort(sb, "Detected aborted journal"); + return ERR_PTR(-EROFS); } - return ext4_get_nojournal(); + return jbd2_journal_start(journal, nblocks); } /* @@ -2975,6 +2992,12 @@ static int ext4_register_li_request(struct super_block *sb, mutex_unlock(&ext4_li_info->li_list_mtx); sbi->s_li_request = elr; + /* + * set elr to NULL here since it has been inserted to + * the request_list and the removal and free of it is + * handled by ext4_clear_request_list from now on. + */ + elr = NULL; if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { ret = ext4_run_lazyinit_thread(); @@ -3385,6 +3408,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); + init_timer(&sbi->s_err_report); + sbi->s_err_report.function = print_daily_error_info; + sbi->s_err_report.data = (unsigned long) sb; + err = percpu_counter_init(&sbi->s_freeblocks_counter, ext4_count_free_blocks(sb)); if (!err) { @@ -3646,9 +3673,6 @@ no_journal: "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, *sbi->s_es->s_mount_opts ? "; " : "", orig_data); - init_timer(&sbi->s_err_report); - sbi->s_err_report.function = print_daily_error_info; - sbi->s_err_report.data = (unsigned long) sb; if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ @@ -3672,6 +3696,7 @@ failed_mount_wq: sbi->s_journal = NULL; } failed_mount3: + del_timer(&sbi->s_err_report); if (sbi->s_flex_groups) { if (is_vmalloc_addr(sbi->s_flex_groups)) vfree(sbi->s_flex_groups); @@ -4138,6 +4163,11 @@ static int ext4_sync_fs(struct super_block *sb, int wait) /* * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. + * + * Note that only this function cannot bring a filesystem to be in a clean + * state independently, because ext4 prevents a new handle from being started + * by @sb->s_frozen, which stays in an upper layer. It thus needs help from + * the upper layer. */ static int ext4_freeze(struct super_block *sb) { @@ -4614,11 +4644,24 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, static int ext4_quota_off(struct super_block *sb, int type) { + struct inode *inode = sb_dqopt(sb)->files[type]; + handle_t *handle; + /* Force all delayed allocation blocks to be allocated. * Caller already holds s_umount sem */ if (test_opt(sb, DELALLOC)) sync_filesystem(sb); + /* Update modification times of quota files when userspace can + * start looking at them */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) + goto out; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + +out: return dquot_quota_off(sb, type); } @@ -4714,9 +4757,8 @@ out: if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; + ext4_mark_inode_dirty(handle, inode); } - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - ext4_mark_inode_dirty(handle, inode); mutex_unlock(&inode->i_mutex); return len; } diff --git a/fs/fhandle.c b/fs/fhandle.c index bf93ad2bee07..6b088641f5bf 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -7,6 +7,7 @@ #include <linux/exportfs.h> #include <linux/fs_struct.h> #include <linux/fsnotify.h> +#include <linux/personality.h> #include <asm/uaccess.h> #include "internal.h" diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 20af62f4304b..6e28000a4b21 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -105,6 +105,8 @@ static int journal_submit_commit_record(journal_t *journal, int ret; struct timespec now = current_kernel_time(); + *cbh = NULL; + if (is_journal_aborted(journal)) return 0; @@ -806,7 +808,7 @@ wait_for_iobuf: if (err) __jbd2_journal_abort_hard(journal); } - if (!err && !is_journal_aborted(journal)) + if (cbh) err = journal_wait_on_commit_record(journal, cbh); if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) && diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index aba8ebaec25c..e0ec3db1c395 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2413,10 +2413,12 @@ const char *jbd2_dev_to_name(dev_t device) new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); if (!new_dev) return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ + bd = bdget(device); spin_lock(&devname_cache_lock); if (devcache[i]) { if (devcache[i]->device == device) { kfree(new_dev); + bdput(bd); ret = devcache[i]->devname; spin_unlock(&devname_cache_lock); return ret; @@ -2425,7 +2427,6 @@ const char *jbd2_dev_to_name(dev_t device) } devcache[i] = new_dev; devcache[i]->device = device; - bd = bdget(device); if (bd) { bdevname(bd, devcache[i]->devname); bdput(bd); diff --git a/fs/namespace.c b/fs/namespace.c index 7dba2ed03429..d99bcf59e4c2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1030,18 +1030,6 @@ const struct seq_operations mounts_op = { .show = show_vfsmnt }; -static int uuid_is_nil(u8 *uuid) -{ - int i; - u8 *cp = (u8 *)uuid; - - for (i = 0; i < 16; i++) { - if (*cp++) - return 0; - } - return 1; -} - static int show_mountinfo(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; @@ -1085,10 +1073,6 @@ static int show_mountinfo(struct seq_file *m, void *v) if (IS_MNT_UNBINDABLE(mnt)) seq_puts(m, " unbindable"); - if (!uuid_is_nil(mnt->mnt_sb->s_uuid)) - /* print the uuid */ - seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid); - /* Filesystem specific data */ seq_puts(m, " - "); show_type(m, sb); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index af0c6279a4a7..e4cbc11a74ab 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -542,11 +542,15 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, u if (!nfs_need_commit(nfsi)) return 0; + spin_lock(&inode->i_lock); ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); if (ret > 0) nfsi->ncommit -= ret; + spin_unlock(&inode->i_lock); + if (nfs_need_commit(NFS_I(inode))) __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return ret; } #else @@ -1483,9 +1487,7 @@ int nfs_commit_inode(struct inode *inode, int how) res = nfs_commit_set_lock(NFS_I(inode), may_wait); if (res <= 0) goto out_mark_dirty; - spin_lock(&inode->i_lock); res = nfs_scan_commit(inode, &head, 0, 0); - spin_unlock(&inode->i_lock); if (res) { int error; diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 0c6d81670137..7c831a2731fa 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -38,7 +38,6 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) exp_readlock(); nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp); fh_put(&fh); - rqstp->rq_client = NULL; exp_readunlock(); /* We return nlm error codes as nlm doesn't know * about nfsd, but nfsd does know about nlm.. diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4b36ec3eb8ea..aa309aa93fe8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -397,10 +397,13 @@ static void unhash_generic_stateid(struct nfs4_stateid *stp) static void free_generic_stateid(struct nfs4_stateid *stp) { - int oflag = nfs4_access_bmap_to_omode(stp); + int oflag; - nfs4_file_put_access(stp->st_file, oflag); - put_nfs4_file(stp->st_file); + if (stp->st_access_bmap) { + oflag = nfs4_access_bmap_to_omode(stp); + nfs4_file_put_access(stp->st_file, oflag); + put_nfs4_file(stp->st_file); + } kmem_cache_free(stateid_slab, stp); } diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index b10e3540d5b7..ce4f62440425 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -1299,6 +1299,11 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) BUG_ON (!data || !frags); + if (size < 2 * VBLK_SIZE_HEAD) { + ldm_error("Value of size is to small."); + return false; + } + group = get_unaligned_be32(data + 0x08); rec = get_unaligned_be16(data + 0x0C); num = get_unaligned_be16(data + 0x0E); @@ -1306,6 +1311,10 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) ldm_error ("A VBLK claims to have %d parts.", num); return false; } + if (rec >= num) { + ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num); + return false; + } list_for_each (item, frags) { f = list_entry (item, struct frag, list); @@ -1334,10 +1343,9 @@ found: f->map |= (1 << rec); - if (num > 0) { - data += VBLK_SIZE_HEAD; - size -= VBLK_SIZE_HEAD; - } + data += VBLK_SIZE_HEAD; + size -= VBLK_SIZE_HEAD; + memcpy (f->data+rec*(size-VBLK_SIZE_HEAD)+VBLK_SIZE_HEAD, data, size); return true; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 9eead2c796b7..fbb0b478a346 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -112,6 +112,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) SetPageDirty(page); unlock_page(page); + put_page(page); } return 0; diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 919f0de29d8f..e6493cac193d 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -23,6 +23,12 @@ #ifndef __UBIFS_DEBUG_H__ #define __UBIFS_DEBUG_H__ +/* Checking helper functions */ +typedef int (*dbg_leaf_callback)(struct ubifs_info *c, + struct ubifs_zbranch *zbr, void *priv); +typedef int (*dbg_znode_callback)(struct ubifs_info *c, + struct ubifs_znode *znode, void *priv); + #ifdef CONFIG_UBIFS_FS_DEBUG /** @@ -270,11 +276,6 @@ void dbg_dump_tnc(struct ubifs_info *c); void dbg_dump_index(struct ubifs_info *c); void dbg_dump_lpt_lebs(const struct ubifs_info *c); -/* Checking helper functions */ -typedef int (*dbg_leaf_callback)(struct ubifs_info *c, - struct ubifs_zbranch *zbr, void *priv); -typedef int (*dbg_znode_callback)(struct ubifs_info *c, - struct ubifs_znode *znode, void *priv); int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, dbg_znode_callback znode_cb, void *priv); @@ -295,7 +296,6 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); int dbg_check_filesystem(struct ubifs_info *c); void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, int add_pos); -int dbg_check_lprops(struct ubifs_info *c); int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, int row, int col); int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, @@ -401,58 +401,94 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); #define DBGKEY(key) ((char *)(key)) #define DBGKEY1(key) ((char *)(key)) -#define ubifs_debugging_init(c) 0 -#define ubifs_debugging_exit(c) ({}) - -#define dbg_ntype(type) "" -#define dbg_cstate(cmt_state) "" -#define dbg_jhead(jhead) "" -#define dbg_get_key_dump(c, key) ({}) -#define dbg_dump_inode(c, inode) ({}) -#define dbg_dump_node(c, node) ({}) -#define dbg_dump_lpt_node(c, node, lnum, offs) ({}) -#define dbg_dump_budget_req(req) ({}) -#define dbg_dump_lstats(lst) ({}) -#define dbg_dump_budg(c) ({}) -#define dbg_dump_lprop(c, lp) ({}) -#define dbg_dump_lprops(c) ({}) -#define dbg_dump_lpt_info(c) ({}) -#define dbg_dump_leb(c, lnum) ({}) -#define dbg_dump_znode(c, znode) ({}) -#define dbg_dump_heap(c, heap, cat) ({}) -#define dbg_dump_pnode(c, pnode, parent, iip) ({}) -#define dbg_dump_tnc(c) ({}) -#define dbg_dump_index(c) ({}) -#define dbg_dump_lpt_lebs(c) ({}) - -#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 -#define dbg_old_index_check_init(c, zroot) 0 -#define dbg_save_space_info(c) ({}) -#define dbg_check_space_info(c) 0 -#define dbg_check_old_index(c, zroot) 0 -#define dbg_check_cats(c) 0 -#define dbg_check_ltab(c) 0 -#define dbg_chk_lpt_free_spc(c) 0 -#define dbg_chk_lpt_sz(c, action, len) 0 -#define dbg_check_synced_i_size(inode) 0 -#define dbg_check_dir_size(c, dir) 0 -#define dbg_check_tnc(c, x) 0 -#define dbg_check_idx_size(c, idx_size) 0 -#define dbg_check_filesystem(c) 0 -#define dbg_check_heap(c, heap, cat, add_pos) ({}) -#define dbg_check_lprops(c) 0 -#define dbg_check_lpt_nodes(c, cnode, row, col) 0 -#define dbg_check_inode_size(c, inode, size) 0 -#define dbg_check_data_nodes_order(c, head) 0 -#define dbg_check_nondata_nodes_order(c, head) 0 -#define dbg_force_in_the_gaps_enabled 0 -#define dbg_force_in_the_gaps() 0 -#define dbg_failure_mode 0 - -#define dbg_debugfs_init() 0 -#define dbg_debugfs_exit() -#define dbg_debugfs_init_fs(c) 0 -#define dbg_debugfs_exit_fs(c) 0 +static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; } +static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; } +static inline const char *dbg_ntype(int type) { return ""; } +static inline const char *dbg_cstate(int cmt_state) { return ""; } +static inline const char *dbg_jhead(int jhead) { return ""; } +static inline const char * +dbg_get_key_dump(const struct ubifs_info *c, + const union ubifs_key *key) { return ""; } +static inline void dbg_dump_inode(const struct ubifs_info *c, + const struct inode *inode) { return; } +static inline void dbg_dump_node(const struct ubifs_info *c, + const void *node) { return; } +static inline void dbg_dump_lpt_node(const struct ubifs_info *c, + void *node, int lnum, + int offs) { return; } +static inline void +dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; } +static inline void +dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; } +static inline void dbg_dump_budg(struct ubifs_info *c) { return; } +static inline void dbg_dump_lprop(const struct ubifs_info *c, + const struct ubifs_lprops *lp) { return; } +static inline void dbg_dump_lprops(struct ubifs_info *c) { return; } +static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; } +static inline void dbg_dump_leb(const struct ubifs_info *c, + int lnum) { return; } +static inline void +dbg_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode) { return; } +static inline void dbg_dump_heap(struct ubifs_info *c, + struct ubifs_lpt_heap *heap, + int cat) { return; } +static inline void dbg_dump_pnode(struct ubifs_info *c, + struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, + int iip) { return; } +static inline void dbg_dump_tnc(struct ubifs_info *c) { return; } +static inline void dbg_dump_index(struct ubifs_info *c) { return; } +static inline void dbg_dump_lpt_lebs(const struct ubifs_info *c) { return; } + +static inline int dbg_walk_index(struct ubifs_info *c, + dbg_leaf_callback leaf_cb, + dbg_znode_callback znode_cb, + void *priv) { return 0; } +static inline void dbg_save_space_info(struct ubifs_info *c) { return; } +static inline int dbg_check_space_info(struct ubifs_info *c) { return 0; } +static inline int dbg_check_lprops(struct ubifs_info *c) { return 0; } +static inline int +dbg_old_index_check_init(struct ubifs_info *c, + struct ubifs_zbranch *zroot) { return 0; } +static inline int +dbg_check_old_index(struct ubifs_info *c, + struct ubifs_zbranch *zroot) { return 0; } +static inline int dbg_check_cats(struct ubifs_info *c) { return 0; } +static inline int dbg_check_ltab(struct ubifs_info *c) { return 0; } +static inline int dbg_chk_lpt_free_spc(struct ubifs_info *c) { return 0; } +static inline int dbg_chk_lpt_sz(struct ubifs_info *c, + int action, int len) { return 0; } +static inline int dbg_check_synced_i_size(struct inode *inode) { return 0; } +static inline int dbg_check_dir_size(struct ubifs_info *c, + const struct inode *dir) { return 0; } +static inline int dbg_check_tnc(struct ubifs_info *c, int extra) { return 0; } +static inline int dbg_check_idx_size(struct ubifs_info *c, + long long idx_size) { return 0; } +static inline int dbg_check_filesystem(struct ubifs_info *c) { return 0; } +static inline void dbg_check_heap(struct ubifs_info *c, + struct ubifs_lpt_heap *heap, + int cat, int add_pos) { return; } +static inline int dbg_check_lpt_nodes(struct ubifs_info *c, + struct ubifs_cnode *cnode, int row, int col) { return 0; } +static inline int dbg_check_inode_size(struct ubifs_info *c, + const struct inode *inode, + loff_t size) { return 0; } +static inline int +dbg_check_data_nodes_order(struct ubifs_info *c, + struct list_head *head) { return 0; } +static inline int +dbg_check_nondata_nodes_order(struct ubifs_info *c, + struct list_head *head) { return 0; } + +static inline int dbg_force_in_the_gaps(void) { return 0; } +#define dbg_force_in_the_gaps_enabled 0 +#define dbg_failure_mode 0 + +static inline int dbg_debugfs_init(void) { return 0; } +static inline void dbg_debugfs_exit(void) { return; } +static inline int dbg_debugfs_init_fs(struct ubifs_info *c) { return 0; } +static inline int dbg_debugfs_exit_fs(struct ubifs_info *c) { return 0; } #endif /* !CONFIG_UBIFS_FS_DEBUG */ #endif /* !__UBIFS_DEBUG_H__ */ diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 28be1e6a65e8..b286db79c686 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1312,6 +1312,9 @@ int ubifs_fsync(struct file *file, int datasync) dbg_gen("syncing inode %lu", inode->i_ino); + if (inode->i_sb->s_flags & MS_RDONLY) + return 0; + /* * VFS has already synchronized dirty pages for this inode. Synchronize * the inode unless this is a 'datasync()' call. diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 5ea402023ebd..9ef9ed2cfe2e 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -293,7 +293,6 @@ xfs_buf_allocate_memory( size_t nbytes, offset; gfp_t gfp_mask = xb_to_gfp(flags); unsigned short page_count, i; - pgoff_t first; xfs_off_t end; int error; @@ -333,7 +332,6 @@ use_alloc_page: return error; offset = bp->b_offset; - first = bp->b_file_offset >> PAGE_SHIFT; bp->b_flags |= _XBF_PAGES; for (i = 0; i < bp->b_page_count; i++) { @@ -657,8 +655,6 @@ xfs_buf_readahead( xfs_off_t ioff, size_t isize) { - struct backing_dev_info *bdi; - if (bdi_read_congested(target->bt_bdi)) return; @@ -919,8 +915,6 @@ xfs_buf_lock( if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) xfs_log_force(bp->b_target->bt_mount, 0); - if (atomic_read(&bp->b_io_remaining)) - blk_flush_plug(current); down(&bp->b_sema); XB_SET_OWNER(bp); @@ -1309,8 +1303,6 @@ xfs_buf_iowait( { trace_xfs_buf_iowait(bp, _RET_IP_); - if (atomic_read(&bp->b_io_remaining)) - blk_flush_plug(current); wait_for_completion(&bp->b_iowait); trace_xfs_buf_iowait_done(bp, _RET_IP_); @@ -1747,8 +1739,8 @@ xfsbufd( do { long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); - int count = 0; struct list_head tmp; + struct blk_plug plug; if (unlikely(freezing(current))) { set_bit(XBT_FORCE_SLEEP, &target->bt_flags); @@ -1764,16 +1756,15 @@ xfsbufd( xfs_buf_delwri_split(target, &tmp, age); list_sort(NULL, &tmp, xfs_buf_cmp); + + blk_start_plug(&plug); while (!list_empty(&tmp)) { struct xfs_buf *bp; bp = list_first_entry(&tmp, struct xfs_buf, b_list); list_del_init(&bp->b_list); xfs_bdstrat_cb(bp); - count++; } - if (count) - blk_flush_plug(current); - + blk_finish_plug(&plug); } while (!kthread_should_stop()); return 0; @@ -1793,6 +1784,7 @@ xfs_flush_buftarg( int pincount = 0; LIST_HEAD(tmp_list); LIST_HEAD(wait_list); + struct blk_plug plug; xfs_buf_runall_queues(xfsconvertd_workqueue); xfs_buf_runall_queues(xfsdatad_workqueue); @@ -1807,6 +1799,8 @@ xfs_flush_buftarg( * we do that after issuing all the IO. */ list_sort(NULL, &tmp_list, xfs_buf_cmp); + + blk_start_plug(&plug); while (!list_empty(&tmp_list)) { bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); ASSERT(target == bp->b_target); @@ -1817,10 +1811,10 @@ xfs_flush_buftarg( } xfs_bdstrat_cb(bp); } + blk_finish_plug(&plug); if (wait) { - /* Expedite and wait for IO to complete. */ - blk_flush_plug(current); + /* Wait for IO to complete. */ while (!list_empty(&wait_list)) { bp = list_first_entry(&wait_list, struct xfs_buf, b_list); diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c index 508e06fd7d1e..3ca795609113 100644 --- a/fs/xfs/linux-2.6/xfs_message.c +++ b/fs/xfs/linux-2.6/xfs_message.c @@ -28,53 +28,47 @@ /* * XFS logging functions */ -static int +static void __xfs_printk( const char *level, const struct xfs_mount *mp, struct va_format *vaf) { if (mp && mp->m_fsname) - return printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf); - return printk("%sXFS: %pV\n", level, vaf); + printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf); + printk("%sXFS: %pV\n", level, vaf); } -int xfs_printk( +void xfs_printk( const char *level, const struct xfs_mount *mp, const char *fmt, ...) { struct va_format vaf; va_list args; - int r; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - r = __xfs_printk(level, mp, &vaf); + __xfs_printk(level, mp, &vaf); va_end(args); - - return r; } #define define_xfs_printk_level(func, kern_level) \ -int func(const struct xfs_mount *mp, const char *fmt, ...) \ +void func(const struct xfs_mount *mp, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ - int r; \ \ va_start(args, fmt); \ \ vaf.fmt = fmt; \ vaf.va = &args; \ \ - r = __xfs_printk(kern_level, mp, &vaf); \ + __xfs_printk(kern_level, mp, &vaf); \ va_end(args); \ - \ - return r; \ } \ define_xfs_printk_level(xfs_emerg, KERN_EMERG); @@ -88,7 +82,7 @@ define_xfs_printk_level(xfs_info, KERN_INFO); define_xfs_printk_level(xfs_debug, KERN_DEBUG); #endif -int +void xfs_alert_tag( const struct xfs_mount *mp, int panic_tag, @@ -97,7 +91,6 @@ xfs_alert_tag( struct va_format vaf; va_list args; int do_panic = 0; - int r; if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { xfs_printk(KERN_ALERT, mp, @@ -110,12 +103,10 @@ xfs_alert_tag( vaf.fmt = fmt; vaf.va = &args; - r = __xfs_printk(KERN_ALERT, mp, &vaf); + __xfs_printk(KERN_ALERT, mp, &vaf); va_end(args); BUG_ON(do_panic); - - return r; } void diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h index e77ffa16745b..f1b3fc1b6c4e 100644 --- a/fs/xfs/linux-2.6/xfs_message.h +++ b/fs/xfs/linux-2.6/xfs_message.h @@ -3,32 +3,34 @@ struct xfs_mount; -extern int xfs_printk(const char *level, const struct xfs_mount *mp, +extern void xfs_printk(const char *level, const struct xfs_mount *mp, const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); -extern int xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...) +extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); -extern int xfs_alert(const struct xfs_mount *mp, const char *fmt, ...) +extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); -extern int xfs_alert_tag(const struct xfs_mount *mp, int tag, +extern void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); -extern int xfs_crit(const struct xfs_mount *mp, const char *fmt, ...) +extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); -extern int xfs_err(const struct xfs_mount *mp, const char *fmt, ...) +extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); -extern int xfs_warn(const struct xfs_mount *mp, const char *fmt, ...) +extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); -extern int xfs_notice(const struct xfs_mount *mp, const char *fmt, ...) +extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); -extern int xfs_info(const struct xfs_mount *mp, const char *fmt, ...) +extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); #ifdef DEBUG -extern int xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) +extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); #else -#define xfs_debug(mp, fmt, ...) (0) +static inline void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) +{ +} #endif extern void assfail(char *expr, char *f, int l); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 1ba5c451da36..b38e58d02299 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -816,75 +816,6 @@ xfs_setup_devices( return 0; } -/* - * XFS AIL push thread support - */ -void -xfsaild_wakeup( - struct xfs_ail *ailp, - xfs_lsn_t threshold_lsn) -{ - /* only ever move the target forwards */ - if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) { - ailp->xa_target = threshold_lsn; - wake_up_process(ailp->xa_task); - } -} - -STATIC int -xfsaild( - void *data) -{ - struct xfs_ail *ailp = data; - xfs_lsn_t last_pushed_lsn = 0; - long tout = 0; /* milliseconds */ - - while (!kthread_should_stop()) { - /* - * for short sleeps indicating congestion, don't allow us to - * get woken early. Otherwise all we do is bang on the AIL lock - * without making progress. - */ - if (tout && tout <= 20) - __set_current_state(TASK_KILLABLE); - else - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(tout ? - msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); - - /* swsusp */ - try_to_freeze(); - - ASSERT(ailp->xa_mount->m_log); - if (XFS_FORCED_SHUTDOWN(ailp->xa_mount)) - continue; - - tout = xfsaild_push(ailp, &last_pushed_lsn); - } - - return 0; -} /* xfsaild */ - -int -xfsaild_start( - struct xfs_ail *ailp) -{ - ailp->xa_target = 0; - ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", - ailp->xa_mount->m_fsname); - if (IS_ERR(ailp->xa_task)) - return -PTR_ERR(ailp->xa_task); - return 0; -} - -void -xfsaild_stop( - struct xfs_ail *ailp) -{ - kthread_stop(ailp->xa_task); -} - - /* Catch misguided souls that try to use this interface on XFS */ STATIC struct inode * xfs_fs_alloc_inode( @@ -1191,22 +1122,12 @@ xfs_fs_sync_fs( return -error; if (laptop_mode) { - int prev_sync_seq = mp->m_sync_seq; - /* * The disk must be active because we're syncing. * We schedule xfssyncd now (now that the disk is * active) instead of later (when it might not be). */ - wake_up_process(mp->m_sync_task); - /* - * We have to wait for the sync iteration to complete. - * If we don't, the disk activity caused by the sync - * will come after the sync is completed, and that - * triggers another sync from laptop mode. - */ - wait_event(mp->m_wait_single_sync_task, - mp->m_sync_seq != prev_sync_seq); + flush_delayed_work_sync(&mp->m_sync_work); } return 0; @@ -1490,9 +1411,6 @@ xfs_fs_fill_super( spin_lock_init(&mp->m_sb_lock); mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); - INIT_LIST_HEAD(&mp->m_sync_list); - spin_lock_init(&mp->m_sync_lock); - init_waitqueue_head(&mp->m_wait_single_sync_task); mp->m_super = sb; sb->s_fs_info = mp; @@ -1799,6 +1717,38 @@ xfs_destroy_zones(void) } STATIC int __init +xfs_init_workqueues(void) +{ + /* + * max_active is set to 8 to give enough concurency to allow + * multiple work operations on each CPU to run. This allows multiple + * filesystems to be running sync work concurrently, and scales with + * the number of CPUs in the system. + */ + xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); + if (!xfs_syncd_wq) + goto out; + + xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8); + if (!xfs_ail_wq) + goto out_destroy_syncd; + + return 0; + +out_destroy_syncd: + destroy_workqueue(xfs_syncd_wq); +out: + return -ENOMEM; +} + +STATIC void +xfs_destroy_workqueues(void) +{ + destroy_workqueue(xfs_ail_wq); + destroy_workqueue(xfs_syncd_wq); +} + +STATIC int __init init_xfs_fs(void) { int error; @@ -1813,10 +1763,14 @@ init_xfs_fs(void) if (error) goto out; - error = xfs_mru_cache_init(); + error = xfs_init_workqueues(); if (error) goto out_destroy_zones; + error = xfs_mru_cache_init(); + if (error) + goto out_destroy_wq; + error = xfs_filestream_init(); if (error) goto out_mru_cache_uninit; @@ -1833,6 +1787,10 @@ init_xfs_fs(void) if (error) goto out_cleanup_procfs; + error = xfs_init_workqueues(); + if (error) + goto out_sysctl_unregister; + vfs_initquota(); error = register_filesystem(&xfs_fs_type); @@ -1850,6 +1808,8 @@ init_xfs_fs(void) xfs_filestream_uninit(); out_mru_cache_uninit: xfs_mru_cache_uninit(); + out_destroy_wq: + xfs_destroy_workqueues(); out_destroy_zones: xfs_destroy_zones(); out: @@ -1866,6 +1826,7 @@ exit_xfs_fs(void) xfs_buf_terminate(); xfs_filestream_uninit(); xfs_mru_cache_uninit(); + xfs_destroy_workqueues(); xfs_destroy_zones(); } diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 9cf35a688f53..e4f9c1b0836c 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -22,6 +22,7 @@ #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" +#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" @@ -39,6 +40,8 @@ #include <linux/kthread.h> #include <linux/freezer.h> +struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ + /* * The inode lookup is done in batches to keep the amount of lock traffic and * radix tree lookups to a minimum. The batch size is a trade off between @@ -431,62 +434,12 @@ xfs_quiesce_attr( xfs_unmountfs_writesb(mp); } -/* - * Enqueue a work item to be picked up by the vfs xfssyncd thread. - * Doing this has two advantages: - * - It saves on stack space, which is tight in certain situations - * - It can be used (with care) as a mechanism to avoid deadlocks. - * Flushing while allocating in a full filesystem requires both. - */ -STATIC void -xfs_syncd_queue_work( - struct xfs_mount *mp, - void *data, - void (*syncer)(struct xfs_mount *, void *), - struct completion *completion) -{ - struct xfs_sync_work *work; - - work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP); - INIT_LIST_HEAD(&work->w_list); - work->w_syncer = syncer; - work->w_data = data; - work->w_mount = mp; - work->w_completion = completion; - spin_lock(&mp->m_sync_lock); - list_add_tail(&work->w_list, &mp->m_sync_list); - spin_unlock(&mp->m_sync_lock); - wake_up_process(mp->m_sync_task); -} - -/* - * Flush delayed allocate data, attempting to free up reserved space - * from existing allocations. At this point a new allocation attempt - * has failed with ENOSPC and we are in the process of scratching our - * heads, looking about for more room... - */ -STATIC void -xfs_flush_inodes_work( - struct xfs_mount *mp, - void *arg) -{ - struct inode *inode = arg; - xfs_sync_data(mp, SYNC_TRYLOCK); - xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); - iput(inode); -} - -void -xfs_flush_inodes( - xfs_inode_t *ip) +static void +xfs_syncd_queue_sync( + struct xfs_mount *mp) { - struct inode *inode = VFS_I(ip); - DECLARE_COMPLETION_ONSTACK(completion); - - igrab(inode); - xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); - wait_for_completion(&completion); - xfs_log_force(ip->i_mount, XFS_LOG_SYNC); + queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work, + msecs_to_jiffies(xfs_syncd_centisecs * 10)); } /* @@ -496,9 +449,10 @@ xfs_flush_inodes( */ STATIC void xfs_sync_worker( - struct xfs_mount *mp, - void *unused) + struct work_struct *work) { + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_sync_work); int error; if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { @@ -508,73 +462,106 @@ xfs_sync_worker( error = xfs_fs_log_dummy(mp); else xfs_log_force(mp, 0); - xfs_reclaim_inodes(mp, 0); error = xfs_qm_sync(mp, SYNC_TRYLOCK); + + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); } - mp->m_sync_seq++; - wake_up(&mp->m_wait_single_sync_task); + + /* queue us up again */ + xfs_syncd_queue_sync(mp); } -STATIC int -xfssyncd( - void *arg) +/* + * Queue a new inode reclaim pass if there are reclaimable inodes and there + * isn't a reclaim pass already in progress. By default it runs every 5s based + * on the xfs syncd work default of 30s. Perhaps this should have it's own + * tunable, but that can be done if this method proves to be ineffective or too + * aggressive. + */ +static void +xfs_syncd_queue_reclaim( + struct xfs_mount *mp) { - struct xfs_mount *mp = arg; - long timeleft; - xfs_sync_work_t *work, *n; - LIST_HEAD (tmp); - - set_freezable(); - timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); - for (;;) { - if (list_empty(&mp->m_sync_list)) - timeleft = schedule_timeout_interruptible(timeleft); - /* swsusp */ - try_to_freeze(); - if (kthread_should_stop() && list_empty(&mp->m_sync_list)) - break; - spin_lock(&mp->m_sync_lock); - /* - * We can get woken by laptop mode, to do a sync - - * that's the (only!) case where the list would be - * empty with time remaining. - */ - if (!timeleft || list_empty(&mp->m_sync_list)) { - if (!timeleft) - timeleft = xfs_syncd_centisecs * - msecs_to_jiffies(10); - INIT_LIST_HEAD(&mp->m_sync_work.w_list); - list_add_tail(&mp->m_sync_work.w_list, - &mp->m_sync_list); - } - list_splice_init(&mp->m_sync_list, &tmp); - spin_unlock(&mp->m_sync_lock); + /* + * We can have inodes enter reclaim after we've shut down the syncd + * workqueue during unmount, so don't allow reclaim work to be queued + * during unmount. + */ + if (!(mp->m_super->s_flags & MS_ACTIVE)) + return; - list_for_each_entry_safe(work, n, &tmp, w_list) { - (*work->w_syncer)(mp, work->w_data); - list_del(&work->w_list); - if (work == &mp->m_sync_work) - continue; - if (work->w_completion) - complete(work->w_completion); - kmem_free(work); - } + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, + msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); } + rcu_read_unlock(); +} - return 0; +/* + * This is a fast pass over the inode cache to try to get reclaim moving on as + * many inodes as possible in a short period of time. It kicks itself every few + * seconds, as well as being kicked by the inode cache shrinker when memory + * goes low. It scans as quickly as possible avoiding locked inodes or those + * already being flushed, and once done schedules a future pass. + */ +STATIC void +xfs_reclaim_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_reclaim_work); + + xfs_reclaim_inodes(mp, SYNC_TRYLOCK); + xfs_syncd_queue_reclaim(mp); +} + +/* + * Flush delayed allocate data, attempting to free up reserved space + * from existing allocations. At this point a new allocation attempt + * has failed with ENOSPC and we are in the process of scratching our + * heads, looking about for more room. + * + * Queue a new data flush if there isn't one already in progress and + * wait for completion of the flush. This means that we only ever have one + * inode flush in progress no matter how many ENOSPC events are occurring and + * so will prevent the system from bogging down due to every concurrent + * ENOSPC event scanning all the active inodes in the system for writeback. + */ +void +xfs_flush_inodes( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + queue_work(xfs_syncd_wq, &mp->m_flush_work); + flush_work_sync(&mp->m_flush_work); +} + +STATIC void +xfs_flush_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(work, + struct xfs_mount, m_flush_work); + + xfs_sync_data(mp, SYNC_TRYLOCK); + xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); } int xfs_syncd_init( struct xfs_mount *mp) { - mp->m_sync_work.w_syncer = xfs_sync_worker; - mp->m_sync_work.w_mount = mp; - mp->m_sync_work.w_completion = NULL; - mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname); - if (IS_ERR(mp->m_sync_task)) - return -PTR_ERR(mp->m_sync_task); + INIT_WORK(&mp->m_flush_work, xfs_flush_worker); + INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + + xfs_syncd_queue_sync(mp); + xfs_syncd_queue_reclaim(mp); + return 0; } @@ -582,7 +569,9 @@ void xfs_syncd_stop( struct xfs_mount *mp) { - kthread_stop(mp->m_sync_task); + cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_reclaim_work); + cancel_work_sync(&mp->m_flush_work); } void @@ -601,6 +590,10 @@ __xfs_inode_set_reclaim_tag( XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), XFS_ICI_RECLAIM_TAG); spin_unlock(&ip->i_mount->m_perag_lock); + + /* schedule periodic background inode reclaim */ + xfs_syncd_queue_reclaim(ip->i_mount); + trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, -1, _RET_IP_); } @@ -1017,7 +1010,13 @@ xfs_reclaim_inodes( } /* - * Shrinker infrastructure. + * Inode cache shrinker. + * + * When called we make sure that there is a background (fast) inode reclaim in + * progress, while we will throttle the speed of reclaim via doiing synchronous + * reclaim of inodes. That means if we come across dirty inodes, we wait for + * them to be cleaned, which we hope will not be very long due to the + * background walker having already kicked the IO off on those dirty inodes. */ static int xfs_reclaim_inode_shrink( @@ -1032,10 +1031,15 @@ xfs_reclaim_inode_shrink( mp = container_of(shrink, struct xfs_mount, m_inode_shrink); if (nr_to_scan) { + /* kick background reclaimer and push the AIL */ + xfs_syncd_queue_reclaim(mp); + xfs_ail_push_all(mp->m_ail); + if (!(gfp_mask & __GFP_FS)) return -1; - xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan); + xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, + &nr_to_scan); /* terminate if we don't exhaust the scan */ if (nr_to_scan > 0) return -1; diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 32ba6628290c..e3a6ad27415f 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -32,6 +32,8 @@ typedef struct xfs_sync_work { #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ +extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ + int xfs_syncd_init(struct xfs_mount *mp); void xfs_syncd_stop(struct xfs_mount *mp); diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 254ee062bd7d..69228aa8605a 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -461,12 +461,10 @@ xfs_qm_dqflush_all( struct xfs_quotainfo *q = mp->m_quotainfo; int recl; struct xfs_dquot *dqp; - int niters; int error; if (!q) return 0; - niters = 0; again: mutex_lock(&q->qi_dqlist_lock); list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { @@ -1314,14 +1312,9 @@ xfs_qm_dqiter_bufs( { xfs_buf_t *bp; int error; - int notcommitted; - int incr; int type; ASSERT(blkcnt > 0); - notcommitted = 0; - incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ? - XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt; type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); error = 0; diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index c9446f1c726d..567b29b9f1b3 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -65,11 +65,6 @@ extern kmem_zone_t *qm_dqtrxzone; * block in the dquot/xqm code. */ #define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 -/* - * When doing a quotacheck, we log dquot clusters of this many FSBs at most - * in a single transaction. We don't want to ask for too huge a log reservation. - */ -#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3 typedef xfs_dqhash_t xfs_dqlist_t; diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 0d62a07b7fd8..2dadb15d5ca9 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -313,14 +313,12 @@ xfs_qm_scall_quotaon( { int error; uint qf; - uint accflags; __int64_t sbflags; flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); /* * Switching on quota accounting must be done at mount time. */ - accflags = flags & XFS_ALL_QUOTA_ACCT; flags &= ~(XFS_ALL_QUOTA_ACCT); sbflags = 0; diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 4bc3c649aee4..27d64d752eab 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2395,17 +2395,33 @@ xfs_free_extent( memset(&args, 0, sizeof(xfs_alloc_arg_t)); args.tp = tp; args.mp = tp->t_mountp; + + /* + * validate that the block number is legal - the enables us to detect + * and handle a silent filesystem corruption rather than crashing. + */ args.agno = XFS_FSB_TO_AGNO(args.mp, bno); - ASSERT(args.agno < args.mp->m_sb.sb_agcount); + if (args.agno >= args.mp->m_sb.sb_agcount) + return EFSCORRUPTED; + args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); + if (args.agbno >= args.mp->m_sb.sb_agblocks) + return EFSCORRUPTED; + args.pag = xfs_perag_get(args.mp, args.agno); - if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) + ASSERT(args.pag); + + error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); + if (error) goto error0; -#ifdef DEBUG - ASSERT(args.agbp != NULL); - ASSERT((args.agbno + len) <= - be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)); -#endif + + /* validate the extent size is legal now we have the agf locked */ + if (args.agbno + len > + be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { + error = EFSCORRUPTED; + goto error0; + } + error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); error0: xfs_perag_put(args.pag); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 46cc40131d4a..576fdfe81d60 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -198,6 +198,41 @@ xfs_inode_item_size( } /* + * xfs_inode_item_format_extents - convert in-core extents to on-disk form + * + * For either the data or attr fork in extent format, we need to endian convert + * the in-core extent as we place them into the on-disk inode. In this case, we + * need to do this conversion before we write the extents into the log. Because + * we don't have the disk inode to write into here, we allocate a buffer and + * format the extents into it via xfs_iextents_copy(). We free the buffer in + * the unlock routine after the copy for the log has been made. + * + * In the case of the data fork, the in-core and on-disk fork sizes can be + * different due to delayed allocation extents. We only log on-disk extents + * here, so always use the physical fork size to determine the size of the + * buffer we need to allocate. + */ +STATIC void +xfs_inode_item_format_extents( + struct xfs_inode *ip, + struct xfs_log_iovec *vecp, + int whichfork, + int type) +{ + xfs_bmbt_rec_t *ext_buffer; + + ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP); + if (whichfork == XFS_DATA_FORK) + ip->i_itemp->ili_extents_buf = ext_buffer; + else + ip->i_itemp->ili_aextents_buf = ext_buffer; + + vecp->i_addr = ext_buffer; + vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork); + vecp->i_type = type; +} + +/* * This is called to fill in the vector of log iovecs for the * given inode log item. It fills the first item with an inode * log format structure, the second with the on-disk inode structure, @@ -213,7 +248,6 @@ xfs_inode_item_format( struct xfs_inode *ip = iip->ili_inode; uint nvecs; size_t data_bytes; - xfs_bmbt_rec_t *ext_buffer; xfs_mount_t *mp; vecp->i_addr = &iip->ili_format; @@ -320,22 +354,8 @@ xfs_inode_item_format( } else #endif { - /* - * There are delayed allocation extents - * in the inode, or we need to convert - * the extents to on disk format. - * Use xfs_iextents_copy() - * to copy only the real extents into - * a separate buffer. We'll free the - * buffer in the unlock routine. - */ - ext_buffer = kmem_alloc(ip->i_df.if_bytes, - KM_SLEEP); - iip->ili_extents_buf = ext_buffer; - vecp->i_addr = ext_buffer; - vecp->i_len = xfs_iextents_copy(ip, ext_buffer, - XFS_DATA_FORK); - vecp->i_type = XLOG_REG_TYPE_IEXT; + xfs_inode_item_format_extents(ip, vecp, + XFS_DATA_FORK, XLOG_REG_TYPE_IEXT); } ASSERT(vecp->i_len <= ip->i_df.if_bytes); iip->ili_format.ilf_dsize = vecp->i_len; @@ -445,19 +465,12 @@ xfs_inode_item_format( */ vecp->i_addr = ip->i_afp->if_u1.if_extents; vecp->i_len = ip->i_afp->if_bytes; + vecp->i_type = XLOG_REG_TYPE_IATTR_EXT; #else ASSERT(iip->ili_aextents_buf == NULL); - /* - * Need to endian flip before logging - */ - ext_buffer = kmem_alloc(ip->i_afp->if_bytes, - KM_SLEEP); - iip->ili_aextents_buf = ext_buffer; - vecp->i_addr = ext_buffer; - vecp->i_len = xfs_iextents_copy(ip, ext_buffer, - XFS_ATTR_FORK); + xfs_inode_item_format_extents(ip, vecp, + XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT); #endif - vecp->i_type = XLOG_REG_TYPE_IATTR_EXT; iip->ili_format.ilf_asize = vecp->i_len; vecp++; nvecs++; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index dc1882adaf54..751e94fe1f77 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -204,7 +204,6 @@ xfs_bulkstat( xfs_agi_t *agi; /* agi header data */ xfs_agino_t agino; /* inode # in allocation group */ xfs_agnumber_t agno; /* allocation group number */ - xfs_daddr_t bno; /* inode cluster start daddr */ int chunkidx; /* current index into inode chunk */ int clustidx; /* current index into inode cluster */ xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ @@ -463,7 +462,6 @@ xfs_bulkstat( mp->m_sb.sb_inopblog); } ino = XFS_AGINO_TO_INO(mp, agno, agino); - bno = XFS_AGB_TO_DADDR(mp, agno, agbno); /* * Skip if this inode is free. */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 25efa9b8a602..b612ce4520ae 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -761,7 +761,7 @@ xfs_log_need_covered(xfs_mount_t *mp) break; case XLOG_STATE_COVER_NEED: case XLOG_STATE_COVER_NEED2: - if (!xfs_trans_ail_tail(log->l_ailp) && + if (!xfs_ail_min_lsn(log->l_ailp) && xlog_iclogs_empty(log)) { if (log->l_covered_state == XLOG_STATE_COVER_NEED) log->l_covered_state = XLOG_STATE_COVER_DONE; @@ -801,7 +801,7 @@ xlog_assign_tail_lsn( xfs_lsn_t tail_lsn; struct log *log = mp->m_log; - tail_lsn = xfs_trans_ail_tail(mp->m_ail); + tail_lsn = xfs_ail_min_lsn(mp->m_ail); if (!tail_lsn) tail_lsn = atomic64_read(&log->l_last_sync_lsn); @@ -1239,7 +1239,7 @@ xlog_grant_push_ail( * the filesystem is shutting down. */ if (!XLOG_FORCED_SHUTDOWN(log)) - xfs_trans_ail_push(log->l_ailp, threshold_lsn); + xfs_ail_push(log->l_ailp, threshold_lsn); } /* @@ -3407,6 +3407,17 @@ xlog_verify_dest_ptr( xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); } +/* + * Check to make sure the grant write head didn't just over lap the tail. If + * the cycles are the same, we can't be overlapping. Otherwise, make sure that + * the cycles differ by exactly one and check the byte count. + * + * This check is run unlocked, so can give false positives. Rather than assert + * on failures, use a warn-once flag and a panic tag to allow the admin to + * determine if they want to panic the machine when such an error occurs. For + * debug kernels this will have the same effect as using an assert but, unlinke + * an assert, it can be turned off at runtime. + */ STATIC void xlog_verify_grant_tail( struct log *log) @@ -3414,17 +3425,22 @@ xlog_verify_grant_tail( int tail_cycle, tail_blocks; int cycle, space; - /* - * Check to make sure the grant write head didn't just over lap the - * tail. If the cycles are the same, we can't be overlapping. - * Otherwise, make sure that the cycles differ by exactly one and - * check the byte count. - */ xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space); xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); if (tail_cycle != cycle) { - ASSERT(cycle - 1 == tail_cycle); - ASSERT(space <= BBTOB(tail_blocks)); + if (cycle - 1 != tail_cycle && + !(log->l_flags & XLOG_TAIL_WARN)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, + "%s: cycle - 1 != tail_cycle", __func__); + log->l_flags |= XLOG_TAIL_WARN; + } + + if (space > BBTOB(tail_blocks) && + !(log->l_flags & XLOG_TAIL_WARN)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, + "%s: space > BBTOB(tail_blocks)", __func__); + log->l_flags |= XLOG_TAIL_WARN; + } } } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index ffae692c9832..5864850e9e34 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -144,6 +144,7 @@ static inline uint xlog_get_client_id(__be32 i) #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being shutdown */ +#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ #ifdef __KERNEL__ /* diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a62e8971539d..19af0ab0d0c6 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -203,12 +203,9 @@ typedef struct xfs_mount { struct mutex m_icsb_mutex; /* balancer sync lock */ #endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ - struct task_struct *m_sync_task; /* generalised sync thread */ - xfs_sync_work_t m_sync_work; /* work item for VFS_SYNC */ - struct list_head m_sync_list; /* sync thread work item list */ - spinlock_t m_sync_lock; /* work item list lock */ - int m_sync_seq; /* sync thread generation no. */ - wait_queue_head_t m_wait_single_sync_task; + struct delayed_work m_sync_work; /* background sync work */ + struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct work_struct m_flush_work; /* background inode flush */ __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ struct shrinker m_inode_shrink; /* inode reclaim shrinker */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 12aff9584e29..acdb92f14d51 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -28,74 +28,138 @@ #include "xfs_trans_priv.h" #include "xfs_error.h" -STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t); -STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); -STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *); -STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *); +struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ #ifdef DEBUG -STATIC void xfs_ail_check(struct xfs_ail *, xfs_log_item_t *); -#else +/* + * Check that the list is sorted as it should be. + */ +STATIC void +xfs_ail_check( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + xfs_log_item_t *prev_lip; + + if (list_empty(&ailp->xa_ail)) + return; + + /* + * Check the next and previous entries are valid. + */ + ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); + prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); + + prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); + + +#ifdef XFS_TRANS_DEBUG + /* + * Walk the list checking lsn ordering, and that every entry has the + * XFS_LI_IN_AIL flag set. This is really expensive, so only do it + * when specifically debugging the transaction subsystem. + */ + prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); + list_for_each_entry(lip, &ailp->xa_ail, li_ail) { + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); + ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); + prev_lip = lip; + } +#endif /* XFS_TRANS_DEBUG */ +} +#else /* !DEBUG */ #define xfs_ail_check(a,l) #endif /* DEBUG */ +/* + * Return a pointer to the first item in the AIL. If the AIL is empty, then + * return NULL. + */ +static xfs_log_item_t * +xfs_ail_min( + struct xfs_ail *ailp) +{ + if (list_empty(&ailp->xa_ail)) + return NULL; + + return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); +} + + /* + * Return a pointer to the last item in the AIL. If the AIL is empty, then + * return NULL. + */ +static xfs_log_item_t * +xfs_ail_max( + struct xfs_ail *ailp) +{ + if (list_empty(&ailp->xa_ail)) + return NULL; + + return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail); +} + +/* + * Return a pointer to the item which follows the given item in the AIL. If + * the given item is the last item in the list, then return NULL. + */ +static xfs_log_item_t * +xfs_ail_next( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + if (lip->li_ail.next == &ailp->xa_ail) + return NULL; + + return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail); +} /* - * This is called by the log manager code to determine the LSN - * of the tail of the log. This is exactly the LSN of the first - * item in the AIL. If the AIL is empty, then this function - * returns 0. + * This is called by the log manager code to determine the LSN of the tail of + * the log. This is exactly the LSN of the first item in the AIL. If the AIL + * is empty, then this function returns 0. * - * We need the AIL lock in order to get a coherent read of the - * lsn of the last item in the AIL. + * We need the AIL lock in order to get a coherent read of the lsn of the last + * item in the AIL. */ xfs_lsn_t -xfs_trans_ail_tail( +xfs_ail_min_lsn( struct xfs_ail *ailp) { - xfs_lsn_t lsn; + xfs_lsn_t lsn = 0; xfs_log_item_t *lip; spin_lock(&ailp->xa_lock); lip = xfs_ail_min(ailp); - if (lip == NULL) { - lsn = (xfs_lsn_t)0; - } else { + if (lip) lsn = lip->li_lsn; - } spin_unlock(&ailp->xa_lock); return lsn; } /* - * xfs_trans_push_ail - * - * This routine is called to move the tail of the AIL forward. It does this by - * trying to flush items in the AIL whose lsns are below the given - * threshold_lsn. - * - * the push is run asynchronously in a separate thread, so we return the tail - * of the log right now instead of the tail after the push. This means we will - * either continue right away, or we will sleep waiting on the async thread to - * do its work. - * - * We do this unlocked - we only need to know whether there is anything in the - * AIL at the time we are called. We don't need to access the contents of - * any of the objects, so the lock is not needed. + * Return the maximum lsn held in the AIL, or zero if the AIL is empty. */ -void -xfs_trans_ail_push( - struct xfs_ail *ailp, - xfs_lsn_t threshold_lsn) +static xfs_lsn_t +xfs_ail_max_lsn( + struct xfs_ail *ailp) { - xfs_log_item_t *lip; + xfs_lsn_t lsn = 0; + xfs_log_item_t *lip; - lip = xfs_ail_min(ailp); - if (lip && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { - if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) - xfsaild_wakeup(ailp, threshold_lsn); - } + spin_lock(&ailp->xa_lock); + lip = xfs_ail_max(ailp); + if (lip) + lsn = lip->li_lsn; + spin_unlock(&ailp->xa_lock); + + return lsn; } /* @@ -236,16 +300,57 @@ out: } /* - * xfsaild_push does the work of pushing on the AIL. Returning a timeout of - * zero indicates that the caller should sleep until woken. + * splice the log item list into the AIL at the given LSN. */ -long -xfsaild_push( - struct xfs_ail *ailp, - xfs_lsn_t *last_lsn) +static void +xfs_ail_splice( + struct xfs_ail *ailp, + struct list_head *list, + xfs_lsn_t lsn) { - long tout = 0; - xfs_lsn_t last_pushed_lsn = *last_lsn; + xfs_log_item_t *next_lip; + + /* If the list is empty, just insert the item. */ + if (list_empty(&ailp->xa_ail)) { + list_splice(list, &ailp->xa_ail); + return; + } + + list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { + if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0) + break; + } + + ASSERT(&next_lip->li_ail == &ailp->xa_ail || + XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0); + + list_splice_init(list, &next_lip->li_ail); +} + +/* + * Delete the given item from the AIL. Return a pointer to the item. + */ +static void +xfs_ail_delete( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + xfs_ail_check(ailp, lip); + list_del(&lip->li_ail); + xfs_trans_ail_cursor_clear(ailp, lip); +} + +/* + * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself + * to run at a later time if there is more work to do to complete the push. + */ +STATIC void +xfs_ail_worker( + struct work_struct *work) +{ + struct xfs_ail *ailp = container_of(to_delayed_work(work), + struct xfs_ail, xa_work); + long tout; xfs_lsn_t target = ailp->xa_target; xfs_lsn_t lsn; xfs_log_item_t *lip; @@ -256,15 +361,15 @@ xfsaild_push( spin_lock(&ailp->xa_lock); xfs_trans_ail_cursor_init(ailp, cur); - lip = xfs_trans_ail_cursor_first(ailp, cur, *last_lsn); + lip = xfs_trans_ail_cursor_first(ailp, cur, ailp->xa_last_pushed_lsn); if (!lip || XFS_FORCED_SHUTDOWN(mp)) { /* * AIL is empty or our push has reached the end. */ xfs_trans_ail_cursor_done(ailp, cur); spin_unlock(&ailp->xa_lock); - *last_lsn = 0; - return tout; + ailp->xa_last_pushed_lsn = 0; + return; } XFS_STATS_INC(xs_push_ail); @@ -301,13 +406,13 @@ xfsaild_push( case XFS_ITEM_SUCCESS: XFS_STATS_INC(xs_push_ail_success); IOP_PUSH(lip); - last_pushed_lsn = lsn; + ailp->xa_last_pushed_lsn = lsn; break; case XFS_ITEM_PUSHBUF: XFS_STATS_INC(xs_push_ail_pushbuf); IOP_PUSHBUF(lip); - last_pushed_lsn = lsn; + ailp->xa_last_pushed_lsn = lsn; push_xfsbufd = 1; break; @@ -319,7 +424,7 @@ xfsaild_push( case XFS_ITEM_LOCKED: XFS_STATS_INC(xs_push_ail_locked); - last_pushed_lsn = lsn; + ailp->xa_last_pushed_lsn = lsn; stuck++; break; @@ -374,9 +479,23 @@ xfsaild_push( wake_up_process(mp->m_ddev_targp->bt_task); } + /* assume we have more work to do in a short while */ + tout = 10; if (!count) { /* We're past our target or empty, so idle */ - last_pushed_lsn = 0; + ailp->xa_last_pushed_lsn = 0; + + /* + * Check for an updated push target before clearing the + * XFS_AIL_PUSHING_BIT. If the target changed, we've got more + * work to do. Wait a bit longer before starting that work. + */ + smp_rmb(); + if (ailp->xa_target == target) { + clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags); + return; + } + tout = 50; } else if (XFS_LSN_CMP(lsn, target) >= 0) { /* * We reached the target so wait a bit longer for I/O to @@ -384,7 +503,7 @@ xfsaild_push( * start the next scan from the start of the AIL. */ tout = 50; - last_pushed_lsn = 0; + ailp->xa_last_pushed_lsn = 0; } else if ((stuck * 100) / count > 90) { /* * Either there is a lot of contention on the AIL or we @@ -396,14 +515,61 @@ xfsaild_push( * continuing from where we were. */ tout = 20; - } else { - /* more to do, but wait a short while before continuing */ - tout = 10; } - *last_lsn = last_pushed_lsn; - return tout; + + /* There is more to do, requeue us. */ + queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, + msecs_to_jiffies(tout)); +} + +/* + * This routine is called to move the tail of the AIL forward. It does this by + * trying to flush items in the AIL whose lsns are below the given + * threshold_lsn. + * + * The push is run asynchronously in a workqueue, which means the caller needs + * to handle waiting on the async flush for space to become available. + * We don't want to interrupt any push that is in progress, hence we only queue + * work if we set the pushing bit approriately. + * + * We do this unlocked - we only need to know whether there is anything in the + * AIL at the time we are called. We don't need to access the contents of + * any of the objects, so the lock is not needed. + */ +void +xfs_ail_push( + struct xfs_ail *ailp, + xfs_lsn_t threshold_lsn) +{ + xfs_log_item_t *lip; + + lip = xfs_ail_min(ailp); + if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) || + XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0) + return; + + /* + * Ensure that the new target is noticed in push code before it clears + * the XFS_AIL_PUSHING_BIT. + */ + smp_wmb(); + ailp->xa_target = threshold_lsn; + if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) + queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0); } +/* + * Push out all items in the AIL immediately + */ +void +xfs_ail_push_all( + struct xfs_ail *ailp) +{ + xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp); + + if (threshold_lsn) + xfs_ail_push(ailp, threshold_lsn); +} /* * This is to be called when an item is unlocked that may have @@ -615,7 +781,6 @@ xfs_trans_ail_init( xfs_mount_t *mp) { struct xfs_ail *ailp; - int error; ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); if (!ailp) @@ -624,15 +789,9 @@ xfs_trans_ail_init( ailp->xa_mount = mp; INIT_LIST_HEAD(&ailp->xa_ail); spin_lock_init(&ailp->xa_lock); - error = xfsaild_start(ailp); - if (error) - goto out_free_ailp; + INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker); mp->m_ail = ailp; return 0; - -out_free_ailp: - kmem_free(ailp); - return error; } void @@ -641,124 +800,6 @@ xfs_trans_ail_destroy( { struct xfs_ail *ailp = mp->m_ail; - xfsaild_stop(ailp); + cancel_delayed_work_sync(&ailp->xa_work); kmem_free(ailp); } - -/* - * splice the log item list into the AIL at the given LSN. - */ -STATIC void -xfs_ail_splice( - struct xfs_ail *ailp, - struct list_head *list, - xfs_lsn_t lsn) -{ - xfs_log_item_t *next_lip; - - /* - * If the list is empty, just insert the item. - */ - if (list_empty(&ailp->xa_ail)) { - list_splice(list, &ailp->xa_ail); - return; - } - - list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { - if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0) - break; - } - - ASSERT((&next_lip->li_ail == &ailp->xa_ail) || - (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)); - - list_splice_init(list, &next_lip->li_ail); - return; -} - -/* - * Delete the given item from the AIL. Return a pointer to the item. - */ -STATIC void -xfs_ail_delete( - struct xfs_ail *ailp, - xfs_log_item_t *lip) -{ - xfs_ail_check(ailp, lip); - list_del(&lip->li_ail); - xfs_trans_ail_cursor_clear(ailp, lip); -} - -/* - * Return a pointer to the first item in the AIL. - * If the AIL is empty, then return NULL. - */ -STATIC xfs_log_item_t * -xfs_ail_min( - struct xfs_ail *ailp) -{ - if (list_empty(&ailp->xa_ail)) - return NULL; - - return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); -} - -/* - * Return a pointer to the item which follows - * the given item in the AIL. If the given item - * is the last item in the list, then return NULL. - */ -STATIC xfs_log_item_t * -xfs_ail_next( - struct xfs_ail *ailp, - xfs_log_item_t *lip) -{ - if (lip->li_ail.next == &ailp->xa_ail) - return NULL; - - return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail); -} - -#ifdef DEBUG -/* - * Check that the list is sorted as it should be. - */ -STATIC void -xfs_ail_check( - struct xfs_ail *ailp, - xfs_log_item_t *lip) -{ - xfs_log_item_t *prev_lip; - - if (list_empty(&ailp->xa_ail)) - return; - - /* - * Check the next and previous entries are valid. - */ - ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); - prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); - if (&prev_lip->li_ail != &ailp->xa_ail) - ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); - - prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); - if (&prev_lip->li_ail != &ailp->xa_ail) - ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); - - -#ifdef XFS_TRANS_DEBUG - /* - * Walk the list checking lsn ordering, and that every entry has the - * XFS_LI_IN_AIL flag set. This is really expensive, so only do it - * when specifically debugging the transaction subsystem. - */ - prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); - list_for_each_entry(lip, &ailp->xa_ail, li_ail) { - if (&prev_lip->li_ail != &ailp->xa_ail) - ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); - ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); - prev_lip = lip; - } -#endif /* XFS_TRANS_DEBUG */ -} -#endif /* DEBUG */ diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 35162c238fa3..6b164e9e9a1f 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -65,16 +65,22 @@ struct xfs_ail_cursor { struct xfs_ail { struct xfs_mount *xa_mount; struct list_head xa_ail; - uint xa_gen; - struct task_struct *xa_task; xfs_lsn_t xa_target; struct xfs_ail_cursor xa_cursors; spinlock_t xa_lock; + struct delayed_work xa_work; + xfs_lsn_t xa_last_pushed_lsn; + unsigned long xa_flags; }; +#define XFS_AIL_PUSHING_BIT 0 + /* * From xfs_trans_ail.c */ + +extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ + void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, struct xfs_log_item **log_items, int nr_items, xfs_lsn_t lsn) __releases(ailp->xa_lock); @@ -98,12 +104,13 @@ xfs_trans_ail_delete( xfs_trans_ail_delete_bulk(ailp, &lip, 1); } -void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); +void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); +void xfs_ail_push_all(struct xfs_ail *); +xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); + void xfs_trans_unlocked_item(struct xfs_ail *, xfs_log_item_t *); -xfs_lsn_t xfs_trans_ail_tail(struct xfs_ail *ailp); - struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp, struct xfs_ail_cursor *cur, xfs_lsn_t lsn); @@ -112,11 +119,6 @@ struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp, void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, struct xfs_ail_cursor *cur); -long xfsaild_push(struct xfs_ail *, xfs_lsn_t *); -void xfsaild_wakeup(struct xfs_ail *, xfs_lsn_t); -int xfsaild_start(struct xfs_ail *); -void xfsaild_stop(struct xfs_ail *); - #if BITS_PER_LONG != 64 static inline void xfs_trans_ail_copy_lsn( |