summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDavid Woodhouse <dwmw2@infradead.org>2007-04-27 20:16:19 +0200
committerDavid Woodhouse <dwmw2@infradead.org>2007-04-27 20:16:19 +0200
commitd1da4e50e5d09f02c340927a4fcb7f54202fa033 (patch)
tree7f98317bdd45dbdb7644e9179891c5af6a3a8ef1 /fs
parent[MTD] [NAND] Wrong calculation of page number in nand_block_bad() (diff)
parentMerge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jmo... (diff)
downloadlinux-d1da4e50e5d09f02c340927a4fcb7f54202fa033.tar.xz
linux-d1da4e50e5d09f02c340927a4fcb7f54202fa033.zip
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts: drivers/mtd/Kconfig Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig13
-rw-r--r--fs/afs/Makefile7
-rw-r--r--fs/afs/afs.h146
-rw-r--r--fs/afs/afs_cm.h32
-rw-r--r--fs/afs/afs_fs.h48
-rw-r--r--fs/afs/afs_vl.h (renamed from fs/afs/vlclient.h)49
-rw-r--r--fs/afs/cache.c256
-rw-r--r--fs/afs/cache.h12
-rw-r--r--fs/afs/callback.c509
-rw-r--r--fs/afs/cell.c471
-rw-r--r--fs/afs/cell.h78
-rw-r--r--fs/afs/cmservice.c926
-rw-r--r--fs/afs/cmservice.h29
-rw-r--r--fs/afs/dir.c852
-rw-r--r--fs/afs/errors.h34
-rw-r--r--fs/afs/file.c124
-rw-r--r--fs/afs/fsclient.c1528
-rw-r--r--fs/afs/fsclient.h54
-rw-r--r--fs/afs/inode.c248
-rw-r--r--fs/afs/internal.h755
-rw-r--r--fs/afs/kafsasyncd.c255
-rw-r--r--fs/afs/kafsasyncd.h52
-rw-r--r--fs/afs/kafstimod.c205
-rw-r--r--fs/afs/kafstimod.h49
-rw-r--r--fs/afs/main.c262
-rw-r--r--fs/afs/misc.c38
-rw-r--r--fs/afs/mntpt.c141
-rw-r--r--fs/afs/mount.h23
-rw-r--r--fs/afs/proc.c230
-rw-r--r--fs/afs/rxrpc.c782
-rw-r--r--fs/afs/security.c356
-rw-r--r--fs/afs/server.c647
-rw-r--r--fs/afs/server.h102
-rw-r--r--fs/afs/super.c326
-rw-r--r--fs/afs/super.h45
-rw-r--r--fs/afs/transport.h21
-rw-r--r--fs/afs/types.h125
-rw-r--r--fs/afs/use-rtnetlink.c473
-rw-r--r--fs/afs/vlclient.c737
-rw-r--r--fs/afs/vlocation.c1225
-rw-r--r--fs/afs/vnode.c731
-rw-r--r--fs/afs/vnode.h94
-rw-r--r--fs/afs/volume.c290
-rw-r--r--fs/afs/volume.h140
-rw-r--r--fs/compat_ioctl.c18
-rw-r--r--fs/ecryptfs/netlink.c6
-rw-r--r--fs/jffs2/fs.c12
-rw-r--r--fs/jffs2/os-linux.h6
-rw-r--r--fs/jffs2/wbuf.c24
-rw-r--r--fs/ocfs2/alloc.c3037
-rw-r--r--fs/ocfs2/alloc.h27
-rw-r--r--fs/ocfs2/aops.c1011
-rw-r--r--fs/ocfs2/aops.h77
-rw-r--r--fs/ocfs2/cluster/quorum.c5
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h5
-rw-r--r--fs/ocfs2/dir.c15
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c5
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c2
-rw-r--r--fs/ocfs2/dlmglue.c143
-rw-r--r--fs/ocfs2/dlmglue.h3
-rw-r--r--fs/ocfs2/extent_map.c1233
-rw-r--r--fs/ocfs2/extent_map.h39
-rw-r--r--fs/ocfs2/file.c637
-rw-r--r--fs/ocfs2/file.h5
-rw-r--r--fs/ocfs2/inode.c199
-rw-r--r--fs/ocfs2/inode.h23
-rw-r--r--fs/ocfs2/journal.c24
-rw-r--r--fs/ocfs2/journal.h2
-rw-r--r--fs/ocfs2/mmap.c7
-rw-r--r--fs/ocfs2/namei.c23
-rw-r--r--fs/ocfs2/ocfs2.h55
-rw-r--r--fs/ocfs2/ocfs2_fs.h31
-rw-r--r--fs/ocfs2/ocfs2_lockid.h5
-rw-r--r--fs/ocfs2/slot_map.c2
-rw-r--r--fs/ocfs2/suballoc.c3
-rw-r--r--fs/ocfs2/super.c7
-rw-r--r--fs/ocfs2/vote.c289
-rw-r--r--fs/ocfs2/vote.h3
-rw-r--r--fs/sync.c8
79 files changed, 12190 insertions, 8321 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 3c4886b849f5..e33c08924572 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -2019,7 +2019,7 @@ config CODA_FS_OLD_API
config AFS_FS
tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
depends on INET && EXPERIMENTAL
- select RXRPC
+ select AF_RXRPC
help
If you say Y here, you will get an experimental Andrew File System
driver. It currently only supports unsecured read-only AFS access.
@@ -2028,8 +2028,15 @@ config AFS_FS
If unsure, say N.
-config RXRPC
- tristate
+config AFS_DEBUG
+ bool "AFS dynamic debugging"
+ depends on AFS_FS
+ help
+ Say Y here to make runtime controllable debugging messages appear.
+
+ See <file:Documentation/filesystems/afs.txt> for more information.
+
+ If unsure, say N.
config 9P_FS
tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 4029c9da4b86..01545eb1d872 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -2,8 +2,6 @@
# Makefile for Red Hat Linux AFS client.
#
-#CFLAGS += -finstrument-functions
-
kafs-objs := \
callback.o \
cell.o \
@@ -12,14 +10,15 @@ kafs-objs := \
file.o \
fsclient.o \
inode.o \
- kafsasyncd.o \
- kafstimod.o \
main.o \
misc.o \
mntpt.o \
proc.o \
+ rxrpc.o \
+ security.o \
server.o \
super.o \
+ use-rtnetlink.o \
vlclient.o \
vlocation.o \
vnode.o \
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
new file mode 100644
index 000000000000..52d0752265b8
--- /dev/null
+++ b/fs/afs/afs.h
@@ -0,0 +1,146 @@
+/* AFS common types
+ *
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef AFS_H
+#define AFS_H
+
+#include <linux/in.h>
+
+#define AFS_MAXCELLNAME 64 /* maximum length of a cell name */
+#define AFS_MAXVOLNAME 64 /* maximum length of a volume name */
+
+typedef unsigned afs_volid_t;
+typedef unsigned afs_vnodeid_t;
+typedef unsigned long long afs_dataversion_t;
+
+typedef enum {
+ AFSVL_RWVOL, /* read/write volume */
+ AFSVL_ROVOL, /* read-only volume */
+ AFSVL_BACKVOL, /* backup volume */
+} __attribute__((packed)) afs_voltype_t;
+
+typedef enum {
+ AFS_FTYPE_INVALID = 0,
+ AFS_FTYPE_FILE = 1,
+ AFS_FTYPE_DIR = 2,
+ AFS_FTYPE_SYMLINK = 3,
+} afs_file_type_t;
+
+/*
+ * AFS file identifier
+ */
+struct afs_fid {
+ afs_volid_t vid; /* volume ID */
+ afs_vnodeid_t vnode; /* file index within volume */
+ unsigned unique; /* unique ID number (file index version) */
+};
+
+/*
+ * AFS callback notification
+ */
+typedef enum {
+ AFSCM_CB_UNTYPED = 0, /* no type set on CB break */
+ AFSCM_CB_EXCLUSIVE = 1, /* CB exclusive to CM [not implemented] */
+ AFSCM_CB_SHARED = 2, /* CB shared by other CM's */
+ AFSCM_CB_DROPPED = 3, /* CB promise cancelled by file server */
+} afs_callback_type_t;
+
+struct afs_callback {
+ struct afs_fid fid; /* file identifier */
+ unsigned version; /* callback version */
+ unsigned expiry; /* time at which expires */
+ afs_callback_type_t type; /* type of callback */
+};
+
+#define AFSCBMAX 50 /* maximum callbacks transferred per bulk op */
+
+/*
+ * AFS volume information
+ */
+struct afs_volume_info {
+ afs_volid_t vid; /* volume ID */
+ afs_voltype_t type; /* type of this volume */
+ afs_volid_t type_vids[5]; /* volume ID's for possible types for this vol */
+
+ /* list of fileservers serving this volume */
+ size_t nservers; /* number of entries used in servers[] */
+ struct {
+ struct in_addr addr; /* fileserver address */
+ } servers[8];
+};
+
+/*
+ * AFS security ACE access mask
+ */
+typedef u32 afs_access_t;
+#define AFS_ACE_READ 0x00000001U /* - permission to read a file/dir */
+#define AFS_ACE_WRITE 0x00000002U /* - permission to write/chmod a file */
+#define AFS_ACE_INSERT 0x00000004U /* - permission to create dirent in a dir */
+#define AFS_ACE_LOOKUP 0x00000008U /* - permission to lookup a file/dir in a dir */
+#define AFS_ACE_DELETE 0x00000010U /* - permission to delete a dirent from a dir */
+#define AFS_ACE_LOCK 0x00000020U /* - permission to lock a file */
+#define AFS_ACE_ADMINISTER 0x00000040U /* - permission to change ACL */
+#define AFS_ACE_USER_A 0x01000000U /* - 'A' user-defined permission */
+#define AFS_ACE_USER_B 0x02000000U /* - 'B' user-defined permission */
+#define AFS_ACE_USER_C 0x04000000U /* - 'C' user-defined permission */
+#define AFS_ACE_USER_D 0x08000000U /* - 'D' user-defined permission */
+#define AFS_ACE_USER_E 0x10000000U /* - 'E' user-defined permission */
+#define AFS_ACE_USER_F 0x20000000U /* - 'F' user-defined permission */
+#define AFS_ACE_USER_G 0x40000000U /* - 'G' user-defined permission */
+#define AFS_ACE_USER_H 0x80000000U /* - 'H' user-defined permission */
+
+/*
+ * AFS file status information
+ */
+struct afs_file_status {
+ unsigned if_version; /* interface version */
+#define AFS_FSTATUS_VERSION 1
+
+ afs_file_type_t type; /* file type */
+ unsigned nlink; /* link count */
+ u64 size; /* file size */
+ afs_dataversion_t data_version; /* current data version */
+ u32 author; /* author ID */
+ u32 owner; /* owner ID */
+ u32 group; /* group ID */
+ afs_access_t caller_access; /* access rights for authenticated caller */
+ afs_access_t anon_access; /* access rights for unauthenticated caller */
+ umode_t mode; /* UNIX mode */
+ struct afs_fid parent; /* parent dir ID for non-dirs only */
+ time_t mtime_client; /* last time client changed data */
+ time_t mtime_server; /* last time server changed data */
+};
+
+/*
+ * AFS file status change request
+ */
+struct afs_store_status {
+ u32 mask; /* which bits of the struct are set */
+ u32 mtime_client; /* last time client changed data */
+ u32 owner; /* owner ID */
+ u32 group; /* group ID */
+ umode_t mode; /* UNIX mode */
+};
+
+#define AFS_SET_MTIME 0x01 /* set the mtime */
+#define AFS_SET_OWNER 0x02 /* set the owner ID */
+#define AFS_SET_GROUP 0x04 /* set the group ID (unsupported?) */
+#define AFS_SET_MODE 0x08 /* set the UNIX mode */
+#define AFS_SET_SEG_SIZE 0x10 /* set the segment size (unsupported) */
+
+/*
+ * AFS volume synchronisation information
+ */
+struct afs_volsync {
+ time_t creation; /* volume creation time */
+};
+
+#endif /* AFS_H */
diff --git a/fs/afs/afs_cm.h b/fs/afs/afs_cm.h
new file mode 100644
index 000000000000..7b4d4fab4c80
--- /dev/null
+++ b/fs/afs/afs_cm.h
@@ -0,0 +1,32 @@
+/* AFS Cache Manager definitions
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef AFS_CM_H
+#define AFS_CM_H
+
+#define AFS_CM_PORT 7001 /* AFS file server port */
+#define CM_SERVICE 1 /* AFS File Service ID */
+
+enum AFS_CM_Operations {
+ CBCallBack = 204, /* break callback promises */
+ CBInitCallBackState = 205, /* initialise callback state */
+ CBProbe = 206, /* probe client */
+ CBGetLock = 207, /* get contents of CM lock table */
+ CBGetCE = 208, /* get cache file description */
+ CBGetXStatsVersion = 209, /* get version of extended statistics */
+ CBGetXStats = 210, /* get contents of extended statistics data */
+ CBInitCallBackState3 = 213, /* initialise callback state, version 3 */
+ CBGetCapabilities = 65538, /* get client capabilities */
+};
+
+#define AFS_CAP_ERROR_TRANSLATION 0x1
+
+#endif /* AFS_FS_H */
diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h
new file mode 100644
index 000000000000..89e0d1650a72
--- /dev/null
+++ b/fs/afs/afs_fs.h
@@ -0,0 +1,48 @@
+/* AFS File Service definitions
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef AFS_FS_H
+#define AFS_FS_H
+
+#define AFS_FS_PORT 7000 /* AFS file server port */
+#define FS_SERVICE 1 /* AFS File Service ID */
+
+enum AFS_FS_Operations {
+ FSFETCHDATA = 130, /* AFS Fetch file data */
+ FSFETCHSTATUS = 132, /* AFS Fetch file status */
+ FSREMOVEFILE = 136, /* AFS Remove a file */
+ FSCREATEFILE = 137, /* AFS Create a file */
+ FSRENAME = 138, /* AFS Rename or move a file or directory */
+ FSSYMLINK = 139, /* AFS Create a symbolic link */
+ FSLINK = 140, /* AFS Create a hard link */
+ FSMAKEDIR = 141, /* AFS Create a directory */
+ FSREMOVEDIR = 142, /* AFS Remove a directory */
+ FSGIVEUPCALLBACKS = 147, /* AFS Discard callback promises */
+ FSGETVOLUMEINFO = 148, /* AFS Get root volume information */
+ FSGETROOTVOLUME = 151, /* AFS Get root volume name */
+ FSLOOKUP = 161, /* AFS lookup file in directory */
+};
+
+enum AFS_FS_Errors {
+ VSALVAGE = 101, /* volume needs salvaging */
+ VNOVNODE = 102, /* no such file/dir (vnode) */
+ VNOVOL = 103, /* no such volume or volume unavailable */
+ VVOLEXISTS = 104, /* volume name already exists */
+ VNOSERVICE = 105, /* volume not currently in service */
+ VOFFLINE = 106, /* volume is currently offline (more info available [VVL-spec]) */
+ VONLINE = 107, /* volume is already online */
+ VDISKFULL = 108, /* disk partition is full */
+ VOVERQUOTA = 109, /* volume's maximum quota exceeded */
+ VBUSY = 110, /* volume is temporarily unavailable */
+ VMOVED = 111, /* volume moved to new server - ask this FS where */
+};
+
+#endif /* AFS_FS_H */
diff --git a/fs/afs/vlclient.h b/fs/afs/afs_vl.h
index e3d601179c46..8bbefe009ed4 100644
--- a/fs/afs/vlclient.h
+++ b/fs/afs/afs_vl.h
@@ -1,6 +1,6 @@
-/* vlclient.h: Volume Location Service client interface
+/* AFS Volume Location Service client interface
*
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
@@ -9,10 +9,19 @@
* 2 of the License, or (at your option) any later version.
*/
-#ifndef _LINUX_AFS_VLCLIENT_H
-#define _LINUX_AFS_VLCLIENT_H
+#ifndef AFS_VL_H
+#define AFS_VL_H
-#include "types.h"
+#include "afs.h"
+
+#define AFS_VL_PORT 7003 /* volume location service port */
+#define VL_SERVICE 52 /* RxRPC service ID for the Volume Location service */
+
+enum AFSVL_Operations {
+ VLGETENTRYBYID = 503, /* AFS Get Cache Entry By ID operation ID */
+ VLGETENTRYBYNAME = 504, /* AFS Get Cache Entry By Name operation ID */
+ VLPROBE = 514, /* AFS Probe Volume Location Service operation ID */
+};
enum AFSVL_Errors {
AFSVL_IDEXIST = 363520, /* Volume Id entry exists in vl database */
@@ -40,14 +49,16 @@ enum AFSVL_Errors {
AFSVL_BADVOLOPER = 363542, /* Bad volume operation code */
AFSVL_BADRELLOCKTYPE = 363543, /* Bad release lock type */
AFSVL_RERELEASE = 363544, /* Status report: last release was aborted */
- AFSVL_BADSERVERFLAG = 363545, /* Invalid replication site server °ag */
+ AFSVL_BADSERVERFLAG = 363545, /* Invalid replication site server °ag */
AFSVL_PERM = 363546, /* No permission access */
AFSVL_NOMEM = 363547, /* malloc/realloc failed to alloc enough memory */
};
-/* maps to "struct vldbentry" in vvl-spec.pdf */
+/*
+ * maps to "struct vldbentry" in vvl-spec.pdf
+ */
struct afs_vldbentry {
- char name[65]; /* name of volume (including NUL char) */
+ char name[65]; /* name of volume (with NUL char) */
afs_voltype_t type; /* volume type */
unsigned num_servers; /* num servers that hold instances of this vol */
unsigned clone_id; /* cloning ID */
@@ -68,26 +79,6 @@ struct afs_vldbentry {
#define AFS_VLSF_RWVOL 0x0004 /* this server holds a R/W instance of the volume */
#define AFS_VLSF_BACKVOL 0x0008 /* this server holds a backup instance of the volume */
} servers[8];
-
};
-/* look up a volume location database entry by name */
-extern int afs_rxvl_get_entry_by_name(struct afs_server *server,
- const char *volname,
- unsigned volnamesz,
- struct afs_cache_vlocation *entry);
-
-/* look up a volume location database entry by ID */
-extern int afs_rxvl_get_entry_by_id(struct afs_server *server,
- afs_volid_t volid,
- afs_voltype_t voltype,
- struct afs_cache_vlocation *entry);
-
-extern int afs_rxvl_get_entry_by_id_async(struct afs_async_op *op,
- afs_volid_t volid,
- afs_voltype_t voltype);
-
-extern int afs_rxvl_get_entry_by_id_async2(struct afs_async_op *op,
- struct afs_cache_vlocation *entry);
-
-#endif /* _LINUX_AFS_VLCLIENT_H */
+#endif /* AFS_VL_H */
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
new file mode 100644
index 000000000000..de0d7de69edc
--- /dev/null
+++ b/fs/afs/cache.c
@@ -0,0 +1,256 @@
+/* AFS caching stuff
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_cell_cache_match(void *target,
+ const void *entry);
+static void afs_cell_cache_update(void *source, void *entry);
+
+struct cachefs_index_def afs_cache_cell_index_def = {
+ .name = "cell_ix",
+ .data_size = sizeof(struct afs_cache_cell),
+ .keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
+ .match = afs_cell_cache_match,
+ .update = afs_cell_cache_update,
+};
+#endif
+
+/*
+ * match a cell record obtained from the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_cell_cache_match(void *target,
+ const void *entry)
+{
+ const struct afs_cache_cell *ccell = entry;
+ struct afs_cell *cell = target;
+
+ _enter("{%s},{%s}", ccell->name, cell->name);
+
+ if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
+ _leave(" = SUCCESS");
+ return CACHEFS_MATCH_SUCCESS;
+ }
+
+ _leave(" = FAILED");
+ return CACHEFS_MATCH_FAILED;
+}
+#endif
+
+/*
+ * update a cell record in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_cell_cache_update(void *source, void *entry)
+{
+ struct afs_cache_cell *ccell = entry;
+ struct afs_cell *cell = source;
+
+ _enter("%p,%p", source, entry);
+
+ strncpy(ccell->name, cell->name, sizeof(ccell->name));
+
+ memcpy(ccell->vl_servers,
+ cell->vl_addrs,
+ min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
+
+}
+#endif
+
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vlocation_cache_match(void *target,
+ const void *entry);
+static void afs_vlocation_cache_update(void *source, void *entry);
+
+struct cachefs_index_def afs_vlocation_cache_index_def = {
+ .name = "vldb",
+ .data_size = sizeof(struct afs_cache_vlocation),
+ .keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
+ .match = afs_vlocation_cache_match,
+ .update = afs_vlocation_cache_update,
+};
+#endif
+
+/*
+ * match a VLDB record stored in the cache
+ * - may also load target from entry
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vlocation_cache_match(void *target,
+ const void *entry)
+{
+ const struct afs_cache_vlocation *vldb = entry;
+ struct afs_vlocation *vlocation = target;
+
+ _enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
+
+ if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
+ ) {
+ if (!vlocation->valid ||
+ vlocation->vldb.rtime == vldb->rtime
+ ) {
+ vlocation->vldb = *vldb;
+ vlocation->valid = 1;
+ _leave(" = SUCCESS [c->m]");
+ return CACHEFS_MATCH_SUCCESS;
+ } else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
+ /* delete if VIDs for this name differ */
+ if (memcmp(&vlocation->vldb.vid,
+ &vldb->vid,
+ sizeof(vldb->vid)) != 0) {
+ _leave(" = DELETE");
+ return CACHEFS_MATCH_SUCCESS_DELETE;
+ }
+
+ _leave(" = UPDATE");
+ return CACHEFS_MATCH_SUCCESS_UPDATE;
+ } else {
+ _leave(" = SUCCESS");
+ return CACHEFS_MATCH_SUCCESS;
+ }
+ }
+
+ _leave(" = FAILED");
+ return CACHEFS_MATCH_FAILED;
+}
+#endif
+
+/*
+ * update a VLDB record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_vlocation_cache_update(void *source, void *entry)
+{
+ struct afs_cache_vlocation *vldb = entry;
+ struct afs_vlocation *vlocation = source;
+
+ _enter("");
+
+ *vldb = vlocation->vldb;
+}
+#endif
+
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_volume_cache_match(void *target,
+ const void *entry);
+static void afs_volume_cache_update(void *source, void *entry);
+
+struct cachefs_index_def afs_volume_cache_index_def = {
+ .name = "volume",
+ .data_size = sizeof(struct afs_cache_vhash),
+ .keys[0] = { CACHEFS_INDEX_KEYS_BIN, 1 },
+ .keys[1] = { CACHEFS_INDEX_KEYS_BIN, 1 },
+ .match = afs_volume_cache_match,
+ .update = afs_volume_cache_update,
+};
+#endif
+
+/*
+ * match a volume hash record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_volume_cache_match(void *target,
+ const void *entry)
+{
+ const struct afs_cache_vhash *vhash = entry;
+ struct afs_volume *volume = target;
+
+ _enter("{%u},{%u}", volume->type, vhash->vtype);
+
+ if (volume->type == vhash->vtype) {
+ _leave(" = SUCCESS");
+ return CACHEFS_MATCH_SUCCESS;
+ }
+
+ _leave(" = FAILED");
+ return CACHEFS_MATCH_FAILED;
+}
+#endif
+
+/*
+ * update a volume hash record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_volume_cache_update(void *source, void *entry)
+{
+ struct afs_cache_vhash *vhash = entry;
+ struct afs_volume *volume = source;
+
+ _enter("");
+
+ vhash->vtype = volume->type;
+}
+#endif
+
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vnode_cache_match(void *target,
+ const void *entry);
+static void afs_vnode_cache_update(void *source, void *entry);
+
+struct cachefs_index_def afs_vnode_cache_index_def = {
+ .name = "vnode",
+ .data_size = sizeof(struct afs_cache_vnode),
+ .keys[0] = { CACHEFS_INDEX_KEYS_BIN, 4 },
+ .match = afs_vnode_cache_match,
+ .update = afs_vnode_cache_update,
+};
+#endif
+
+/*
+ * match a vnode record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vnode_cache_match(void *target,
+ const void *entry)
+{
+ const struct afs_cache_vnode *cvnode = entry;
+ struct afs_vnode *vnode = target;
+
+ _enter("{%x,%x,%Lx},{%x,%x,%Lx}",
+ vnode->fid.vnode,
+ vnode->fid.unique,
+ vnode->status.version,
+ cvnode->vnode_id,
+ cvnode->vnode_unique,
+ cvnode->data_version);
+
+ if (vnode->fid.vnode != cvnode->vnode_id) {
+ _leave(" = FAILED");
+ return CACHEFS_MATCH_FAILED;
+ }
+
+ if (vnode->fid.unique != cvnode->vnode_unique ||
+ vnode->status.version != cvnode->data_version) {
+ _leave(" = DELETE");
+ return CACHEFS_MATCH_SUCCESS_DELETE;
+ }
+
+ _leave(" = SUCCESS");
+ return CACHEFS_MATCH_SUCCESS;
+}
+#endif
+
+/*
+ * update a vnode record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_vnode_cache_update(void *source, void *entry)
+{
+ struct afs_cache_vnode *cvnode = entry;
+ struct afs_vnode *vnode = source;
+
+ _enter("");
+
+ cvnode->vnode_id = vnode->fid.vnode;
+ cvnode->vnode_unique = vnode->fid.unique;
+ cvnode->data_version = vnode->status.version;
+}
+#endif
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
index 9eb7722b34d5..36a3642cf90e 100644
--- a/fs/afs/cache.h
+++ b/fs/afs/cache.h
@@ -1,4 +1,4 @@
-/* cache.h: AFS local cache management interface
+/* AFS local cache management interface
*
* Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -9,8 +9,8 @@
* 2 of the License, or (at your option) any later version.
*/
-#ifndef _LINUX_AFS_CACHE_H
-#define _LINUX_AFS_CACHE_H
+#ifndef AFS_CACHE_H
+#define AFS_CACHE_H
#undef AFS_CACHING_SUPPORT
@@ -20,8 +20,4 @@
#endif
#include "types.h"
-#ifdef __KERNEL__
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_AFS_CACHE_H */
+#endif /* AFS_CACHE_H */
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 9cb206e9d4be..639399f0ab6f 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2002 Red Hat, Inc. All rights reserved.
+ * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
*
* This software may be freely redistributed under the terms of the
* GNU General Public License.
@@ -16,85 +16,187 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
-#include "server.h"
-#include "vnode.h"
+#include <linux/circ_buf.h>
#include "internal.h"
-#include "cmservice.h"
-/*****************************************************************************/
+unsigned afs_vnode_update_timeout = 10;
+
+#define afs_breakring_space(server) \
+ CIRC_SPACE((server)->cb_break_head, (server)->cb_break_tail, \
+ ARRAY_SIZE((server)->cb_break))
+
+//static void afs_callback_updater(struct work_struct *);
+
+static struct workqueue_struct *afs_callback_update_worker;
+
/*
* allow the fileserver to request callback state (re-)initialisation
*/
-int SRXAFSCM_InitCallBackState(struct afs_server *server)
+void afs_init_callback_state(struct afs_server *server)
{
- struct list_head callbacks;
+ struct afs_vnode *vnode;
- _enter("%p", server);
+ _enter("{%p}", server);
- INIT_LIST_HEAD(&callbacks);
-
- /* transfer the callback list from the server to a temp holding area */
spin_lock(&server->cb_lock);
- list_add(&callbacks, &server->cb_promises);
- list_del_init(&server->cb_promises);
+ /* kill all the promises on record from this server */
+ while (!RB_EMPTY_ROOT(&server->cb_promises)) {
+ vnode = rb_entry(server->cb_promises.rb_node,
+ struct afs_vnode, cb_promise);
+ _debug("UNPROMISE { vid=%x vn=%u uq=%u}",
+ vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+ rb_erase(&vnode->cb_promise, &server->cb_promises);
+ vnode->cb_promised = false;
+ }
- /* munch our way through the list, grabbing the inode, dropping all the
- * locks and regetting them in the right order
- */
- while (!list_empty(&callbacks)) {
- struct afs_vnode *vnode;
- struct inode *inode;
+ spin_unlock(&server->cb_lock);
+ _leave("");
+}
- vnode = list_entry(callbacks.next, struct afs_vnode, cb_link);
- list_del_init(&vnode->cb_link);
+/*
+ * handle the data invalidation side of a callback being broken
+ */
+void afs_broken_callback_work(struct work_struct *work)
+{
+ struct afs_vnode *vnode =
+ container_of(work, struct afs_vnode, cb_broken_work);
- /* try and grab the inode - may fail */
- inode = igrab(AFS_VNODE_TO_I(vnode));
- if (inode) {
- int release = 0;
+ _enter("");
- spin_unlock(&server->cb_lock);
- spin_lock(&vnode->lock);
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+ return;
- if (vnode->cb_server == server) {
- vnode->cb_server = NULL;
- afs_kafstimod_del_timer(&vnode->cb_timeout);
- spin_lock(&afs_cb_hash_lock);
- list_del_init(&vnode->cb_hash_link);
- spin_unlock(&afs_cb_hash_lock);
- release = 1;
- }
+ /* we're only interested in dealing with a broken callback on *this*
+ * vnode and only if no-one else has dealt with it yet */
+ if (!mutex_trylock(&vnode->validate_lock))
+ return; /* someone else is dealing with it */
- spin_unlock(&vnode->lock);
+ if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+ if (S_ISDIR(vnode->vfs_inode.i_mode))
+ afs_clear_permits(vnode);
- iput(inode);
- afs_put_server(server);
+ if (afs_vnode_fetch_status(vnode, NULL, NULL) < 0)
+ goto out;
- spin_lock(&server->cb_lock);
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+ goto out;
+
+ /* if the vnode's data version number changed then its contents
+ * are different */
+ if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+ _debug("zap data {%x:%u}",
+ vnode->fid.vid, vnode->fid.vnode);
+ invalidate_remote_inode(&vnode->vfs_inode);
}
}
- spin_unlock(&server->cb_lock);
+out:
+ mutex_unlock(&vnode->validate_lock);
- _leave(" = 0");
- return 0;
-} /* end SRXAFSCM_InitCallBackState() */
+ /* avoid the potential race whereby the mutex_trylock() in this
+ * function happens again between the clear_bit() and the
+ * mutex_unlock() */
+ if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+ _debug("requeue");
+ queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
+ }
+ _leave("");
+}
+
+/*
+ * actually break a callback
+ */
+static void afs_break_callback(struct afs_server *server,
+ struct afs_vnode *vnode)
+{
+ _enter("");
+
+ set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+
+ if (vnode->cb_promised) {
+ spin_lock(&vnode->lock);
+
+ _debug("break callback");
+
+ spin_lock(&server->cb_lock);
+ if (vnode->cb_promised) {
+ rb_erase(&vnode->cb_promise, &server->cb_promises);
+ vnode->cb_promised = false;
+ }
+ spin_unlock(&server->cb_lock);
+
+ queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
+ spin_unlock(&vnode->lock);
+ }
+}
+
+/*
+ * allow the fileserver to explicitly break one callback
+ * - happens when
+ * - the backing file is changed
+ * - a lock is released
+ */
+static void afs_break_one_callback(struct afs_server *server,
+ struct afs_fid *fid)
+{
+ struct afs_vnode *vnode;
+ struct rb_node *p;
+
+ _debug("find");
+ spin_lock(&server->fs_lock);
+ p = server->fs_vnodes.rb_node;
+ while (p) {
+ vnode = rb_entry(p, struct afs_vnode, server_rb);
+ if (fid->vid < vnode->fid.vid)
+ p = p->rb_left;
+ else if (fid->vid > vnode->fid.vid)
+ p = p->rb_right;
+ else if (fid->vnode < vnode->fid.vnode)
+ p = p->rb_left;
+ else if (fid->vnode > vnode->fid.vnode)
+ p = p->rb_right;
+ else if (fid->unique < vnode->fid.unique)
+ p = p->rb_left;
+ else if (fid->unique > vnode->fid.unique)
+ p = p->rb_right;
+ else
+ goto found;
+ }
+
+ /* not found so we just ignore it (it may have moved to another
+ * server) */
+not_available:
+ _debug("not avail");
+ spin_unlock(&server->fs_lock);
+ _leave("");
+ return;
+
+found:
+ _debug("found");
+ ASSERTCMP(server, ==, vnode->server);
+
+ if (!igrab(AFS_VNODE_TO_I(vnode)))
+ goto not_available;
+ spin_unlock(&server->fs_lock);
+
+ afs_break_callback(server, vnode);
+ iput(&vnode->vfs_inode);
+ _leave("");
+}
-/*****************************************************************************/
/*
* allow the fileserver to break callback promises
*/
-int SRXAFSCM_CallBack(struct afs_server *server, size_t count,
- struct afs_callback callbacks[])
+void afs_break_callbacks(struct afs_server *server, size_t count,
+ struct afs_callback callbacks[])
{
- _enter("%p,%u,", server, count);
+ _enter("%p,%zu,", server, count);
- for (; count > 0; callbacks++, count--) {
- struct afs_vnode *vnode = NULL;
- struct inode *inode = NULL;
- int valid = 0;
+ ASSERT(server != NULL);
+ ASSERTCMP(count, <=, AFSCBMAX);
+ for (; count > 0; callbacks++, count--) {
_debug("- Fid { vl=%08x n=%u u=%u } CB { v=%u x=%u t=%u }",
callbacks->fid.vid,
callbacks->fid.vnode,
@@ -103,67 +205,270 @@ int SRXAFSCM_CallBack(struct afs_server *server, size_t count,
callbacks->expiry,
callbacks->type
);
+ afs_break_one_callback(server, &callbacks->fid);
+ }
- /* find the inode for this fid */
- spin_lock(&afs_cb_hash_lock);
+ _leave("");
+ return;
+}
- list_for_each_entry(vnode,
- &afs_cb_hash(server, &callbacks->fid),
- cb_hash_link) {
- if (memcmp(&vnode->fid, &callbacks->fid,
- sizeof(struct afs_fid)) != 0)
- continue;
+/*
+ * record the callback for breaking
+ * - the caller must hold server->cb_lock
+ */
+static void afs_do_give_up_callback(struct afs_server *server,
+ struct afs_vnode *vnode)
+{
+ struct afs_callback *cb;
- /* right vnode, but is it same server? */
- if (vnode->cb_server != server)
- break; /* no */
+ _enter("%p,%p", server, vnode);
- /* try and nail the inode down */
- inode = igrab(AFS_VNODE_TO_I(vnode));
- break;
+ cb = &server->cb_break[server->cb_break_head];
+ cb->fid = vnode->fid;
+ cb->version = vnode->cb_version;
+ cb->expiry = vnode->cb_expiry;
+ cb->type = vnode->cb_type;
+ smp_wmb();
+ server->cb_break_head =
+ (server->cb_break_head + 1) &
+ (ARRAY_SIZE(server->cb_break) - 1);
+
+ /* defer the breaking of callbacks to try and collect as many as
+ * possible to ship in one operation */
+ switch (atomic_inc_return(&server->cb_break_n)) {
+ case 1 ... AFSCBMAX - 1:
+ queue_delayed_work(afs_callback_update_worker,
+ &server->cb_break_work, HZ * 2);
+ break;
+ case AFSCBMAX:
+ afs_flush_callback_breaks(server);
+ break;
+ default:
+ break;
+ }
+
+ ASSERT(server->cb_promises.rb_node != NULL);
+ rb_erase(&vnode->cb_promise, &server->cb_promises);
+ vnode->cb_promised = false;
+ _leave("");
+}
+
+/*
+ * discard the callback on a deleted item
+ */
+void afs_discard_callback_on_delete(struct afs_vnode *vnode)
+{
+ struct afs_server *server = vnode->server;
+
+ _enter("%d", vnode->cb_promised);
+
+ if (!vnode->cb_promised) {
+ _leave(" [not promised]");
+ return;
+ }
+
+ ASSERT(server != NULL);
+
+ spin_lock(&server->cb_lock);
+ if (vnode->cb_promised) {
+ ASSERT(server->cb_promises.rb_node != NULL);
+ rb_erase(&vnode->cb_promise, &server->cb_promises);
+ vnode->cb_promised = false;
+ }
+ spin_unlock(&server->cb_lock);
+ _leave("");
+}
+
+/*
+ * give up the callback registered for a vnode on the file server when the
+ * inode is being cleared
+ */
+void afs_give_up_callback(struct afs_vnode *vnode)
+{
+ struct afs_server *server = vnode->server;
+
+ DECLARE_WAITQUEUE(myself, current);
+
+ _enter("%d", vnode->cb_promised);
+
+ _debug("GIVE UP INODE %p", &vnode->vfs_inode);
+
+ if (!vnode->cb_promised) {
+ _leave(" [not promised]");
+ return;
+ }
+
+ ASSERT(server != NULL);
+
+ spin_lock(&server->cb_lock);
+ if (vnode->cb_promised && afs_breakring_space(server) == 0) {
+ add_wait_queue(&server->cb_break_waitq, &myself);
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!vnode->cb_promised ||
+ afs_breakring_space(server) != 0)
+ break;
+ spin_unlock(&server->cb_lock);
+ schedule();
+ spin_lock(&server->cb_lock);
}
+ remove_wait_queue(&server->cb_break_waitq, &myself);
+ __set_current_state(TASK_RUNNING);
+ }
+
+ /* of course, it's always possible for the server to break this vnode's
+ * callback first... */
+ if (vnode->cb_promised)
+ afs_do_give_up_callback(server, vnode);
+
+ spin_unlock(&server->cb_lock);
+ _leave("");
+}
+
+/*
+ * dispatch a deferred give up callbacks operation
+ */
+void afs_dispatch_give_up_callbacks(struct work_struct *work)
+{
+ struct afs_server *server =
+ container_of(work, struct afs_server, cb_break_work.work);
+
+ _enter("");
+
+ /* tell the fileserver to discard the callback promises it has
+ * - in the event of ENOMEM or some other error, we just forget that we
+ * had callbacks entirely, and the server will call us later to break
+ * them
+ */
+ afs_fs_give_up_callbacks(server, &afs_async_call);
+}
+
+/*
+ * flush the outstanding callback breaks on a server
+ */
+void afs_flush_callback_breaks(struct afs_server *server)
+{
+ cancel_delayed_work(&server->cb_break_work);
+ queue_delayed_work(afs_callback_update_worker,
+ &server->cb_break_work, 0);
+}
- spin_unlock(&afs_cb_hash_lock);
-
- if (inode) {
- /* we've found the record for this vnode */
- spin_lock(&vnode->lock);
- if (vnode->cb_server == server) {
- /* the callback _is_ on the calling server */
- vnode->cb_server = NULL;
- valid = 1;
-
- afs_kafstimod_del_timer(&vnode->cb_timeout);
- vnode->flags |= AFS_VNODE_CHANGED;
-
- spin_lock(&server->cb_lock);
- list_del_init(&vnode->cb_link);
- spin_unlock(&server->cb_lock);
-
- spin_lock(&afs_cb_hash_lock);
- list_del_init(&vnode->cb_hash_link);
- spin_unlock(&afs_cb_hash_lock);
- }
- spin_unlock(&vnode->lock);
-
- if (valid) {
- invalidate_remote_inode(inode);
- afs_put_server(server);
- }
- iput(inode);
+#if 0
+/*
+ * update a bunch of callbacks
+ */
+static void afs_callback_updater(struct work_struct *work)
+{
+ struct afs_server *server;
+ struct afs_vnode *vnode, *xvnode;
+ time_t now;
+ long timeout;
+ int ret;
+
+ server = container_of(work, struct afs_server, updater);
+
+ _enter("");
+
+ now = get_seconds();
+
+ /* find the first vnode to update */
+ spin_lock(&server->cb_lock);
+ for (;;) {
+ if (RB_EMPTY_ROOT(&server->cb_promises)) {
+ spin_unlock(&server->cb_lock);
+ _leave(" [nothing]");
+ return;
}
+
+ vnode = rb_entry(rb_first(&server->cb_promises),
+ struct afs_vnode, cb_promise);
+ if (atomic_read(&vnode->usage) > 0)
+ break;
+ rb_erase(&vnode->cb_promise, &server->cb_promises);
+ vnode->cb_promised = false;
}
- _leave(" = 0");
- return 0;
-} /* end SRXAFSCM_CallBack() */
+ timeout = vnode->update_at - now;
+ if (timeout > 0) {
+ queue_delayed_work(afs_vnode_update_worker,
+ &afs_vnode_update, timeout * HZ);
+ spin_unlock(&server->cb_lock);
+ _leave(" [nothing]");
+ return;
+ }
+
+ list_del_init(&vnode->update);
+ atomic_inc(&vnode->usage);
+ spin_unlock(&server->cb_lock);
+
+ /* we can now perform the update */
+ _debug("update %s", vnode->vldb.name);
+ vnode->state = AFS_VL_UPDATING;
+ vnode->upd_rej_cnt = 0;
+ vnode->upd_busy_cnt = 0;
+
+ ret = afs_vnode_update_record(vl, &vldb);
+ switch (ret) {
+ case 0:
+ afs_vnode_apply_update(vl, &vldb);
+ vnode->state = AFS_VL_UPDATING;
+ break;
+ case -ENOMEDIUM:
+ vnode->state = AFS_VL_VOLUME_DELETED;
+ break;
+ default:
+ vnode->state = AFS_VL_UNCERTAIN;
+ break;
+ }
+
+ /* and then reschedule */
+ _debug("reschedule");
+ vnode->update_at = get_seconds() + afs_vnode_update_timeout;
+
+ spin_lock(&server->cb_lock);
+
+ if (!list_empty(&server->cb_promises)) {
+ /* next update in 10 minutes, but wait at least 1 second more
+ * than the newest record already queued so that we don't spam
+ * the VL server suddenly with lots of requests
+ */
+ xvnode = list_entry(server->cb_promises.prev,
+ struct afs_vnode, update);
+ if (vnode->update_at <= xvnode->update_at)
+ vnode->update_at = xvnode->update_at + 1;
+ xvnode = list_entry(server->cb_promises.next,
+ struct afs_vnode, update);
+ timeout = xvnode->update_at - now;
+ if (timeout < 0)
+ timeout = 0;
+ } else {
+ timeout = afs_vnode_update_timeout;
+ }
+
+ list_add_tail(&vnode->update, &server->cb_promises);
+
+ _debug("timeout %ld", timeout);
+ queue_delayed_work(afs_vnode_update_worker,
+ &afs_vnode_update, timeout * HZ);
+ spin_unlock(&server->cb_lock);
+ afs_put_vnode(vl);
+}
+#endif
+
+/*
+ * initialise the callback update process
+ */
+int __init afs_callback_update_init(void)
+{
+ afs_callback_update_worker =
+ create_singlethread_workqueue("kafs_callbackd");
+ return afs_callback_update_worker ? 0 : -ENOMEM;
+}
-/*****************************************************************************/
/*
- * allow the fileserver to see if the cache manager is still alive
+ * shut down the callback update process
*/
-int SRXAFSCM_Probe(struct afs_server *server)
+void __exit afs_callback_update_kill(void)
{
- _debug("SRXAFSCM_Probe(%p)\n", server);
- return 0;
-} /* end SRXAFSCM_Probe() */
+ destroy_workqueue(afs_callback_update_worker);
+}
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 1fc578372759..9b1311a1df51 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -1,4 +1,4 @@
-/* cell.c: AFS cell and server record management
+/* AFS cell and server record management
*
* Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -11,15 +11,9 @@
#include <linux/module.h>
#include <linux/slab.h>
-#include <rxrpc/peer.h>
-#include <rxrpc/connection.h>
-#include "volume.h"
-#include "cell.h"
-#include "server.h"
-#include "transport.h"
-#include "vlclient.h"
-#include "kafstimod.h"
-#include "super.h"
+#include <linux/key.h>
+#include <linux/ctype.h>
+#include <keys/rxrpc-type.h>
#include "internal.h"
DECLARE_RWSEM(afs_proc_cells_sem);
@@ -28,66 +22,47 @@ LIST_HEAD(afs_proc_cells);
static struct list_head afs_cells = LIST_HEAD_INIT(afs_cells);
static DEFINE_RWLOCK(afs_cells_lock);
static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */
+static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq);
static struct afs_cell *afs_cell_root;
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
- const void *entry);
-static void afs_cell_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_cache_cell_index_def = {
- .name = "cell_ix",
- .data_size = sizeof(struct afs_cache_cell),
- .keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
- .match = afs_cell_cache_match,
- .update = afs_cell_cache_update,
-};
-#endif
-
-/*****************************************************************************/
/*
- * create a cell record
- * - "name" is the name of the cell
- * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format
+ * allocate a cell record and fill in its name, VL server address list and
+ * allocate an anonymous key
*/
-int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell)
+static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
{
struct afs_cell *cell;
- char *next;
+ size_t namelen;
+ char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next;
int ret;
- _enter("%s", name);
+ _enter("%s,%s", name, vllist);
BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */
+ namelen = strlen(name);
+ if (namelen > AFS_MAXCELLNAME)
+ return ERR_PTR(-ENAMETOOLONG);
+
/* allocate and initialise a cell record */
- cell = kmalloc(sizeof(struct afs_cell) + strlen(name) + 1, GFP_KERNEL);
+ cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL);
if (!cell) {
_leave(" = -ENOMEM");
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
- down_write(&afs_cells_sem);
-
- memset(cell, 0, sizeof(struct afs_cell));
- atomic_set(&cell->usage, 0);
+ memcpy(cell->name, name, namelen);
+ cell->name[namelen] = 0;
+ atomic_set(&cell->usage, 1);
INIT_LIST_HEAD(&cell->link);
-
- rwlock_init(&cell->sv_lock);
- INIT_LIST_HEAD(&cell->sv_list);
- INIT_LIST_HEAD(&cell->sv_graveyard);
- spin_lock_init(&cell->sv_gylock);
-
+ rwlock_init(&cell->servers_lock);
+ INIT_LIST_HEAD(&cell->servers);
init_rwsem(&cell->vl_sem);
INIT_LIST_HEAD(&cell->vl_list);
- INIT_LIST_HEAD(&cell->vl_graveyard);
- spin_lock_init(&cell->vl_gylock);
-
- strcpy(cell->name,name);
+ spin_lock_init(&cell->vl_lock);
/* fill in the VL server list from the rest of the string */
- ret = -EINVAL;
do {
unsigned a, b, c, d;
@@ -96,20 +71,75 @@ int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell)
*next++ = 0;
if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
- goto badaddr;
+ goto bad_address;
if (a > 255 || b > 255 || c > 255 || d > 255)
- goto badaddr;
+ goto bad_address;
cell->vl_addrs[cell->vl_naddrs++].s_addr =
htonl((a << 24) | (b << 16) | (c << 8) | d);
- if (cell->vl_naddrs >= AFS_CELL_MAX_ADDRS)
- break;
+ } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (vllist = next));
+
+ /* create a key to represent an anonymous user */
+ memcpy(keyname, "afs@", 4);
+ dp = keyname + 4;
+ cp = cell->name;
+ do {
+ *dp++ = toupper(*cp);
+ } while (*cp++);
+ cell->anonymous_key = key_alloc(&key_type_rxrpc, keyname, 0, 0, current,
+ KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA);
+ if (IS_ERR(cell->anonymous_key)) {
+ _debug("no key");
+ ret = PTR_ERR(cell->anonymous_key);
+ goto error;
+ }
+
+ ret = key_instantiate_and_link(cell->anonymous_key, NULL, 0,
+ NULL, NULL);
+ if (ret < 0) {
+ _debug("instantiate failed");
+ goto error;
+ }
+
+ _debug("anon key %p{%x}",
+ cell->anonymous_key, key_serial(cell->anonymous_key));
+
+ _leave(" = %p", cell);
+ return cell;
+
+bad_address:
+ printk(KERN_ERR "kAFS: bad VL server IP address\n");
+ ret = -EINVAL;
+error:
+ key_put(cell->anonymous_key);
+ kfree(cell);
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
+}
+
+/*
+ * create a cell record
+ * - "name" is the name of the cell
+ * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format
+ */
+struct afs_cell *afs_cell_create(const char *name, char *vllist)
+{
+ struct afs_cell *cell;
+ int ret;
+
+ _enter("%s,%s", name, vllist);
- } while(vllist = next, vllist);
+ cell = afs_cell_alloc(name, vllist);
+ if (IS_ERR(cell)) {
+ _leave(" = %ld", PTR_ERR(cell));
+ return cell;
+ }
+
+ down_write(&afs_cells_sem);
- /* add a proc dir for this cell */
+ /* add a proc directory for this cell */
ret = afs_proc_cell_setup(cell);
if (ret < 0)
goto error;
@@ -130,31 +160,28 @@ int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell)
down_write(&afs_proc_cells_sem);
list_add_tail(&cell->proc_link, &afs_proc_cells);
up_write(&afs_proc_cells_sem);
-
- *_cell = cell;
up_write(&afs_cells_sem);
- _leave(" = 0 (%p)", cell);
- return 0;
+ _leave(" = %p", cell);
+ return cell;
- badaddr:
- printk(KERN_ERR "kAFS: bad VL server IP address: '%s'\n", vllist);
- error:
+error:
up_write(&afs_cells_sem);
+ key_put(cell->anonymous_key);
kfree(cell);
_leave(" = %d", ret);
- return ret;
-} /* end afs_cell_create() */
+ return ERR_PTR(ret);
+}
-/*****************************************************************************/
/*
- * initialise the cell database from module parameters
+ * set the root cell information
+ * - can be called with a module parameter string
+ * - can be called from a write to /proc/fs/afs/rootcell
*/
int afs_cell_init(char *rootcell)
{
struct afs_cell *old_root, *new_root;
char *cp;
- int ret;
_enter("");
@@ -162,82 +189,60 @@ int afs_cell_init(char *rootcell)
/* module is loaded with no parameters, or built statically.
* - in the future we might initialize cell DB here.
*/
- _leave(" = 0 (but no root)");
+ _leave(" = 0 [no root]");
return 0;
}
cp = strchr(rootcell, ':');
if (!cp) {
printk(KERN_ERR "kAFS: no VL server IP addresses specified\n");
- _leave(" = %d (no colon)", -EINVAL);
+ _leave(" = -EINVAL");
return -EINVAL;
}
/* allocate a cell record for the root cell */
*cp++ = 0;
- ret = afs_cell_create(rootcell, cp, &new_root);
- if (ret < 0) {
- _leave(" = %d", ret);
- return ret;
+ new_root = afs_cell_create(rootcell, cp);
+ if (IS_ERR(new_root)) {
+ _leave(" = %ld", PTR_ERR(new_root));
+ return PTR_ERR(new_root);
}
- /* as afs_put_cell() takes locks by itself, we have to do
- * a little gymnastics to be race-free.
- */
- afs_get_cell(new_root);
-
+ /* install the new cell */
write_lock(&afs_cells_lock);
- while (afs_cell_root) {
- old_root = afs_cell_root;
- afs_cell_root = NULL;
- write_unlock(&afs_cells_lock);
- afs_put_cell(old_root);
- write_lock(&afs_cells_lock);
- }
+ old_root = afs_cell_root;
afs_cell_root = new_root;
write_unlock(&afs_cells_lock);
+ afs_put_cell(old_root);
- _leave(" = %d", ret);
- return ret;
-
-} /* end afs_cell_init() */
+ _leave(" = 0");
+ return 0;
+}
-/*****************************************************************************/
/*
* lookup a cell record
*/
-int afs_cell_lookup(const char *name, unsigned namesz, struct afs_cell **_cell)
+struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
{
struct afs_cell *cell;
- int ret;
_enter("\"%*.*s\",", namesz, namesz, name ? name : "");
- *_cell = NULL;
+ down_read(&afs_cells_sem);
+ read_lock(&afs_cells_lock);
if (name) {
/* if the cell was named, look for it in the cell record list */
- ret = -ENOENT;
- cell = NULL;
- read_lock(&afs_cells_lock);
-
list_for_each_entry(cell, &afs_cells, link) {
if (strncmp(cell->name, name, namesz) == 0) {
afs_get_cell(cell);
goto found;
}
}
- cell = NULL;
+ cell = ERR_PTR(-ENOENT);
found:
-
- read_unlock(&afs_cells_lock);
-
- if (cell)
- ret = 0;
- }
- else {
- read_lock(&afs_cells_lock);
-
+ ;
+ } else {
cell = afs_cell_root;
if (!cell) {
/* this should not happen unless user tries to mount
@@ -246,44 +251,35 @@ int afs_cell_lookup(const char *name, unsigned namesz, struct afs_cell **_cell)
* ENOENT might be "more appropriate" but they happen
* for other reasons.
*/
- ret = -EDESTADDRREQ;
- }
- else {
+ cell = ERR_PTR(-EDESTADDRREQ);
+ } else {
afs_get_cell(cell);
- ret = 0;
}
- read_unlock(&afs_cells_lock);
}
- *_cell = cell;
- _leave(" = %d (%p)", ret, cell);
- return ret;
-
-} /* end afs_cell_lookup() */
+ read_unlock(&afs_cells_lock);
+ up_read(&afs_cells_sem);
+ _leave(" = %p", cell);
+ return cell;
+}
-/*****************************************************************************/
/*
* try and get a cell record
*/
-struct afs_cell *afs_get_cell_maybe(struct afs_cell **_cell)
+struct afs_cell *afs_get_cell_maybe(struct afs_cell *cell)
{
- struct afs_cell *cell;
-
write_lock(&afs_cells_lock);
- cell = *_cell;
if (cell && !list_empty(&cell->link))
afs_get_cell(cell);
else
cell = NULL;
write_unlock(&afs_cells_lock);
-
return cell;
-} /* end afs_get_cell_maybe() */
+}
-/*****************************************************************************/
/*
* destroy a cell record
*/
@@ -294,8 +290,7 @@ void afs_put_cell(struct afs_cell *cell)
_enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name);
- /* sanity check */
- BUG_ON(atomic_read(&cell->usage) <= 0);
+ ASSERTCMP(atomic_read(&cell->usage), >, 0);
/* to prevent a race, the decrement and the dequeue must be effectively
* atomic */
@@ -307,36 +302,49 @@ void afs_put_cell(struct afs_cell *cell)
return;
}
+ ASSERT(list_empty(&cell->servers));
+ ASSERT(list_empty(&cell->vl_list));
+
write_unlock(&afs_cells_lock);
- BUG_ON(!list_empty(&cell->sv_list));
- BUG_ON(!list_empty(&cell->sv_graveyard));
- BUG_ON(!list_empty(&cell->vl_list));
- BUG_ON(!list_empty(&cell->vl_graveyard));
+ wake_up(&afs_cells_freeable_wq);
_leave(" [unused]");
-} /* end afs_put_cell() */
+}
-/*****************************************************************************/
/*
* destroy a cell record
+ * - must be called with the afs_cells_sem write-locked
+ * - cell->link should have been broken by the caller
*/
static void afs_cell_destroy(struct afs_cell *cell)
{
_enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name);
- /* to prevent a race, the decrement and the dequeue must be effectively
- * atomic */
- write_lock(&afs_cells_lock);
+ ASSERTCMP(atomic_read(&cell->usage), >=, 0);
+ ASSERT(list_empty(&cell->link));
- /* sanity check */
- BUG_ON(atomic_read(&cell->usage) != 0);
+ /* wait for everyone to stop using the cell */
+ if (atomic_read(&cell->usage) > 0) {
+ DECLARE_WAITQUEUE(myself, current);
- list_del_init(&cell->link);
+ _debug("wait for cell %s", cell->name);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue(&afs_cells_freeable_wq, &myself);
- write_unlock(&afs_cells_lock);
+ while (atomic_read(&cell->usage) > 0) {
+ schedule();
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ }
- down_write(&afs_cells_sem);
+ remove_wait_queue(&afs_cells_freeable_wq, &myself);
+ set_current_state(TASK_RUNNING);
+ }
+
+ _debug("cell dead");
+ ASSERTCMP(atomic_read(&cell->usage), ==, 0);
+ ASSERT(list_empty(&cell->servers));
+ ASSERT(list_empty(&cell->vl_list));
afs_proc_cell_remove(cell);
@@ -348,104 +356,26 @@ static void afs_cell_destroy(struct afs_cell *cell)
cachefs_relinquish_cookie(cell->cache, 0);
#endif
- up_write(&afs_cells_sem);
-
- BUG_ON(!list_empty(&cell->sv_list));
- BUG_ON(!list_empty(&cell->sv_graveyard));
- BUG_ON(!list_empty(&cell->vl_list));
- BUG_ON(!list_empty(&cell->vl_graveyard));
-
- /* finish cleaning up the cell */
+ key_put(cell->anonymous_key);
kfree(cell);
_leave(" [destroyed]");
-} /* end afs_cell_destroy() */
-
-/*****************************************************************************/
-/*
- * lookup the server record corresponding to an Rx RPC peer
- */
-int afs_server_find_by_peer(const struct rxrpc_peer *peer,
- struct afs_server **_server)
-{
- struct afs_server *server;
- struct afs_cell *cell;
-
- _enter("%p{a=%08x},", peer, ntohl(peer->addr.s_addr));
-
- /* search the cell list */
- read_lock(&afs_cells_lock);
-
- list_for_each_entry(cell, &afs_cells, link) {
-
- _debug("? cell %s",cell->name);
-
- write_lock(&cell->sv_lock);
-
- /* check the active list */
- list_for_each_entry(server, &cell->sv_list, link) {
- _debug("?? server %08x", ntohl(server->addr.s_addr));
-
- if (memcmp(&server->addr, &peer->addr,
- sizeof(struct in_addr)) == 0)
- goto found_server;
- }
+}
- /* check the inactive list */
- spin_lock(&cell->sv_gylock);
- list_for_each_entry(server, &cell->sv_graveyard, link) {
- _debug("?? dead server %08x",
- ntohl(server->addr.s_addr));
-
- if (memcmp(&server->addr, &peer->addr,
- sizeof(struct in_addr)) == 0)
- goto found_dead_server;
- }
- spin_unlock(&cell->sv_gylock);
-
- write_unlock(&cell->sv_lock);
- }
- read_unlock(&afs_cells_lock);
-
- _leave(" = -ENOENT");
- return -ENOENT;
-
- /* we found it in the graveyard - resurrect it */
- found_dead_server:
- list_move_tail(&server->link, &cell->sv_list);
- afs_get_server(server);
- afs_kafstimod_del_timer(&server->timeout);
- spin_unlock(&cell->sv_gylock);
- goto success;
-
- /* we found it - increment its ref count and return it */
- found_server:
- afs_get_server(server);
-
- success:
- write_unlock(&cell->sv_lock);
- read_unlock(&afs_cells_lock);
-
- *_server = server;
- _leave(" = 0 (s=%p c=%p)", server, cell);
- return 0;
-
-} /* end afs_server_find_by_peer() */
-
-/*****************************************************************************/
/*
* purge in-memory cell database on module unload or afs_init() failure
* - the timeout daemon is stopped before calling this
*/
void afs_cell_purge(void)
{
- struct afs_vlocation *vlocation;
struct afs_cell *cell;
_enter("");
afs_put_cell(afs_cell_root);
+ down_write(&afs_cells_sem);
+
while (!list_empty(&afs_cells)) {
cell = NULL;
@@ -464,104 +394,11 @@ void afs_cell_purge(void)
_debug("PURGING CELL %s (%d)",
cell->name, atomic_read(&cell->usage));
- BUG_ON(!list_empty(&cell->sv_list));
- BUG_ON(!list_empty(&cell->vl_list));
-
- /* purge the cell's VL graveyard list */
- _debug(" - clearing VL graveyard");
-
- spin_lock(&cell->vl_gylock);
-
- while (!list_empty(&cell->vl_graveyard)) {
- vlocation = list_entry(cell->vl_graveyard.next,
- struct afs_vlocation,
- link);
- list_del_init(&vlocation->link);
-
- afs_kafstimod_del_timer(&vlocation->timeout);
-
- spin_unlock(&cell->vl_gylock);
-
- afs_vlocation_do_timeout(vlocation);
- /* TODO: race if move to use krxtimod instead
- * of kafstimod */
-
- spin_lock(&cell->vl_gylock);
- }
-
- spin_unlock(&cell->vl_gylock);
-
- /* purge the cell's server graveyard list */
- _debug(" - clearing server graveyard");
-
- spin_lock(&cell->sv_gylock);
-
- while (!list_empty(&cell->sv_graveyard)) {
- struct afs_server *server;
-
- server = list_entry(cell->sv_graveyard.next,
- struct afs_server, link);
- list_del_init(&server->link);
-
- afs_kafstimod_del_timer(&server->timeout);
-
- spin_unlock(&cell->sv_gylock);
-
- afs_server_do_timeout(server);
-
- spin_lock(&cell->sv_gylock);
- }
-
- spin_unlock(&cell->sv_gylock);
-
/* now the cell should be left with no references */
afs_cell_destroy(cell);
}
}
+ up_write(&afs_cells_sem);
_leave("");
-} /* end afs_cell_purge() */
-
-/*****************************************************************************/
-/*
- * match a cell record obtained from the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
- const void *entry)
-{
- const struct afs_cache_cell *ccell = entry;
- struct afs_cell *cell = target;
-
- _enter("{%s},{%s}", ccell->name, cell->name);
-
- if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
- _leave(" = SUCCESS");
- return CACHEFS_MATCH_SUCCESS;
- }
-
- _leave(" = FAILED");
- return CACHEFS_MATCH_FAILED;
-} /* end afs_cell_cache_match() */
-#endif
-
-/*****************************************************************************/
-/*
- * update a cell record in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_cell_cache_update(void *source, void *entry)
-{
- struct afs_cache_cell *ccell = entry;
- struct afs_cell *cell = source;
-
- _enter("%p,%p", source, entry);
-
- strncpy(ccell->name, cell->name, sizeof(ccell->name));
-
- memcpy(ccell->vl_servers,
- cell->vl_addrs,
- min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
-
-} /* end afs_cell_cache_update() */
-#endif
+}
diff --git a/fs/afs/cell.h b/fs/afs/cell.h
deleted file mode 100644
index 48349108fb00..000000000000
--- a/fs/afs/cell.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* cell.h: AFS cell record
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_CELL_H
-#define _LINUX_AFS_CELL_H
-
-#include "types.h"
-#include "cache.h"
-
-#define AFS_CELL_MAX_ADDRS 15
-
-extern volatile int afs_cells_being_purged; /* T when cells are being purged by rmmod */
-
-/*****************************************************************************/
-/*
- * entry in the cached cell catalogue
- */
-struct afs_cache_cell
-{
- char name[64]; /* cell name (padded with NULs) */
- struct in_addr vl_servers[15]; /* cached cell VL servers */
-};
-
-/*****************************************************************************/
-/*
- * AFS cell record
- */
-struct afs_cell
-{
- atomic_t usage;
- struct list_head link; /* main cell list link */
- struct list_head proc_link; /* /proc cell list link */
- struct proc_dir_entry *proc_dir; /* /proc dir for this cell */
-#ifdef AFS_CACHING_SUPPORT
- struct cachefs_cookie *cache; /* caching cookie */
-#endif
-
- /* server record management */
- rwlock_t sv_lock; /* active server list lock */
- struct list_head sv_list; /* active server list */
- struct list_head sv_graveyard; /* inactive server list */
- spinlock_t sv_gylock; /* inactive server list lock */
-
- /* volume location record management */
- struct rw_semaphore vl_sem; /* volume management serialisation semaphore */
- struct list_head vl_list; /* cell's active VL record list */
- struct list_head vl_graveyard; /* cell's inactive VL record list */
- spinlock_t vl_gylock; /* graveyard lock */
- unsigned short vl_naddrs; /* number of VL servers in addr list */
- unsigned short vl_curr_svix; /* current server index */
- struct in_addr vl_addrs[AFS_CELL_MAX_ADDRS]; /* cell VL server addresses */
-
- char name[0]; /* cell name - must go last */
-};
-
-extern int afs_cell_init(char *rootcell);
-
-extern int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell);
-
-extern int afs_cell_lookup(const char *name, unsigned nmsize, struct afs_cell **_cell);
-
-#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
-
-extern struct afs_cell *afs_get_cell_maybe(struct afs_cell **_cell);
-
-extern void afs_put_cell(struct afs_cell *cell);
-
-extern void afs_cell_purge(void);
-
-#endif /* _LINUX_AFS_CELL_H */
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 3d097fddcb7a..6685f4cbccb3 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -1,4 +1,4 @@
-/* cmservice.c: AFS Cache Manager Service
+/* AFS Cache Manager Service
*
* Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -12,641 +12,463 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
-#include <linux/completion.h>
-#include "server.h"
-#include "cell.h"
-#include "transport.h"
-#include <rxrpc/rxrpc.h>
-#include <rxrpc/transport.h>
-#include <rxrpc/connection.h>
-#include <rxrpc/call.h>
-#include "cmservice.h"
+#include <linux/ip.h>
#include "internal.h"
+#include "afs_cm.h"
-static unsigned afscm_usage; /* AFS cache manager usage count */
-static struct rw_semaphore afscm_sem; /* AFS cache manager start/stop semaphore */
-
-static int afscm_new_call(struct rxrpc_call *call);
-static void afscm_attention(struct rxrpc_call *call);
-static void afscm_error(struct rxrpc_call *call);
-static void afscm_aemap(struct rxrpc_call *call);
-
-static void _SRXAFSCM_CallBack(struct rxrpc_call *call);
-static void _SRXAFSCM_InitCallBackState(struct rxrpc_call *call);
-static void _SRXAFSCM_Probe(struct rxrpc_call *call);
-
-typedef void (*_SRXAFSCM_xxxx_t)(struct rxrpc_call *call);
-
-static const struct rxrpc_operation AFSCM_ops[] = {
- {
- .id = 204,
- .asize = RXRPC_APP_MARK_EOF,
- .name = "CallBack",
- .user = _SRXAFSCM_CallBack,
- },
- {
- .id = 205,
- .asize = RXRPC_APP_MARK_EOF,
- .name = "InitCallBackState",
- .user = _SRXAFSCM_InitCallBackState,
- },
- {
- .id = 206,
- .asize = RXRPC_APP_MARK_EOF,
- .name = "Probe",
- .user = _SRXAFSCM_Probe,
- },
-#if 0
- {
- .id = 207,
- .asize = RXRPC_APP_MARK_EOF,
- .name = "GetLock",
- .user = _SRXAFSCM_GetLock,
- },
- {
- .id = 208,
- .asize = RXRPC_APP_MARK_EOF,
- .name = "GetCE",
- .user = _SRXAFSCM_GetCE,
- },
- {
- .id = 209,
- .asize = RXRPC_APP_MARK_EOF,
- .name = "GetXStatsVersion",
- .user = _SRXAFSCM_GetXStatsVersion,
- },
- {
- .id = 210,
- .asize = RXRPC_APP_MARK_EOF,
- .name = "GetXStats",
- .user = _SRXAFSCM_GetXStats,
- }
-#endif
-};
+struct workqueue_struct *afs_cm_workqueue;
-static struct rxrpc_service AFSCM_service = {
- .name = "AFS/CM",
- .owner = THIS_MODULE,
- .link = LIST_HEAD_INIT(AFSCM_service.link),
- .new_call = afscm_new_call,
- .service_id = 1,
- .attn_func = afscm_attention,
- .error_func = afscm_error,
- .aemap_func = afscm_aemap,
- .ops_begin = &AFSCM_ops[0],
- .ops_end = &AFSCM_ops[ARRAY_SIZE(AFSCM_ops)],
-};
+static int afs_deliver_cb_init_call_back_state(struct afs_call *,
+ struct sk_buff *, bool);
+static int afs_deliver_cb_init_call_back_state3(struct afs_call *,
+ struct sk_buff *, bool);
+static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool);
+static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool);
+static int afs_deliver_cb_get_capabilities(struct afs_call *, struct sk_buff *,
+ bool);
+static void afs_cm_destructor(struct afs_call *);
-static DECLARE_COMPLETION(kafscmd_alive);
-static DECLARE_COMPLETION(kafscmd_dead);
-static DECLARE_WAIT_QUEUE_HEAD(kafscmd_sleepq);
-static LIST_HEAD(kafscmd_attention_list);
-static LIST_HEAD(afscm_calls);
-static DEFINE_SPINLOCK(afscm_calls_lock);
-static DEFINE_SPINLOCK(kafscmd_attention_lock);
-static int kafscmd_die;
-
-/*****************************************************************************/
/*
- * AFS Cache Manager kernel thread
+ * CB.CallBack operation type
*/
-static int kafscmd(void *arg)
-{
- DECLARE_WAITQUEUE(myself, current);
-
- struct rxrpc_call *call;
- _SRXAFSCM_xxxx_t func;
- int die;
-
- printk(KERN_INFO "kAFS: Started kafscmd %d\n", current->pid);
-
- daemonize("kafscmd");
-
- complete(&kafscmd_alive);
-
- /* loop around looking for things to attend to */
- do {
- if (list_empty(&kafscmd_attention_list)) {
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kafscmd_sleepq, &myself);
-
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (!list_empty(&kafscmd_attention_list) ||
- signal_pending(current) ||
- kafscmd_die)
- break;
-
- schedule();
- }
-
- remove_wait_queue(&kafscmd_sleepq, &myself);
- set_current_state(TASK_RUNNING);
- }
-
- die = kafscmd_die;
-
- /* dequeue the next call requiring attention */
- call = NULL;
- spin_lock(&kafscmd_attention_lock);
-
- if (!list_empty(&kafscmd_attention_list)) {
- call = list_entry(kafscmd_attention_list.next,
- struct rxrpc_call,
- app_attn_link);
- list_del_init(&call->app_attn_link);
- die = 0;
- }
-
- spin_unlock(&kafscmd_attention_lock);
-
- if (call) {
- /* act upon it */
- _debug("@@@ Begin Attend Call %p", call);
-
- func = call->app_user;
- if (func)
- func(call);
-
- rxrpc_put_call(call);
-
- _debug("@@@ End Attend Call %p", call);
- }
-
- } while(!die);
-
- /* and that's all */
- complete_and_exit(&kafscmd_dead, 0);
-
-} /* end kafscmd() */
+static const struct afs_call_type afs_SRXCBCallBack = {
+ .name = "CB.CallBack",
+ .deliver = afs_deliver_cb_callback,
+ .abort_to_error = afs_abort_to_error,
+ .destructor = afs_cm_destructor,
+};
-/*****************************************************************************/
/*
- * handle a call coming in to the cache manager
- * - if I want to keep the call, I must increment its usage count
- * - the return value will be negated and passed back in an abort packet if
- * non-zero
- * - serialised by virtue of there only being one krxiod
+ * CB.InitCallBackState operation type
*/
-static int afscm_new_call(struct rxrpc_call *call)
-{
- _enter("%p{cid=%u u=%d}",
- call, ntohl(call->call_id), atomic_read(&call->usage));
-
- rxrpc_get_call(call);
-
- /* add to my current call list */
- spin_lock(&afscm_calls_lock);
- list_add(&call->app_link,&afscm_calls);
- spin_unlock(&afscm_calls_lock);
-
- _leave(" = 0");
- return 0;
-
-} /* end afscm_new_call() */
+static const struct afs_call_type afs_SRXCBInitCallBackState = {
+ .name = "CB.InitCallBackState",
+ .deliver = afs_deliver_cb_init_call_back_state,
+ .abort_to_error = afs_abort_to_error,
+ .destructor = afs_cm_destructor,
+};
-/*****************************************************************************/
/*
- * queue on the kafscmd queue for attention
+ * CB.InitCallBackState3 operation type
*/
-static void afscm_attention(struct rxrpc_call *call)
-{
- _enter("%p{cid=%u u=%d}",
- call, ntohl(call->call_id), atomic_read(&call->usage));
-
- spin_lock(&kafscmd_attention_lock);
-
- if (list_empty(&call->app_attn_link)) {
- list_add_tail(&call->app_attn_link, &kafscmd_attention_list);
- rxrpc_get_call(call);
- }
-
- spin_unlock(&kafscmd_attention_lock);
-
- wake_up(&kafscmd_sleepq);
-
- _leave(" {u=%d}", atomic_read(&call->usage));
-} /* end afscm_attention() */
+static const struct afs_call_type afs_SRXCBInitCallBackState3 = {
+ .name = "CB.InitCallBackState3",
+ .deliver = afs_deliver_cb_init_call_back_state3,
+ .abort_to_error = afs_abort_to_error,
+ .destructor = afs_cm_destructor,
+};
-/*****************************************************************************/
/*
- * handle my call being aborted
- * - clean up, dequeue and put my ref to the call
+ * CB.Probe operation type
*/
-static void afscm_error(struct rxrpc_call *call)
-{
- int removed;
-
- _enter("%p{est=%s ac=%u er=%d}",
- call,
- rxrpc_call_error_states[call->app_err_state],
- call->app_abort_code,
- call->app_errno);
-
- spin_lock(&kafscmd_attention_lock);
-
- if (list_empty(&call->app_attn_link)) {
- list_add_tail(&call->app_attn_link, &kafscmd_attention_list);
- rxrpc_get_call(call);
- }
-
- spin_unlock(&kafscmd_attention_lock);
-
- removed = 0;
- spin_lock(&afscm_calls_lock);
- if (!list_empty(&call->app_link)) {
- list_del_init(&call->app_link);
- removed = 1;
- }
- spin_unlock(&afscm_calls_lock);
-
- if (removed)
- rxrpc_put_call(call);
-
- wake_up(&kafscmd_sleepq);
+static const struct afs_call_type afs_SRXCBProbe = {
+ .name = "CB.Probe",
+ .deliver = afs_deliver_cb_probe,
+ .abort_to_error = afs_abort_to_error,
+ .destructor = afs_cm_destructor,
+};
- _leave("");
-} /* end afscm_error() */
+/*
+ * CB.GetCapabilities operation type
+ */
+static const struct afs_call_type afs_SRXCBGetCapabilites = {
+ .name = "CB.GetCapabilities",
+ .deliver = afs_deliver_cb_get_capabilities,
+ .abort_to_error = afs_abort_to_error,
+ .destructor = afs_cm_destructor,
+};
-/*****************************************************************************/
/*
- * map afs abort codes to/from Linux error codes
- * - called with call->lock held
+ * route an incoming cache manager call
+ * - return T if supported, F if not
*/
-static void afscm_aemap(struct rxrpc_call *call)
+bool afs_cm_incoming_call(struct afs_call *call)
{
- switch (call->app_err_state) {
- case RXRPC_ESTATE_LOCAL_ABORT:
- call->app_abort_code = -call->app_errno;
- break;
- case RXRPC_ESTATE_PEER_ABORT:
- call->app_errno = -ECONNABORTED;
- break;
+ u32 operation_id = ntohl(call->operation_ID);
+
+ _enter("{CB.OP %u}", operation_id);
+
+ switch (operation_id) {
+ case CBCallBack:
+ call->type = &afs_SRXCBCallBack;
+ return true;
+ case CBInitCallBackState:
+ call->type = &afs_SRXCBInitCallBackState;
+ return true;
+ case CBInitCallBackState3:
+ call->type = &afs_SRXCBInitCallBackState3;
+ return true;
+ case CBProbe:
+ call->type = &afs_SRXCBProbe;
+ return true;
+ case CBGetCapabilities:
+ call->type = &afs_SRXCBGetCapabilites;
+ return true;
default:
- break;
+ return false;
}
-} /* end afscm_aemap() */
+}
-/*****************************************************************************/
/*
- * start the cache manager service if not already started
+ * clean up a cache manager call
*/
-int afscm_start(void)
+static void afs_cm_destructor(struct afs_call *call)
{
- int ret;
-
- down_write(&afscm_sem);
- if (!afscm_usage) {
- ret = kernel_thread(kafscmd, NULL, 0);
- if (ret < 0)
- goto out;
-
- wait_for_completion(&kafscmd_alive);
-
- ret = rxrpc_add_service(afs_transport, &AFSCM_service);
- if (ret < 0)
- goto kill;
-
- afs_kafstimod_add_timer(&afs_mntpt_expiry_timer,
- afs_mntpt_expiry_timeout * HZ);
- }
-
- afscm_usage++;
- up_write(&afscm_sem);
-
- return 0;
-
- kill:
- kafscmd_die = 1;
- wake_up(&kafscmd_sleepq);
- wait_for_completion(&kafscmd_dead);
-
- out:
- up_write(&afscm_sem);
- return ret;
+ _enter("");
-} /* end afscm_start() */
+ afs_put_server(call->server);
+ call->server = NULL;
+ kfree(call->buffer);
+ call->buffer = NULL;
+}
-/*****************************************************************************/
/*
- * stop the cache manager service
+ * allow the fileserver to see if the cache manager is still alive
*/
-void afscm_stop(void)
+static void SRXAFSCB_CallBack(struct work_struct *work)
{
- struct rxrpc_call *call;
+ struct afs_call *call = container_of(work, struct afs_call, work);
- down_write(&afscm_sem);
+ _enter("");
- BUG_ON(afscm_usage == 0);
- afscm_usage--;
+ /* be sure to send the reply *before* attempting to spam the AFS server
+ * with FSFetchStatus requests on the vnodes with broken callbacks lest
+ * the AFS server get into a vicious cycle of trying to break further
+ * callbacks because it hadn't received completion of the CBCallBack op
+ * yet */
+ afs_send_empty_reply(call);
- if (afscm_usage == 0) {
- /* don't want more incoming calls */
- rxrpc_del_service(afs_transport, &AFSCM_service);
-
- /* abort any calls I've still got open (the afscm_error() will
- * dequeue them) */
- spin_lock(&afscm_calls_lock);
- while (!list_empty(&afscm_calls)) {
- call = list_entry(afscm_calls.next,
- struct rxrpc_call,
- app_link);
+ afs_break_callbacks(call->server, call->count, call->request);
+ _leave("");
+}
- list_del_init(&call->app_link);
- rxrpc_get_call(call);
- spin_unlock(&afscm_calls_lock);
+/*
+ * deliver request data to a CB.CallBack call
+ */
+static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
+ bool last)
+{
+ struct afs_callback *cb;
+ struct afs_server *server;
+ struct in_addr addr;
+ __be32 *bp;
+ u32 tmp;
+ int ret, loop;
+
+ _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+ switch (call->unmarshall) {
+ case 0:
+ call->offset = 0;
+ call->unmarshall++;
+
+ /* extract the FID array and its count in two steps */
+ case 1:
+ _debug("extract FID count");
+ ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+ switch (ret) {
+ case 0: break;
+ case -EAGAIN: return 0;
+ default: return ret;
+ }
- rxrpc_call_abort(call, -ESRCH); /* abort, dequeue and
- * put */
+ call->count = ntohl(call->tmp);
+ _debug("FID count: %u", call->count);
+ if (call->count > AFSCBMAX)
+ return -EBADMSG;
+
+ call->buffer = kmalloc(call->count * 3 * 4, GFP_KERNEL);
+ if (!call->buffer)
+ return -ENOMEM;
+ call->offset = 0;
+ call->unmarshall++;
+
+ case 2:
+ _debug("extract FID array");
+ ret = afs_extract_data(call, skb, last, call->buffer,
+ call->count * 3 * 4);
+ switch (ret) {
+ case 0: break;
+ case -EAGAIN: return 0;
+ default: return ret;
+ }
- _debug("nuking active call %08x.%d",
- ntohl(call->conn->conn_id),
- ntohl(call->call_id));
- rxrpc_put_call(call);
- rxrpc_put_call(call);
+ _debug("unmarshall FID array");
+ call->request = kcalloc(call->count,
+ sizeof(struct afs_callback),
+ GFP_KERNEL);
+ if (!call->request)
+ return -ENOMEM;
+
+ cb = call->request;
+ bp = call->buffer;
+ for (loop = call->count; loop > 0; loop--, cb++) {
+ cb->fid.vid = ntohl(*bp++);
+ cb->fid.vnode = ntohl(*bp++);
+ cb->fid.unique = ntohl(*bp++);
+ cb->type = AFSCM_CB_UNTYPED;
+ }
- spin_lock(&afscm_calls_lock);
+ call->offset = 0;
+ call->unmarshall++;
+
+ /* extract the callback array and its count in two steps */
+ case 3:
+ _debug("extract CB count");
+ ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+ switch (ret) {
+ case 0: break;
+ case -EAGAIN: return 0;
+ default: return ret;
}
- spin_unlock(&afscm_calls_lock);
- /* get rid of my daemon */
- kafscmd_die = 1;
- wake_up(&kafscmd_sleepq);
- wait_for_completion(&kafscmd_dead);
+ tmp = ntohl(call->tmp);
+ _debug("CB count: %u", tmp);
+ if (tmp != call->count && tmp != 0)
+ return -EBADMSG;
+ call->offset = 0;
+ call->unmarshall++;
+ if (tmp == 0)
+ goto empty_cb_array;
+
+ case 4:
+ _debug("extract CB array");
+ ret = afs_extract_data(call, skb, last, call->request,
+ call->count * 3 * 4);
+ switch (ret) {
+ case 0: break;
+ case -EAGAIN: return 0;
+ default: return ret;
+ }
- /* dispose of any calls waiting for attention */
- spin_lock(&kafscmd_attention_lock);
- while (!list_empty(&kafscmd_attention_list)) {
- call = list_entry(kafscmd_attention_list.next,
- struct rxrpc_call,
- app_attn_link);
+ _debug("unmarshall CB array");
+ cb = call->request;
+ bp = call->buffer;
+ for (loop = call->count; loop > 0; loop--, cb++) {
+ cb->version = ntohl(*bp++);
+ cb->expiry = ntohl(*bp++);
+ cb->type = ntohl(*bp++);
+ }
- list_del_init(&call->app_attn_link);
- spin_unlock(&kafscmd_attention_lock);
+ empty_cb_array:
+ call->offset = 0;
+ call->unmarshall++;
- rxrpc_put_call(call);
+ case 5:
+ _debug("trailer");
+ if (skb->len != 0)
+ return -EBADMSG;
+ break;
+ }
- spin_lock(&kafscmd_attention_lock);
- }
- spin_unlock(&kafscmd_attention_lock);
+ if (!last)
+ return 0;
- afs_kafstimod_del_timer(&afs_mntpt_expiry_timer);
- }
+ call->state = AFS_CALL_REPLYING;
- up_write(&afscm_sem);
+ /* we'll need the file server record as that tells us which set of
+ * vnodes to operate upon */
+ memcpy(&addr, &ip_hdr(skb)->saddr, 4);
+ server = afs_find_server(&addr);
+ if (!server)
+ return -ENOTCONN;
+ call->server = server;
-} /* end afscm_stop() */
+ INIT_WORK(&call->work, SRXAFSCB_CallBack);
+ schedule_work(&call->work);
+ return 0;
+}
-/*****************************************************************************/
/*
- * handle the fileserver breaking a set of callbacks
+ * allow the fileserver to request callback state (re-)initialisation
*/
-static void _SRXAFSCM_CallBack(struct rxrpc_call *call)
+static void SRXAFSCB_InitCallBackState(struct work_struct *work)
{
- struct afs_server *server;
- size_t count, qty, tmp;
- int ret = 0, removed;
-
- _enter("%p{acs=%s}", call, rxrpc_call_states[call->app_call_state]);
-
- server = afs_server_get_from_peer(call->conn->peer);
-
- switch (call->app_call_state) {
- /* we've received the last packet
- * - drain all the data from the call and send the reply
- */
- case RXRPC_CSTATE_SRVR_GOT_ARGS:
- ret = -EBADMSG;
- qty = call->app_ready_qty;
- if (qty < 8 || qty > 50 * (6 * 4) + 8)
- break;
-
- {
- struct afs_callback *cb, *pcb;
- int loop;
- __be32 *fp, *bp;
-
- fp = rxrpc_call_alloc_scratch(call, qty);
-
- /* drag the entire argument block out to the scratch
- * space */
- ret = rxrpc_call_read_data(call, fp, qty, 0);
- if (ret < 0)
- break;
-
- /* and unmarshall the parameter block */
- ret = -EBADMSG;
- count = ntohl(*fp++);
- if (count>AFSCBMAX ||
- (count * (3 * 4) + 8 != qty &&
- count * (6 * 4) + 8 != qty))
- break;
-
- bp = fp + count*3;
- tmp = ntohl(*bp++);
- if (tmp > 0 && tmp != count)
- break;
- if (tmp == 0)
- bp = NULL;
-
- pcb = cb = rxrpc_call_alloc_scratch_s(
- call, struct afs_callback);
-
- for (loop = count - 1; loop >= 0; loop--) {
- pcb->fid.vid = ntohl(*fp++);
- pcb->fid.vnode = ntohl(*fp++);
- pcb->fid.unique = ntohl(*fp++);
- if (bp) {
- pcb->version = ntohl(*bp++);
- pcb->expiry = ntohl(*bp++);
- pcb->type = ntohl(*bp++);
- }
- else {
- pcb->version = 0;
- pcb->expiry = 0;
- pcb->type = AFSCM_CB_UNTYPED;
- }
- pcb++;
- }
-
- /* invoke the actual service routine */
- ret = SRXAFSCM_CallBack(server, count, cb);
- if (ret < 0)
- break;
- }
+ struct afs_call *call = container_of(work, struct afs_call, work);
- /* send the reply */
- ret = rxrpc_call_write_data(call, 0, NULL, RXRPC_LAST_PACKET,
- GFP_KERNEL, 0, &count);
- if (ret < 0)
- break;
- break;
-
- /* operation complete */
- case RXRPC_CSTATE_COMPLETE:
- call->app_user = NULL;
- removed = 0;
- spin_lock(&afscm_calls_lock);
- if (!list_empty(&call->app_link)) {
- list_del_init(&call->app_link);
- removed = 1;
- }
- spin_unlock(&afscm_calls_lock);
+ _enter("{%p}", call->server);
- if (removed)
- rxrpc_put_call(call);
- break;
+ afs_init_callback_state(call->server);
+ afs_send_empty_reply(call);
+ _leave("");
+}
- /* operation terminated on error */
- case RXRPC_CSTATE_ERROR:
- call->app_user = NULL;
- break;
+/*
+ * deliver request data to a CB.InitCallBackState call
+ */
+static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
+ struct sk_buff *skb,
+ bool last)
+{
+ struct afs_server *server;
+ struct in_addr addr;
- default:
- break;
- }
+ _enter(",{%u},%d", skb->len, last);
- if (ret < 0)
- rxrpc_call_abort(call, ret);
+ if (skb->len > 0)
+ return -EBADMSG;
+ if (!last)
+ return 0;
- afs_put_server(server);
+ /* no unmarshalling required */
+ call->state = AFS_CALL_REPLYING;
- _leave(" = %d", ret);
+ /* we'll need the file server record as that tells us which set of
+ * vnodes to operate upon */
+ memcpy(&addr, &ip_hdr(skb)->saddr, 4);
+ server = afs_find_server(&addr);
+ if (!server)
+ return -ENOTCONN;
+ call->server = server;
-} /* end _SRXAFSCM_CallBack() */
+ INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
+ schedule_work(&call->work);
+ return 0;
+}
-/*****************************************************************************/
/*
- * handle the fileserver asking us to initialise our callback state
+ * deliver request data to a CB.InitCallBackState3 call
*/
-static void _SRXAFSCM_InitCallBackState(struct rxrpc_call *call)
+static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
+ struct sk_buff *skb,
+ bool last)
{
struct afs_server *server;
- size_t count;
- int ret = 0, removed;
+ struct in_addr addr;
- _enter("%p{acs=%s}", call, rxrpc_call_states[call->app_call_state]);
+ _enter(",{%u},%d", skb->len, last);
- server = afs_server_get_from_peer(call->conn->peer);
+ if (!last)
+ return 0;
- switch (call->app_call_state) {
- /* we've received the last packet - drain all the data from the
- * call */
- case RXRPC_CSTATE_SRVR_GOT_ARGS:
- /* shouldn't be any args */
- ret = -EBADMSG;
- break;
-
- /* send the reply when asked for it */
- case RXRPC_CSTATE_SRVR_SND_REPLY:
- /* invoke the actual service routine */
- ret = SRXAFSCM_InitCallBackState(server);
- if (ret < 0)
- break;
-
- ret = rxrpc_call_write_data(call, 0, NULL, RXRPC_LAST_PACKET,
- GFP_KERNEL, 0, &count);
- if (ret < 0)
- break;
- break;
+ /* no unmarshalling required */
+ call->state = AFS_CALL_REPLYING;
- /* operation complete */
- case RXRPC_CSTATE_COMPLETE:
- call->app_user = NULL;
- removed = 0;
- spin_lock(&afscm_calls_lock);
- if (!list_empty(&call->app_link)) {
- list_del_init(&call->app_link);
- removed = 1;
- }
- spin_unlock(&afscm_calls_lock);
+ /* we'll need the file server record as that tells us which set of
+ * vnodes to operate upon */
+ memcpy(&addr, &ip_hdr(skb)->saddr, 4);
+ server = afs_find_server(&addr);
+ if (!server)
+ return -ENOTCONN;
+ call->server = server;
- if (removed)
- rxrpc_put_call(call);
- break;
-
- /* operation terminated on error */
- case RXRPC_CSTATE_ERROR:
- call->app_user = NULL;
- break;
-
- default:
- break;
- }
-
- if (ret < 0)
- rxrpc_call_abort(call, ret);
-
- afs_put_server(server);
+ INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
+ schedule_work(&call->work);
+ return 0;
+}
- _leave(" = %d", ret);
+/*
+ * allow the fileserver to see if the cache manager is still alive
+ */
+static void SRXAFSCB_Probe(struct work_struct *work)
+{
+ struct afs_call *call = container_of(work, struct afs_call, work);
-} /* end _SRXAFSCM_InitCallBackState() */
+ _enter("");
+ afs_send_empty_reply(call);
+ _leave("");
+}
-/*****************************************************************************/
/*
- * handle a probe from a fileserver
+ * deliver request data to a CB.Probe call
*/
-static void _SRXAFSCM_Probe(struct rxrpc_call *call)
+static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
+ bool last)
{
- struct afs_server *server;
- size_t count;
- int ret = 0, removed;
-
- _enter("%p{acs=%s}", call, rxrpc_call_states[call->app_call_state]);
+ _enter(",{%u},%d", skb->len, last);
- server = afs_server_get_from_peer(call->conn->peer);
+ if (skb->len > 0)
+ return -EBADMSG;
+ if (!last)
+ return 0;
- switch (call->app_call_state) {
- /* we've received the last packet - drain all the data from the
- * call */
- case RXRPC_CSTATE_SRVR_GOT_ARGS:
- /* shouldn't be any args */
- ret = -EBADMSG;
- break;
+ /* no unmarshalling required */
+ call->state = AFS_CALL_REPLYING;
- /* send the reply when asked for it */
- case RXRPC_CSTATE_SRVR_SND_REPLY:
- /* invoke the actual service routine */
- ret = SRXAFSCM_Probe(server);
- if (ret < 0)
- break;
-
- ret = rxrpc_call_write_data(call, 0, NULL, RXRPC_LAST_PACKET,
- GFP_KERNEL, 0, &count);
- if (ret < 0)
- break;
- break;
+ INIT_WORK(&call->work, SRXAFSCB_Probe);
+ schedule_work(&call->work);
+ return 0;
+}
- /* operation complete */
- case RXRPC_CSTATE_COMPLETE:
- call->app_user = NULL;
- removed = 0;
- spin_lock(&afscm_calls_lock);
- if (!list_empty(&call->app_link)) {
- list_del_init(&call->app_link);
- removed = 1;
+/*
+ * allow the fileserver to ask about the cache manager's capabilities
+ */
+static void SRXAFSCB_GetCapabilities(struct work_struct *work)
+{
+ struct afs_interface *ifs;
+ struct afs_call *call = container_of(work, struct afs_call, work);
+ int loop, nifs;
+
+ struct {
+ struct /* InterfaceAddr */ {
+ __be32 nifs;
+ __be32 uuid[11];
+ __be32 ifaddr[32];
+ __be32 netmask[32];
+ __be32 mtu[32];
+ } ia;
+ struct /* Capabilities */ {
+ __be32 capcount;
+ __be32 caps[1];
+ } cap;
+ } reply;
+
+ _enter("");
+
+ nifs = 0;
+ ifs = kcalloc(32, sizeof(*ifs), GFP_KERNEL);
+ if (ifs) {
+ nifs = afs_get_ipv4_interfaces(ifs, 32, false);
+ if (nifs < 0) {
+ kfree(ifs);
+ ifs = NULL;
+ nifs = 0;
}
- spin_unlock(&afscm_calls_lock);
+ }
- if (removed)
- rxrpc_put_call(call);
- break;
+ memset(&reply, 0, sizeof(reply));
+ reply.ia.nifs = htonl(nifs);
+
+ reply.ia.uuid[0] = htonl(afs_uuid.time_low);
+ reply.ia.uuid[1] = htonl(afs_uuid.time_mid);
+ reply.ia.uuid[2] = htonl(afs_uuid.time_hi_and_version);
+ reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved);
+ reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low);
+ for (loop = 0; loop < 6; loop++)
+ reply.ia.uuid[loop + 5] = htonl((s8) afs_uuid.node[loop]);
+
+ if (ifs) {
+ for (loop = 0; loop < nifs; loop++) {
+ reply.ia.ifaddr[loop] = ifs[loop].address.s_addr;
+ reply.ia.netmask[loop] = ifs[loop].netmask.s_addr;
+ reply.ia.mtu[loop] = htonl(ifs[loop].mtu);
+ }
+ }
- /* operation terminated on error */
- case RXRPC_CSTATE_ERROR:
- call->app_user = NULL;
- break;
+ reply.cap.capcount = htonl(1);
+ reply.cap.caps[0] = htonl(AFS_CAP_ERROR_TRANSLATION);
+ afs_send_simple_reply(call, &reply, sizeof(reply));
- default:
- break;
- }
+ _leave("");
+}
- if (ret < 0)
- rxrpc_call_abort(call, ret);
+/*
+ * deliver request data to a CB.GetCapabilities call
+ */
+static int afs_deliver_cb_get_capabilities(struct afs_call *call,
+ struct sk_buff *skb, bool last)
+{
+ _enter(",{%u},%d", skb->len, last);
- afs_put_server(server);
+ if (skb->len > 0)
+ return -EBADMSG;
+ if (!last)
+ return 0;
- _leave(" = %d", ret);
+ /* no unmarshalling required */
+ call->state = AFS_CALL_REPLYING;
-} /* end _SRXAFSCM_Probe() */
+ INIT_WORK(&call->work, SRXAFSCB_GetCapabilities);
+ schedule_work(&call->work);
+ return 0;
+}
diff --git a/fs/afs/cmservice.h b/fs/afs/cmservice.h
deleted file mode 100644
index af8d4d689cb2..000000000000
--- a/fs/afs/cmservice.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* cmservice.h: AFS Cache Manager Service declarations
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_CMSERVICE_H
-#define _LINUX_AFS_CMSERVICE_H
-
-#include <rxrpc/transport.h>
-#include "types.h"
-
-/* cache manager start/stop */
-extern int afscm_start(void);
-extern void afscm_stop(void);
-
-/* cache manager server functions */
-extern int SRXAFSCM_InitCallBackState(struct afs_server *server);
-extern int SRXAFSCM_CallBack(struct afs_server *server,
- size_t count,
- struct afs_callback callbacks[]);
-extern int SRXAFSCM_Probe(struct afs_server *server);
-
-#endif /* _LINUX_AFS_CMSERVICE_H */
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index b6dc2ebe47a8..dac5b990c0cd 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -15,45 +15,53 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
-#include <linux/smp_lock.h>
-#include "vnode.h"
-#include "volume.h"
-#include <rxrpc/call.h>
-#include "super.h"
+#include <linux/ctype.h>
#include "internal.h"
-static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd);
+static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd);
static int afs_dir_open(struct inode *inode, struct file *file);
-static int afs_dir_readdir(struct file *file, void *dirent, filldir_t filldir);
+static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd);
static int afs_d_delete(struct dentry *dentry);
-static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen,
+static void afs_d_release(struct dentry *dentry);
+static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
loff_t fpos, u64 ino, unsigned dtype);
+static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd);
+static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static int afs_rmdir(struct inode *dir, struct dentry *dentry);
+static int afs_unlink(struct inode *dir, struct dentry *dentry);
+static int afs_link(struct dentry *from, struct inode *dir,
+ struct dentry *dentry);
+static int afs_symlink(struct inode *dir, struct dentry *dentry,
+ const char *content);
+static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry);
const struct file_operations afs_dir_file_operations = {
.open = afs_dir_open,
- .readdir = afs_dir_readdir,
+ .release = afs_release,
+ .readdir = afs_readdir,
};
const struct inode_operations afs_dir_inode_operations = {
- .lookup = afs_dir_lookup,
+ .create = afs_create,
+ .lookup = afs_lookup,
+ .link = afs_link,
+ .unlink = afs_unlink,
+ .symlink = afs_symlink,
+ .mkdir = afs_mkdir,
+ .rmdir = afs_rmdir,
+ .rename = afs_rename,
+ .permission = afs_permission,
.getattr = afs_inode_getattr,
-#if 0 /* TODO */
- .create = afs_dir_create,
- .link = afs_dir_link,
- .unlink = afs_dir_unlink,
- .symlink = afs_dir_symlink,
- .mkdir = afs_dir_mkdir,
- .rmdir = afs_dir_rmdir,
- .mknod = afs_dir_mknod,
- .rename = afs_dir_rename,
-#endif
};
static struct dentry_operations afs_fs_dentry_operations = {
.d_revalidate = afs_d_revalidate,
.d_delete = afs_d_delete,
+ .d_release = afs_d_release,
};
#define AFS_DIR_HASHTBL_SIZE 128
@@ -105,14 +113,13 @@ struct afs_dir_page {
union afs_dir_block blocks[PAGE_SIZE / sizeof(union afs_dir_block)];
};
-struct afs_dir_lookup_cookie {
+struct afs_lookup_cookie {
struct afs_fid fid;
const char *name;
size_t nlen;
int found;
};
-/*****************************************************************************/
/*
* check that a directory page is valid
*/
@@ -128,9 +135,10 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page)
if (qty == 0)
goto error;
- if (page->index==0 && qty!=ntohs(dbuf->blocks[0].pagehdr.npages)) {
+ if (page->index == 0 && qty != ntohs(dbuf->blocks[0].pagehdr.npages)) {
printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n",
- __FUNCTION__,dir->i_ino,qty,ntohs(dbuf->blocks[0].pagehdr.npages));
+ __FUNCTION__, dir->i_ino, qty,
+ ntohs(dbuf->blocks[0].pagehdr.npages));
goto error;
}
#endif
@@ -157,13 +165,11 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page)
SetPageChecked(page);
return;
- error:
+error:
SetPageChecked(page);
SetPageError(page);
+}
-} /* end afs_dir_check_page() */
-
-/*****************************************************************************/
/*
* discard a page cached in the pagecache
*/
@@ -171,20 +177,22 @@ static inline void afs_dir_put_page(struct page *page)
{
kunmap(page);
page_cache_release(page);
+}
-} /* end afs_dir_put_page() */
-
-/*****************************************************************************/
/*
* get a page into the pagecache
*/
-static struct page *afs_dir_get_page(struct inode *dir, unsigned long index)
+static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
+ struct key *key)
{
struct page *page;
+ struct file file = {
+ .private_data = key,
+ };
_enter("{%lu},%lu", dir->i_ino, index);
- page = read_mapping_page(dir->i_mapping, index, NULL);
+ page = read_mapping_page(dir->i_mapping, index, &file);
if (!IS_ERR(page)) {
wait_on_page_locked(page);
kmap(page);
@@ -197,12 +205,12 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index)
}
return page;
- fail:
+fail:
afs_dir_put_page(page);
+ _leave(" = -EIO");
return ERR_PTR(-EIO);
-} /* end afs_dir_get_page() */
+}
-/*****************************************************************************/
/*
* open an AFS directory file
*/
@@ -213,15 +221,12 @@ static int afs_dir_open(struct inode *inode, struct file *file)
BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
- if (AFS_FS_I(inode)->flags & AFS_VNODE_DELETED)
+ if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags))
return -ENOENT;
- _leave(" = 0");
- return 0;
-
-} /* end afs_dir_open() */
+ return afs_open(inode, file);
+}
-/*****************************************************************************/
/*
* deal with one block in an AFS directory
*/
@@ -250,7 +255,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
/* skip entries marked unused in the bitmap */
if (!(block->pagehdr.bitmap[offset / 8] &
(1 << (offset % 8)))) {
- _debug("ENT[%Zu.%u]: unused\n",
+ _debug("ENT[%Zu.%u]: unused",
blkoff / sizeof(union afs_dir_block), offset);
if (offset >= curr)
*fpos = blkoff +
@@ -264,7 +269,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
sizeof(*block) -
offset * sizeof(union afs_dirent));
- _debug("ENT[%Zu.%u]: %s %Zu \"%s\"\n",
+ _debug("ENT[%Zu.%u]: %s %Zu \"%s\"",
blkoff / sizeof(union afs_dir_block), offset,
(offset < curr ? "skip" : "fill"),
nlen, dire->u.name);
@@ -274,7 +279,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
if (next >= AFS_DIRENT_PER_BLOCK) {
_debug("ENT[%Zu.%u]:"
" %u travelled beyond end dir block"
- " (len %u/%Zu)\n",
+ " (len %u/%Zu)",
blkoff / sizeof(union afs_dir_block),
offset, next, tmp, nlen);
return -EIO;
@@ -282,13 +287,13 @@ static int afs_dir_iterate_block(unsigned *fpos,
if (!(block->pagehdr.bitmap[next / 8] &
(1 << (next % 8)))) {
_debug("ENT[%Zu.%u]:"
- " %u unmarked extension (len %u/%Zu)\n",
+ " %u unmarked extension (len %u/%Zu)",
blkoff / sizeof(union afs_dir_block),
offset, next, tmp, nlen);
return -EIO;
}
- _debug("ENT[%Zu.%u]: ext %u/%Zu\n",
+ _debug("ENT[%Zu.%u]: ext %u/%Zu",
blkoff / sizeof(union afs_dir_block),
next, tmp, nlen);
next++;
@@ -304,7 +309,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
nlen,
blkoff + offset * sizeof(union afs_dirent),
ntohl(dire->u.vnode),
- filldir == afs_dir_lookup_filldir ?
+ filldir == afs_lookup_filldir ?
ntohl(dire->u.unique) : DT_UNKNOWN);
if (ret < 0) {
_leave(" = 0 [full]");
@@ -316,16 +321,15 @@ static int afs_dir_iterate_block(unsigned *fpos,
_leave(" = 1 [more]");
return 1;
-} /* end afs_dir_iterate_block() */
+}
-/*****************************************************************************/
/*
- * read an AFS directory
+ * iterate through the data blob that lists the contents of an AFS directory
*/
static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
- filldir_t filldir)
+ filldir_t filldir, struct key *key)
{
- union afs_dir_block *dblock;
+ union afs_dir_block *dblock;
struct afs_dir_page *dbuf;
struct page *page;
unsigned blkoff, limit;
@@ -333,7 +337,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
_enter("{%lu},%u,,", dir->i_ino, *fpos);
- if (AFS_FS_I(dir)->flags & AFS_VNODE_DELETED) {
+ if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) {
_leave(" = -ESTALE");
return -ESTALE;
}
@@ -348,7 +352,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1);
/* fetch the appropriate page from the directory */
- page = afs_dir_get_page(dir, blkoff / PAGE_SIZE);
+ page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
break;
@@ -377,43 +381,50 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
ret = 0;
}
- out:
+out:
_leave(" = %d", ret);
return ret;
-} /* end afs_dir_iterate() */
+}
-/*****************************************************************************/
/*
* read an AFS directory
*/
-static int afs_dir_readdir(struct file *file, void *cookie, filldir_t filldir)
+static int afs_readdir(struct file *file, void *cookie, filldir_t filldir)
{
unsigned fpos;
int ret;
- _enter("{%Ld,{%lu}}", file->f_pos, file->f_path.dentry->d_inode->i_ino);
+ _enter("{%Ld,{%lu}}",
+ file->f_pos, file->f_path.dentry->d_inode->i_ino);
+
+ ASSERT(file->private_data != NULL);
fpos = file->f_pos;
- ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos, cookie, filldir);
+ ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos,
+ cookie, filldir, file->private_data);
file->f_pos = fpos;
_leave(" = %d", ret);
return ret;
-} /* end afs_dir_readdir() */
+}
-/*****************************************************************************/
/*
* search the directory for a name
* - if afs_dir_iterate_block() spots this function, it'll pass the FID
* uniquifier through dtype
*/
-static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen,
- loff_t fpos, u64 ino, unsigned dtype)
+static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
+ loff_t fpos, u64 ino, unsigned dtype)
{
- struct afs_dir_lookup_cookie *cookie = _cookie;
+ struct afs_lookup_cookie *cookie = _cookie;
- _enter("{%s,%Zu},%s,%u,,%lu,%u",
- cookie->name, cookie->nlen, name, nlen, ino, dtype);
+ _enter("{%s,%Zu},%s,%u,,%llu,%u",
+ cookie->name, cookie->nlen, name, nlen,
+ (unsigned long long) ino, dtype);
+
+ /* insanity checks first */
+ BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
+ BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) {
_leave(" = 0 [no]");
@@ -426,216 +437,254 @@ static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen,
_leave(" = -1 [found]");
return -1;
-} /* end afs_dir_lookup_filldir() */
+}
-/*****************************************************************************/
/*
- * look up an entry in a directory
+ * do a lookup in a directory
+ * - just returns the FID the dentry name maps to if found
*/
-static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
+static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
+ struct afs_fid *fid, struct key *key)
{
- struct afs_dir_lookup_cookie cookie;
+ struct afs_lookup_cookie cookie;
struct afs_super_info *as;
+ unsigned fpos;
+ int ret;
+
+ _enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name);
+
+ as = dir->i_sb->s_fs_info;
+
+ /* search the directory */
+ cookie.name = dentry->d_name.name;
+ cookie.nlen = dentry->d_name.len;
+ cookie.fid.vid = as->volume->vid;
+ cookie.found = 0;
+
+ fpos = 0;
+ ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir,
+ key);
+ if (ret < 0) {
+ _leave(" = %d [iter]", ret);
+ return ret;
+ }
+
+ ret = -ENOENT;
+ if (!cookie.found) {
+ _leave(" = -ENOENT [not found]");
+ return -ENOENT;
+ }
+
+ *fid = cookie.fid;
+ _leave(" = 0 { vn=%u u=%u }", fid->vnode, fid->unique);
+ return 0;
+}
+
+/*
+ * look up an entry in a directory
+ */
+static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
struct afs_vnode *vnode;
+ struct afs_fid fid;
struct inode *inode;
- unsigned fpos;
+ struct key *key;
int ret;
- _enter("{%lu},%p{%s}", dir->i_ino, dentry, dentry->d_name.name);
+ vnode = AFS_FS_I(dir);
- /* insanity checks first */
- BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
- BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
+ _enter("{%x:%d},%p{%s},",
+ vnode->fid.vid, vnode->fid.vnode, dentry, dentry->d_name.name);
+
+ ASSERTCMP(dentry->d_inode, ==, NULL);
if (dentry->d_name.len > 255) {
_leave(" = -ENAMETOOLONG");
return ERR_PTR(-ENAMETOOLONG);
}
- vnode = AFS_FS_I(dir);
- if (vnode->flags & AFS_VNODE_DELETED) {
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
_leave(" = -ESTALE");
return ERR_PTR(-ESTALE);
}
- as = dir->i_sb->s_fs_info;
-
- /* search the directory */
- cookie.name = dentry->d_name.name;
- cookie.nlen = dentry->d_name.len;
- cookie.fid.vid = as->volume->vid;
- cookie.found = 0;
+ key = afs_request_key(vnode->volume->cell);
+ if (IS_ERR(key)) {
+ _leave(" = %ld [key]", PTR_ERR(key));
+ return ERR_PTR(PTR_ERR(key));
+ }
- fpos = 0;
- ret = afs_dir_iterate(dir, &fpos, &cookie, afs_dir_lookup_filldir);
+ ret = afs_validate(vnode, key);
if (ret < 0) {
- _leave(" = %d", ret);
+ key_put(key);
+ _leave(" = %d [val]", ret);
return ERR_PTR(ret);
}
- ret = -ENOENT;
- if (!cookie.found) {
- _leave(" = %d", ret);
+ ret = afs_do_lookup(dir, dentry, &fid, key);
+ if (ret < 0) {
+ key_put(key);
+ if (ret == -ENOENT) {
+ d_add(dentry, NULL);
+ _leave(" = NULL [negative]");
+ return NULL;
+ }
+ _leave(" = %d [do]", ret);
return ERR_PTR(ret);
}
+ dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version;
/* instantiate the dentry */
- ret = afs_iget(dir->i_sb, &cookie.fid, &inode);
- if (ret < 0) {
- _leave(" = %d", ret);
- return ERR_PTR(ret);
+ inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL);
+ key_put(key);
+ if (IS_ERR(inode)) {
+ _leave(" = %ld", PTR_ERR(inode));
+ return ERR_PTR(PTR_ERR(inode));
}
dentry->d_op = &afs_fs_dentry_operations;
- dentry->d_fsdata = (void *) (unsigned long) vnode->status.version;
d_add(dentry, inode);
_leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%lu }",
- cookie.fid.vnode,
- cookie.fid.unique,
+ fid.vnode,
+ fid.unique,
dentry->d_inode->i_ino,
dentry->d_inode->i_version);
return NULL;
-} /* end afs_dir_lookup() */
+}
-/*****************************************************************************/
/*
* check that a dentry lookup hit has found a valid entry
* - NOTE! the hit can be a negative hit too, so we can't assume we have an
* inode
- * (derived from nfs_lookup_revalidate)
*/
static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
- struct afs_dir_lookup_cookie cookie;
+ struct afs_vnode *vnode, *dir;
+ struct afs_fid fid;
struct dentry *parent;
- struct inode *inode, *dir;
- unsigned fpos;
+ struct key *key;
+ void *dir_version;
int ret;
- _enter("{sb=%p n=%s},", dentry->d_sb, dentry->d_name.name);
+ vnode = AFS_FS_I(dentry->d_inode);
- /* lock down the parent dentry so we can peer at it */
- parent = dget_parent(dentry->d_parent);
+ if (dentry->d_inode)
+ _enter("{v={%x:%u} n=%s fl=%lx},",
+ vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name,
+ vnode->flags);
+ else
+ _enter("{neg n=%s}", dentry->d_name.name);
- dir = parent->d_inode;
- inode = dentry->d_inode;
+ key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell);
+ if (IS_ERR(key))
+ key = NULL;
- /* handle a negative dentry */
- if (!inode)
+ /* lock down the parent dentry so we can peer at it */
+ parent = dget_parent(dentry);
+ if (!parent->d_inode)
goto out_bad;
- /* handle a bad inode */
- if (is_bad_inode(inode)) {
- printk("kAFS: afs_d_revalidate: %s/%s has bad inode\n",
- dentry->d_parent->d_name.name, dentry->d_name.name);
- goto out_bad;
- }
+ dir = AFS_FS_I(parent->d_inode);
- /* force a full look up if the parent directory changed since last the
- * server was consulted
- * - otherwise this inode must still exist, even if the inode details
- * themselves have changed
- */
- if (AFS_FS_I(dir)->flags & AFS_VNODE_CHANGED)
- afs_vnode_fetch_status(AFS_FS_I(dir));
+ /* validate the parent directory */
+ if (test_bit(AFS_VNODE_MODIFIED, &dir->flags))
+ afs_validate(dir, key);
- if (AFS_FS_I(dir)->flags & AFS_VNODE_DELETED) {
+ if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
_debug("%s: parent dir deleted", dentry->d_name.name);
goto out_bad;
}
- if (AFS_FS_I(inode)->flags & AFS_VNODE_DELETED) {
- _debug("%s: file already deleted", dentry->d_name.name);
- goto out_bad;
- }
-
- if ((unsigned long) dentry->d_fsdata !=
- (unsigned long) AFS_FS_I(dir)->status.version) {
- _debug("%s: parent changed %lu -> %u",
- dentry->d_name.name,
- (unsigned long) dentry->d_fsdata,
- (unsigned) AFS_FS_I(dir)->status.version);
+ dir_version = (void *) (unsigned long) dir->status.data_version;
+ if (dentry->d_fsdata == dir_version)
+ goto out_valid; /* the dir contents are unchanged */
- /* search the directory for this vnode */
- cookie.name = dentry->d_name.name;
- cookie.nlen = dentry->d_name.len;
- cookie.fid.vid = AFS_FS_I(inode)->volume->vid;
- cookie.found = 0;
+ _debug("dir modified");
- fpos = 0;
- ret = afs_dir_iterate(dir, &fpos, &cookie,
- afs_dir_lookup_filldir);
- if (ret < 0) {
- _debug("failed to iterate dir %s: %d",
- parent->d_name.name, ret);
+ /* search the directory for this vnode */
+ ret = afs_do_lookup(&dir->vfs_inode, dentry, &fid, key);
+ switch (ret) {
+ case 0:
+ /* the filename maps to something */
+ if (!dentry->d_inode)
+ goto out_bad;
+ if (is_bad_inode(dentry->d_inode)) {
+ printk("kAFS: afs_d_revalidate: %s/%s has bad inode\n",
+ parent->d_name.name, dentry->d_name.name);
goto out_bad;
- }
-
- if (!cookie.found) {
- _debug("%s: dirent not found", dentry->d_name.name);
- goto not_found;
}
/* if the vnode ID has changed, then the dirent points to a
* different file */
- if (cookie.fid.vnode != AFS_FS_I(inode)->fid.vnode) {
- _debug("%s: dirent changed", dentry->d_name.name);
+ if (fid.vnode != vnode->fid.vnode) {
+ _debug("%s: dirent changed [%u != %u]",
+ dentry->d_name.name, fid.vnode,
+ vnode->fid.vnode);
goto not_found;
}
/* if the vnode ID uniqifier has changed, then the file has
- * been deleted */
- if (cookie.fid.unique != AFS_FS_I(inode)->fid.unique) {
+ * been deleted and replaced, and the original vnode ID has
+ * been reused */
+ if (fid.unique != vnode->fid.unique) {
_debug("%s: file deleted (uq %u -> %u I:%lu)",
- dentry->d_name.name,
- cookie.fid.unique,
- AFS_FS_I(inode)->fid.unique,
- inode->i_version);
- spin_lock(&AFS_FS_I(inode)->lock);
- AFS_FS_I(inode)->flags |= AFS_VNODE_DELETED;
- spin_unlock(&AFS_FS_I(inode)->lock);
- invalidate_remote_inode(inode);
- goto out_bad;
+ dentry->d_name.name, fid.unique,
+ vnode->fid.unique, dentry->d_inode->i_version);
+ spin_lock(&vnode->lock);
+ set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ spin_unlock(&vnode->lock);
+ goto not_found;
}
+ goto out_valid;
+
+ case -ENOENT:
+ /* the filename is unknown */
+ _debug("%s: dirent not found", dentry->d_name.name);
+ if (dentry->d_inode)
+ goto not_found;
+ goto out_valid;
- dentry->d_fsdata =
- (void *) (unsigned long) AFS_FS_I(dir)->status.version;
+ default:
+ _debug("failed to iterate dir %s: %d",
+ parent->d_name.name, ret);
+ goto out_bad;
}
- out_valid:
+out_valid:
+ dentry->d_fsdata = dir_version;
+out_skip:
dput(parent);
+ key_put(key);
_leave(" = 1 [valid]");
return 1;
/* the dirent, if it exists, now points to a different vnode */
- not_found:
+not_found:
spin_lock(&dentry->d_lock);
dentry->d_flags |= DCACHE_NFSFS_RENAMED;
spin_unlock(&dentry->d_lock);
- out_bad:
- if (inode) {
+out_bad:
+ if (dentry->d_inode) {
/* don't unhash if we have submounts */
if (have_submounts(dentry))
- goto out_valid;
+ goto out_skip;
}
- shrink_dcache_parent(dentry);
-
_debug("dropping dentry %s/%s",
- dentry->d_parent->d_name.name, dentry->d_name.name);
+ parent->d_name.name, dentry->d_name.name);
+ shrink_dcache_parent(dentry);
d_drop(dentry);
-
dput(parent);
+ key_put(key);
_leave(" = 0 [bad]");
return 0;
-} /* end afs_d_revalidate() */
+}
-/*****************************************************************************/
/*
* allow the VFS to enquire as to whether a dentry should be unhashed (mustn't
* sleep)
@@ -649,15 +698,444 @@ static int afs_d_delete(struct dentry *dentry)
if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
goto zap;
- if (dentry->d_inode) {
- if (AFS_FS_I(dentry->d_inode)->flags & AFS_VNODE_DELETED)
+ if (dentry->d_inode &&
+ test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags))
goto zap;
- }
_leave(" = 0 [keep]");
return 0;
- zap:
+zap:
_leave(" = 1 [zap]");
return 1;
-} /* end afs_d_delete() */
+}
+
+/*
+ * handle dentry release
+ */
+static void afs_d_release(struct dentry *dentry)
+{
+ _enter("%s", dentry->d_name.name);
+}
+
+/*
+ * create a directory on an AFS filesystem
+ */
+static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ struct afs_file_status status;
+ struct afs_callback cb;
+ struct afs_server *server;
+ struct afs_vnode *dvnode, *vnode;
+ struct afs_fid fid;
+ struct inode *inode;
+ struct key *key;
+ int ret;
+
+ dvnode = AFS_FS_I(dir);
+
+ _enter("{%x:%d},{%s},%o",
+ dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
+
+ ret = -ENAMETOOLONG;
+ if (dentry->d_name.len > 255)
+ goto error;
+
+ key = afs_request_key(dvnode->volume->cell);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto error;
+ }
+
+ mode |= S_IFDIR;
+ ret = afs_vnode_create(dvnode, key, dentry->d_name.name,
+ mode, &fid, &status, &cb, &server);
+ if (ret < 0)
+ goto mkdir_error;
+
+ inode = afs_iget(dir->i_sb, key, &fid, &status, &cb);
+ if (IS_ERR(inode)) {
+ /* ENOMEM at a really inconvenient time - just abandon the new
+ * directory on the server */
+ ret = PTR_ERR(inode);
+ goto iget_error;
+ }
+
+ /* apply the status report we've got for the new vnode */
+ vnode = AFS_FS_I(inode);
+ spin_lock(&vnode->lock);
+ vnode->update_cnt++;
+ spin_unlock(&vnode->lock);
+ afs_vnode_finalise_status_update(vnode, server);
+ afs_put_server(server);
+
+ d_instantiate(dentry, inode);
+ if (d_unhashed(dentry)) {
+ _debug("not hashed");
+ d_rehash(dentry);
+ }
+ key_put(key);
+ _leave(" = 0");
+ return 0;
+
+iget_error:
+ afs_put_server(server);
+mkdir_error:
+ key_put(key);
+error:
+ d_drop(dentry);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * remove a directory from an AFS filesystem
+ */
+static int afs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct afs_vnode *dvnode, *vnode;
+ struct key *key;
+ int ret;
+
+ dvnode = AFS_FS_I(dir);
+
+ _enter("{%x:%d},{%s}",
+ dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
+
+ ret = -ENAMETOOLONG;
+ if (dentry->d_name.len > 255)
+ goto error;
+
+ key = afs_request_key(dvnode->volume->cell);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto error;
+ }
+
+ ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, true);
+ if (ret < 0)
+ goto rmdir_error;
+
+ if (dentry->d_inode) {
+ vnode = AFS_FS_I(dentry->d_inode);
+ clear_nlink(&vnode->vfs_inode);
+ set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ afs_discard_callback_on_delete(vnode);
+ }
+
+ key_put(key);
+ _leave(" = 0");
+ return 0;
+
+rmdir_error:
+ key_put(key);
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * remove a file from an AFS filesystem
+ */
+static int afs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct afs_vnode *dvnode, *vnode;
+ struct key *key;
+ int ret;
+
+ dvnode = AFS_FS_I(dir);
+
+ _enter("{%x:%d},{%s}",
+ dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
+
+ ret = -ENAMETOOLONG;
+ if (dentry->d_name.len > 255)
+ goto error;
+
+ key = afs_request_key(dvnode->volume->cell);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto error;
+ }
+
+ if (dentry->d_inode) {
+ vnode = AFS_FS_I(dentry->d_inode);
+
+ /* make sure we have a callback promise on the victim */
+ ret = afs_validate(vnode, key);
+ if (ret < 0)
+ goto error;
+ }
+
+ ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, false);
+ if (ret < 0)
+ goto remove_error;
+
+ if (dentry->d_inode) {
+ /* if the file wasn't deleted due to excess hard links, the
+ * fileserver will break the callback promise on the file - if
+ * it had one - before it returns to us, and if it was deleted,
+ * it won't
+ *
+ * however, if we didn't have a callback promise outstanding,
+ * or it was outstanding on a different server, then it won't
+ * break it either...
+ */
+ vnode = AFS_FS_I(dentry->d_inode);
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+ _debug("AFS_VNODE_DELETED");
+ if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
+ _debug("AFS_VNODE_CB_BROKEN");
+ set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+ ret = afs_validate(vnode, key);
+ _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret);
+ }
+
+ key_put(key);
+ _leave(" = 0");
+ return 0;
+
+remove_error:
+ key_put(key);
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * create a regular file on an AFS filesystem
+ */
+static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd)
+{
+ struct afs_file_status status;
+ struct afs_callback cb;
+ struct afs_server *server;
+ struct afs_vnode *dvnode, *vnode;
+ struct afs_fid fid;
+ struct inode *inode;
+ struct key *key;
+ int ret;
+
+ dvnode = AFS_FS_I(dir);
+
+ _enter("{%x:%d},{%s},%o,",
+ dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
+
+ ret = -ENAMETOOLONG;
+ if (dentry->d_name.len > 255)
+ goto error;
+
+ key = afs_request_key(dvnode->volume->cell);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto error;
+ }
+
+ mode |= S_IFREG;
+ ret = afs_vnode_create(dvnode, key, dentry->d_name.name,
+ mode, &fid, &status, &cb, &server);
+ if (ret < 0)
+ goto create_error;
+
+ inode = afs_iget(dir->i_sb, key, &fid, &status, &cb);
+ if (IS_ERR(inode)) {
+ /* ENOMEM at a really inconvenient time - just abandon the new
+ * directory on the server */
+ ret = PTR_ERR(inode);
+ goto iget_error;
+ }
+
+ /* apply the status report we've got for the new vnode */
+ vnode = AFS_FS_I(inode);
+ spin_lock(&vnode->lock);
+ vnode->update_cnt++;
+ spin_unlock(&vnode->lock);
+ afs_vnode_finalise_status_update(vnode, server);
+ afs_put_server(server);
+
+ d_instantiate(dentry, inode);
+ if (d_unhashed(dentry)) {
+ _debug("not hashed");
+ d_rehash(dentry);
+ }
+ key_put(key);
+ _leave(" = 0");
+ return 0;
+
+iget_error:
+ afs_put_server(server);
+create_error:
+ key_put(key);
+error:
+ d_drop(dentry);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * create a hard link between files in an AFS filesystem
+ */
+static int afs_link(struct dentry *from, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct afs_vnode *dvnode, *vnode;
+ struct key *key;
+ int ret;
+
+ vnode = AFS_FS_I(from->d_inode);
+ dvnode = AFS_FS_I(dir);
+
+ _enter("{%x:%d},{%x:%d},{%s}",
+ vnode->fid.vid, vnode->fid.vnode,
+ dvnode->fid.vid, dvnode->fid.vnode,
+ dentry->d_name.name);
+
+ ret = -ENAMETOOLONG;
+ if (dentry->d_name.len > 255)
+ goto error;
+
+ key = afs_request_key(dvnode->volume->cell);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto error;
+ }
+
+ ret = afs_vnode_link(dvnode, vnode, key, dentry->d_name.name);
+ if (ret < 0)
+ goto link_error;
+
+ atomic_inc(&vnode->vfs_inode.i_count);
+ d_instantiate(dentry, &vnode->vfs_inode);
+ key_put(key);
+ _leave(" = 0");
+ return 0;
+
+link_error:
+ key_put(key);
+error:
+ d_drop(dentry);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * create a symlink in an AFS filesystem
+ */
+static int afs_symlink(struct inode *dir, struct dentry *dentry,
+ const char *content)
+{
+ struct afs_file_status status;
+ struct afs_server *server;
+ struct afs_vnode *dvnode, *vnode;
+ struct afs_fid fid;
+ struct inode *inode;
+ struct key *key;
+ int ret;
+
+ dvnode = AFS_FS_I(dir);
+
+ _enter("{%x:%d},{%s},%s",
+ dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name,
+ content);
+
+ ret = -ENAMETOOLONG;
+ if (dentry->d_name.len > 255)
+ goto error;
+
+ ret = -EINVAL;
+ if (strlen(content) > 1023)
+ goto error;
+
+ key = afs_request_key(dvnode->volume->cell);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto error;
+ }
+
+ ret = afs_vnode_symlink(dvnode, key, dentry->d_name.name, content,
+ &fid, &status, &server);
+ if (ret < 0)
+ goto create_error;
+
+ inode = afs_iget(dir->i_sb, key, &fid, &status, NULL);
+ if (IS_ERR(inode)) {
+ /* ENOMEM at a really inconvenient time - just abandon the new
+ * directory on the server */
+ ret = PTR_ERR(inode);
+ goto iget_error;
+ }
+
+ /* apply the status report we've got for the new vnode */
+ vnode = AFS_FS_I(inode);
+ spin_lock(&vnode->lock);
+ vnode->update_cnt++;
+ spin_unlock(&vnode->lock);
+ afs_vnode_finalise_status_update(vnode, server);
+ afs_put_server(server);
+
+ d_instantiate(dentry, inode);
+ if (d_unhashed(dentry)) {
+ _debug("not hashed");
+ d_rehash(dentry);
+ }
+ key_put(key);
+ _leave(" = 0");
+ return 0;
+
+iget_error:
+ afs_put_server(server);
+create_error:
+ key_put(key);
+error:
+ d_drop(dentry);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * rename a file in an AFS filesystem and/or move it between directories
+ */
+static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
+ struct key *key;
+ int ret;
+
+ vnode = AFS_FS_I(old_dentry->d_inode);
+ orig_dvnode = AFS_FS_I(old_dir);
+ new_dvnode = AFS_FS_I(new_dir);
+
+ _enter("{%x:%d},{%x:%d},{%x:%d},{%s}",
+ orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
+ vnode->fid.vid, vnode->fid.vnode,
+ new_dvnode->fid.vid, new_dvnode->fid.vnode,
+ new_dentry->d_name.name);
+
+ ret = -ENAMETOOLONG;
+ if (new_dentry->d_name.len > 255)
+ goto error;
+
+ key = afs_request_key(orig_dvnode->volume->cell);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto error;
+ }
+
+ ret = afs_vnode_rename(orig_dvnode, new_dvnode, key,
+ old_dentry->d_name.name,
+ new_dentry->d_name.name);
+ if (ret < 0)
+ goto rename_error;
+ key_put(key);
+ _leave(" = 0");
+ return 0;
+
+rename_error:
+ key_put(key);
+error:
+ d_drop(new_dentry);
+ _leave(" = %d", ret);
+ return ret;
+}
diff --git a/fs/afs/errors.h b/fs/afs/errors.h
deleted file mode 100644
index 574d94ac8d05..000000000000
--- a/fs/afs/errors.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* errors.h: AFS abort/error codes
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_ERRORS_H
-#define _LINUX_AFS_ERRORS_H
-
-#include "types.h"
-
-/* file server abort codes */
-typedef enum {
- VSALVAGE = 101, /* volume needs salvaging */
- VNOVNODE = 102, /* no such file/dir (vnode) */
- VNOVOL = 103, /* no such volume or volume unavailable */
- VVOLEXISTS = 104, /* volume name already exists */
- VNOSERVICE = 105, /* volume not currently in service */
- VOFFLINE = 106, /* volume is currently offline (more info available [VVL-spec]) */
- VONLINE = 107, /* volume is already online */
- VDISKFULL = 108, /* disk partition is full */
- VOVERQUOTA = 109, /* volume's maximum quota exceeded */
- VBUSY = 110, /* volume is temporarily unavailable */
- VMOVED = 111, /* volume moved to new server - ask this FS where */
-} afs_rxfs_abort_t;
-
-extern int afs_abort_to_error(int abortcode);
-
-#endif /* _LINUX_AFS_ERRORS_H */
diff --git a/fs/afs/file.c b/fs/afs/file.c
index b17634541f67..ae256498f4f7 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -1,6 +1,6 @@
-/* file.c: AFS filesystem file handling
+/* AFS filesystem file handling
*
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
@@ -15,22 +15,25 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
-#include "volume.h"
-#include "vnode.h"
-#include <rxrpc/call.h>
#include "internal.h"
-#if 0
-static int afs_file_open(struct inode *inode, struct file *file);
-static int afs_file_release(struct inode *inode, struct file *file);
-#endif
-
static int afs_file_readpage(struct file *file, struct page *page);
static void afs_file_invalidatepage(struct page *page, unsigned long offset);
static int afs_file_releasepage(struct page *page, gfp_t gfp_flags);
+const struct file_operations afs_file_operations = {
+ .open = afs_open,
+ .release = afs_release,
+ .llseek = generic_file_llseek,
+ .read = do_sync_read,
+ .aio_read = generic_file_aio_read,
+ .mmap = generic_file_readonly_mmap,
+ .sendfile = generic_file_sendfile,
+};
+
const struct inode_operations afs_file_inode_operations = {
.getattr = afs_inode_getattr,
+ .permission = afs_permission,
};
const struct address_space_operations afs_fs_aops = {
@@ -40,7 +43,48 @@ const struct address_space_operations afs_fs_aops = {
.invalidatepage = afs_file_invalidatepage,
};
-/*****************************************************************************/
+/*
+ * open an AFS file or directory and attach a key to it
+ */
+int afs_open(struct inode *inode, struct file *file)
+{
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+ struct key *key;
+ int ret;
+
+ _enter("{%x:%x},", vnode->fid.vid, vnode->fid.vnode);
+
+ key = afs_request_key(vnode->volume->cell);
+ if (IS_ERR(key)) {
+ _leave(" = %ld [key]", PTR_ERR(key));
+ return PTR_ERR(key);
+ }
+
+ ret = afs_validate(vnode, key);
+ if (ret < 0) {
+ _leave(" = %d [val]", ret);
+ return ret;
+ }
+
+ file->private_data = key;
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * release an AFS file or directory and discard its key
+ */
+int afs_release(struct inode *inode, struct file *file)
+{
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+
+ _enter("{%x:%x},", vnode->fid.vid, vnode->fid.vnode);
+
+ key_put(file->private_data);
+ _leave(" = 0");
+ return 0;
+}
+
/*
* deal with notification that a page was read from the cache
*/
@@ -58,10 +102,9 @@ static void afs_file_readpage_read_complete(void *cookie_data,
SetPageUptodate(page);
unlock_page(page);
-} /* end afs_file_readpage_read_complete() */
+}
#endif
-/*****************************************************************************/
/*
* deal with notification that a page was written to the cache
*/
@@ -74,41 +117,38 @@ static void afs_file_readpage_write_complete(void *cookie_data,
_enter("%p,%p,%p,%d", cookie_data, page, data, error);
unlock_page(page);
-
-} /* end afs_file_readpage_write_complete() */
+}
#endif
-/*****************************************************************************/
/*
* AFS read page from file (or symlink)
*/
static int afs_file_readpage(struct file *file, struct page *page)
{
- struct afs_rxfs_fetch_descriptor desc;
-#ifdef AFS_CACHING_SUPPORT
- struct cachefs_page *pageio;
-#endif
struct afs_vnode *vnode;
struct inode *inode;
+ struct key *key;
+ size_t len;
+ off_t offset;
int ret;
inode = page->mapping->host;
- _enter("{%lu},{%lu}", inode->i_ino, page->index);
+ ASSERT(file != NULL);
+ key = file->private_data;
+ ASSERT(key != NULL);
+
+ _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
vnode = AFS_FS_I(inode);
BUG_ON(!PageLocked(page));
ret = -ESTALE;
- if (vnode->flags & AFS_VNODE_DELETED)
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
goto error;
#ifdef AFS_CACHING_SUPPORT
- ret = cachefs_page_get_private(page, &pageio, GFP_NOIO);
- if (ret < 0)
- goto error;
-
/* is it cached? */
ret = cachefs_read_or_alloc_page(vnode->cache,
page,
@@ -132,26 +172,19 @@ static int afs_file_readpage(struct file *file, struct page *page)
case -ENOBUFS:
case -ENODATA:
default:
- desc.fid = vnode->fid;
- desc.offset = page->index << PAGE_CACHE_SHIFT;
- desc.size = min((size_t) (inode->i_size - desc.offset),
- (size_t) PAGE_SIZE);
- desc.buffer = kmap(page);
-
- clear_page(desc.buffer);
+ offset = page->index << PAGE_CACHE_SHIFT;
+ len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
/* read the contents of the file from the server into the
* page */
- ret = afs_vnode_fetch_data(vnode, &desc);
- kunmap(page);
+ ret = afs_vnode_fetch_data(vnode, key, offset, len, page);
if (ret < 0) {
- if (ret==-ENOENT) {
+ if (ret == -ENOENT) {
_debug("got NOENT from server"
" - marking file deleted and stale");
- vnode->flags |= AFS_VNODE_DELETED;
+ set_bit(AFS_VNODE_DELETED, &vnode->flags);
ret = -ESTALE;
}
-
#ifdef AFS_CACHING_SUPPORT
cachefs_uncache_page(vnode->cache, page);
#endif
@@ -178,16 +211,13 @@ static int afs_file_readpage(struct file *file, struct page *page)
_leave(" = 0");
return 0;
- error:
+error:
SetPageError(page);
unlock_page(page);
-
_leave(" = %d", ret);
return ret;
+}
-} /* end afs_file_readpage() */
-
-/*****************************************************************************/
/*
* get a page cookie for the specified page
*/
@@ -202,10 +232,9 @@ int afs_cache_get_page_cookie(struct page *page,
_leave(" = %d", ret);
return ret;
-} /* end afs_cache_get_page_cookie() */
+}
#endif
-/*****************************************************************************/
/*
* invalidate part or all of a page
*/
@@ -240,9 +269,8 @@ static void afs_file_invalidatepage(struct page *page, unsigned long offset)
}
_leave(" = %d", ret);
-} /* end afs_file_invalidatepage() */
+}
-/*****************************************************************************/
/*
* release a page and cleanup its private data
*/
@@ -267,4 +295,4 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
_leave(" = 0");
return 0;
-} /* end afs_file_releasepage() */
+}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 61bc371532ab..2393d2a08d79 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -1,6 +1,6 @@
-/* fsclient.c: AFS File Server client stubs
+/* AFS File Server client stubs
*
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
@@ -11,827 +11,927 @@
#include <linux/init.h>
#include <linux/sched.h>
-#include <rxrpc/rxrpc.h>
-#include <rxrpc/transport.h>
-#include <rxrpc/connection.h>
-#include <rxrpc/call.h>
-#include "fsclient.h"
-#include "cmservice.h"
-#include "vnode.h"
-#include "server.h"
-#include "errors.h"
+#include <linux/circ_buf.h>
#include "internal.h"
+#include "afs_fs.h"
-#define FSFETCHSTATUS 132 /* AFS Fetch file status */
-#define FSFETCHDATA 130 /* AFS Fetch file data */
-#define FSGIVEUPCALLBACKS 147 /* AFS Discard callback promises */
-#define FSGETVOLUMEINFO 148 /* AFS Get root volume information */
-#define FSGETROOTVOLUME 151 /* AFS Get root volume name */
-#define FSLOOKUP 161 /* AFS lookup file in directory */
-
-/*****************************************************************************/
/*
- * map afs abort codes to/from Linux error codes
- * - called with call->lock held
+ * decode an AFSFid block
*/
-static void afs_rxfs_aemap(struct rxrpc_call *call)
+static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid)
{
- switch (call->app_err_state) {
- case RXRPC_ESTATE_LOCAL_ABORT:
- call->app_abort_code = -call->app_errno;
- break;
- case RXRPC_ESTATE_PEER_ABORT:
- call->app_errno = afs_abort_to_error(call->app_abort_code);
- break;
- default:
- break;
- }
-} /* end afs_rxfs_aemap() */
+ const __be32 *bp = *_bp;
+
+ fid->vid = ntohl(*bp++);
+ fid->vnode = ntohl(*bp++);
+ fid->unique = ntohl(*bp++);
+ *_bp = bp;
+}
-/*****************************************************************************/
/*
- * get the root volume name from a fileserver
- * - this operation doesn't seem to work correctly in OpenAFS server 1.2.2
+ * decode an AFSFetchStatus block
*/
-#if 0
-int afs_rxfs_get_root_volume(struct afs_server *server,
- char *buf, size_t *buflen)
+static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
+ struct afs_file_status *status,
+ struct afs_vnode *vnode)
{
- struct rxrpc_connection *conn;
- struct rxrpc_call *call;
- struct kvec piov[2];
- size_t sent;
- int ret;
- u32 param[1];
+ const __be32 *bp = *_bp;
+ umode_t mode;
+ u64 data_version, size;
+ u32 changed = 0; /* becomes non-zero if ctime-type changes seen */
+
+#define EXTRACT(DST) \
+ do { \
+ u32 x = ntohl(*bp++); \
+ changed |= DST - x; \
+ DST = x; \
+ } while (0)
+
+ status->if_version = ntohl(*bp++);
+ EXTRACT(status->type);
+ EXTRACT(status->nlink);
+ size = ntohl(*bp++);
+ data_version = ntohl(*bp++);
+ EXTRACT(status->author);
+ EXTRACT(status->owner);
+ EXTRACT(status->caller_access); /* call ticket dependent */
+ EXTRACT(status->anon_access);
+ EXTRACT(status->mode);
+ EXTRACT(status->parent.vnode);
+ EXTRACT(status->parent.unique);
+ bp++; /* seg size */
+ status->mtime_client = ntohl(*bp++);
+ status->mtime_server = ntohl(*bp++);
+ EXTRACT(status->group);
+ bp++; /* sync counter */
+ data_version |= (u64) ntohl(*bp++) << 32;
+ bp++; /* lock count */
+ size |= (u64) ntohl(*bp++) << 32;
+ bp++; /* spare 4 */
+ *_bp = bp;
+
+ if (size != status->size) {
+ status->size = size;
+ changed |= true;
+ }
+ status->mode &= S_IALLUGO;
+
+ _debug("vnode time %lx, %lx",
+ status->mtime_client, status->mtime_server);
+
+ if (vnode) {
+ status->parent.vid = vnode->fid.vid;
+ if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
+ _debug("vnode changed");
+ i_size_write(&vnode->vfs_inode, size);
+ vnode->vfs_inode.i_uid = status->owner;
+ vnode->vfs_inode.i_gid = status->group;
+ vnode->vfs_inode.i_version = vnode->fid.unique;
+ vnode->vfs_inode.i_nlink = status->nlink;
+
+ mode = vnode->vfs_inode.i_mode;
+ mode &= ~S_IALLUGO;
+ mode |= status->mode;
+ barrier();
+ vnode->vfs_inode.i_mode = mode;
+ }
- DECLARE_WAITQUEUE(myself, current);
+ vnode->vfs_inode.i_ctime.tv_sec = status->mtime_server;
+ vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime;
+ vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime;
+ }
- kenter("%p,%p,%u",server, buf, *buflen);
+ if (status->data_version != data_version) {
+ status->data_version = data_version;
+ if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
+ _debug("vnode modified %llx on {%x:%u}",
+ (unsigned long long) data_version,
+ vnode->fid.vid, vnode->fid.vnode);
+ set_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+ set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
+ }
+ }
+}
- /* get hold of the fileserver connection */
- ret = afs_server_get_fsconn(server, &conn);
- if (ret < 0)
- goto out;
+/*
+ * decode an AFSCallBack block
+ */
+static void xdr_decode_AFSCallBack(const __be32 **_bp, struct afs_vnode *vnode)
+{
+ const __be32 *bp = *_bp;
- /* create a call through that connection */
- ret = rxrpc_create_call(conn, NULL, NULL, afs_rxfs_aemap, &call);
- if (ret < 0) {
- printk("kAFS: Unable to create call: %d\n", ret);
- goto out_put_conn;
- }
- call->app_opcode = FSGETROOTVOLUME;
+ vnode->cb_version = ntohl(*bp++);
+ vnode->cb_expiry = ntohl(*bp++);
+ vnode->cb_type = ntohl(*bp++);
+ vnode->cb_expires = vnode->cb_expiry + get_seconds();
+ *_bp = bp;
+}
- /* we want to get event notifications from the call */
- add_wait_queue(&call->waitq, &myself);
+static void xdr_decode_AFSCallBack_raw(const __be32 **_bp,
+ struct afs_callback *cb)
+{
+ const __be32 *bp = *_bp;
- /* marshall the parameters */
- param[0] = htonl(FSGETROOTVOLUME);
-
- piov[0].iov_len = sizeof(param);
- piov[0].iov_base = param;
-
- /* send the parameters to the server */
- ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
- 0, &sent);
- if (ret < 0)
- goto abort;
-
- /* wait for the reply to completely arrive */
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (call->app_call_state != RXRPC_CSTATE_CLNT_RCV_REPLY ||
- signal_pending(current))
- break;
- schedule();
- }
- set_current_state(TASK_RUNNING);
+ cb->version = ntohl(*bp++);
+ cb->expiry = ntohl(*bp++);
+ cb->type = ntohl(*bp++);
+ *_bp = bp;
+}
- ret = -EINTR;
- if (signal_pending(current))
- goto abort;
+/*
+ * decode an AFSVolSync block
+ */
+static void xdr_decode_AFSVolSync(const __be32 **_bp,
+ struct afs_volsync *volsync)
+{
+ const __be32 *bp = *_bp;
- switch (call->app_call_state) {
- case RXRPC_CSTATE_ERROR:
- ret = call->app_errno;
- kdebug("Got Error: %d", ret);
- goto out_unwait;
+ volsync->creation = ntohl(*bp++);
+ bp++; /* spare2 */
+ bp++; /* spare3 */
+ bp++; /* spare4 */
+ bp++; /* spare5 */
+ bp++; /* spare6 */
+ *_bp = bp;
+}
- case RXRPC_CSTATE_CLNT_GOT_REPLY:
- /* read the reply */
- kdebug("Got Reply: qty=%d", call->app_ready_qty);
+/*
+ * deliver reply data to an FS.FetchStatus
+ */
+static int afs_deliver_fs_fetch_status(struct afs_call *call,
+ struct sk_buff *skb, bool last)
+{
+ struct afs_vnode *vnode = call->reply;
+ const __be32 *bp;
- ret = -EBADMSG;
- if (call->app_ready_qty <= 4)
- goto abort;
+ _enter(",,%u", last);
- ret = rxrpc_call_read_data(call, NULL, call->app_ready_qty, 0);
- if (ret < 0)
- goto abort;
+ afs_transfer_reply(call, skb);
+ if (!last)
+ return 0;
-#if 0
- /* unmarshall the reply */
- bp = buffer;
- for (loop = 0; loop < 65; loop++)
- entry->name[loop] = ntohl(*bp++);
- entry->name[64] = 0;
+ if (call->reply_size != call->reply_max)
+ return -EBADMSG;
- entry->type = ntohl(*bp++);
- entry->num_servers = ntohl(*bp++);
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+ xdr_decode_AFSCallBack(&bp, vnode);
+ if (call->reply2)
+ xdr_decode_AFSVolSync(&bp, call->reply2);
- for (loop = 0; loop < 8; loop++)
- entry->servers[loop].addr.s_addr = *bp++;
+ _leave(" = 0 [done]");
+ return 0;
+}
- for (loop = 0; loop < 8; loop++)
- entry->servers[loop].partition = ntohl(*bp++);
+/*
+ * FS.FetchStatus operation type
+ */
+static const struct afs_call_type afs_RXFSFetchStatus = {
+ .name = "FS.FetchStatus",
+ .deliver = afs_deliver_fs_fetch_status,
+ .abort_to_error = afs_abort_to_error,
+ .destructor = afs_flat_call_destructor,
+};
- for (loop = 0; loop < 8; loop++)
- entry->servers[loop].flags = ntohl(*bp++);
+/*
+ * fetch the status information for a file
+ */
+int afs_fs_fetch_file_status(struct afs_server *server,
+ struct key *key,
+ struct afs_vnode *vnode,
+ struct afs_volsync *volsync,
+ const struct afs_wait_mode *wait_mode)
+{
+ struct afs_call *call;
+ __be32 *bp;
- for (loop = 0; loop < 3; loop++)
- entry->volume_ids[loop] = ntohl(*bp++);
+ _enter(",%x,{%x:%d},,",
+ key_serial(key), vnode->fid.vid, vnode->fid.vnode);
- entry->clone_id = ntohl(*bp++);
- entry->flags = ntohl(*bp);
-#endif
+ call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
+ if (!call)
+ return -ENOMEM;
- /* success */
- ret = 0;
- goto out_unwait;
+ call->key = key;
+ call->reply = vnode;
+ call->reply2 = volsync;
+ call->service_id = FS_SERVICE;
+ call->port = htons(AFS_FS_PORT);
- default:
- BUG();
- }
+ /* marshall the parameters */
+ bp = call->request;
+ bp[0] = htonl(FSFETCHSTATUS);
+ bp[1] = htonl(vnode->fid.vid);
+ bp[2] = htonl(vnode->fid.vnode);
+ bp[3] = htonl(vnode->fid.unique);
+
+ return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
- abort:
- set_current_state(TASK_UNINTERRUPTIBLE);
- rxrpc_call_abort(call, ret);
- schedule();
- out_unwait:
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&call->waitq, &myself);
- rxrpc_put_call(call);
- out_put_conn:
- afs_server_release_fsconn(server, conn);
- out:
- kleave("");
- return ret;
-} /* end afs_rxfs_get_root_volume() */
-#endif
-
-/*****************************************************************************/
/*
- * get information about a volume
+ * deliver reply data to an FS.FetchData
*/
-#if 0
-int afs_rxfs_get_volume_info(struct afs_server *server,
- const char *name,
- struct afs_volume_info *vinfo)
+static int afs_deliver_fs_fetch_data(struct afs_call *call,
+ struct sk_buff *skb, bool last)
{
- struct rxrpc_connection *conn;
- struct rxrpc_call *call;
- struct kvec piov[3];
- size_t sent;
+ struct afs_vnode *vnode = call->reply;
+ const __be32 *bp;
+ struct page *page;
+ void *buffer;
int ret;
- u32 param[2], *bp, zero;
- DECLARE_WAITQUEUE(myself, current);
+ _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+ switch (call->unmarshall) {
+ case 0:
+ call->offset = 0;
+ call->unmarshall++;
+
+ /* extract the returned data length */
+ case 1:
+ _debug("extract data length");
+ ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+ switch (ret) {
+ case 0: break;
+ case -EAGAIN: return 0;
+ default: return ret;
+ }
- _enter("%p,%s,%p", server, name, vinfo);
+ call->count = ntohl(call->tmp);
+ _debug("DATA length: %u", call->count);
+ if (call->count > PAGE_SIZE)
+ return -EBADMSG;
+ call->offset = 0;
+ call->unmarshall++;
+
+ if (call->count < PAGE_SIZE) {
+ buffer = kmap_atomic(call->reply3, KM_USER0);
+ memset(buffer + PAGE_SIZE - call->count, 0,
+ call->count);
+ kunmap_atomic(buffer, KM_USER0);
+ }
- /* get hold of the fileserver connection */
- ret = afs_server_get_fsconn(server, &conn);
- if (ret < 0)
- goto out;
+ /* extract the returned data */
+ case 2:
+ _debug("extract data");
+ page = call->reply3;
+ buffer = kmap_atomic(page, KM_USER0);
+ ret = afs_extract_data(call, skb, last, buffer, call->count);
+ kunmap_atomic(buffer, KM_USER0);
+ switch (ret) {
+ case 0: break;
+ case -EAGAIN: return 0;
+ default: return ret;
+ }
- /* create a call through that connection */
- ret = rxrpc_create_call(conn, NULL, NULL, afs_rxfs_aemap, &call);
- if (ret < 0) {
- printk("kAFS: Unable to create call: %d\n", ret);
- goto out_put_conn;
- }
- call->app_opcode = FSGETVOLUMEINFO;
+ call->offset = 0;
+ call->unmarshall++;
+
+ /* extract the metadata */
+ case 3:
+ ret = afs_extract_data(call, skb, last, call->buffer,
+ (21 + 3 + 6) * 4);
+ switch (ret) {
+ case 0: break;
+ case -EAGAIN: return 0;
+ default: return ret;
+ }
- /* we want to get event notifications from the call */
- add_wait_queue(&call->waitq, &myself);
+ bp = call->buffer;
+ xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+ xdr_decode_AFSCallBack(&bp, vnode);
+ if (call->reply2)
+ xdr_decode_AFSVolSync(&bp, call->reply2);
- /* marshall the parameters */
- piov[1].iov_len = strlen(name);
- piov[1].iov_base = (char *) name;
-
- zero = 0;
- piov[2].iov_len = (4 - (piov[1].iov_len & 3)) & 3;
- piov[2].iov_base = &zero;
-
- param[0] = htonl(FSGETVOLUMEINFO);
- param[1] = htonl(piov[1].iov_len);
-
- piov[0].iov_len = sizeof(param);
- piov[0].iov_base = param;
-
- /* send the parameters to the server */
- ret = rxrpc_call_write_data(call, 3, piov, RXRPC_LAST_PACKET, GFP_NOFS,
- 0, &sent);
- if (ret < 0)
- goto abort;
-
- /* wait for the reply to completely arrive */
- bp = rxrpc_call_alloc_scratch(call, 64);
-
- ret = rxrpc_call_read_data(call, bp, 64,
- RXRPC_CALL_READ_BLOCK |
- RXRPC_CALL_READ_ALL);
- if (ret < 0) {
- if (ret == -ECONNABORTED) {
- ret = call->app_errno;
- goto out_unwait;
- }
- goto abort;
+ call->offset = 0;
+ call->unmarshall++;
+
+ case 4:
+ _debug("trailer");
+ if (skb->len != 0)
+ return -EBADMSG;
+ break;
}
- /* unmarshall the reply */
- vinfo->vid = ntohl(*bp++);
- vinfo->type = ntohl(*bp++);
-
- vinfo->type_vids[0] = ntohl(*bp++);
- vinfo->type_vids[1] = ntohl(*bp++);
- vinfo->type_vids[2] = ntohl(*bp++);
- vinfo->type_vids[3] = ntohl(*bp++);
- vinfo->type_vids[4] = ntohl(*bp++);
-
- vinfo->nservers = ntohl(*bp++);
- vinfo->servers[0].addr.s_addr = *bp++;
- vinfo->servers[1].addr.s_addr = *bp++;
- vinfo->servers[2].addr.s_addr = *bp++;
- vinfo->servers[3].addr.s_addr = *bp++;
- vinfo->servers[4].addr.s_addr = *bp++;
- vinfo->servers[5].addr.s_addr = *bp++;
- vinfo->servers[6].addr.s_addr = *bp++;
- vinfo->servers[7].addr.s_addr = *bp++;
-
- ret = -EBADMSG;
- if (vinfo->nservers > 8)
- goto abort;
-
- /* success */
- ret = 0;
-
- out_unwait:
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&call->waitq, &myself);
- rxrpc_put_call(call);
- out_put_conn:
- afs_server_release_fsconn(server, conn);
- out:
- _leave("");
- return ret;
-
- abort:
- set_current_state(TASK_UNINTERRUPTIBLE);
- rxrpc_call_abort(call, ret);
- schedule();
- goto out_unwait;
-
-} /* end afs_rxfs_get_volume_info() */
-#endif
-
-/*****************************************************************************/
+ if (!last)
+ return 0;
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
/*
- * fetch the status information for a file
+ * FS.FetchData operation type
+ */
+static const struct afs_call_type afs_RXFSFetchData = {
+ .name = "FS.FetchData",
+ .deliver = afs_deliver_fs_fetch_data,
+ .abort_to_error = afs_abort_to_error,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * fetch data from a file
*/
-int afs_rxfs_fetch_file_status(struct afs_server *server,
- struct afs_vnode *vnode,
- struct afs_volsync *volsync)
+int afs_fs_fetch_data(struct afs_server *server,
+ struct key *key,
+ struct afs_vnode *vnode,
+ off_t offset, size_t length,
+ struct page *buffer,
+ const struct afs_wait_mode *wait_mode)
{
- struct afs_server_callslot callslot;
- struct rxrpc_call *call;
- struct kvec piov[1];
- size_t sent;
- int ret;
+ struct afs_call *call;
__be32 *bp;
- DECLARE_WAITQUEUE(myself, current);
+ _enter("");
- _enter("%p,{%u,%u,%u}",
- server, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+ call = afs_alloc_flat_call(&afs_RXFSFetchData, 24, (21 + 3 + 6) * 4);
+ if (!call)
+ return -ENOMEM;
- /* get hold of the fileserver connection */
- ret = afs_server_request_callslot(server, &callslot);
- if (ret < 0)
- goto out;
-
- /* create a call through that connection */
- ret = rxrpc_create_call(callslot.conn, NULL, NULL, afs_rxfs_aemap,
- &call);
- if (ret < 0) {
- printk("kAFS: Unable to create call: %d\n", ret);
- goto out_put_conn;
- }
- call->app_opcode = FSFETCHSTATUS;
-
- /* we want to get event notifications from the call */
- add_wait_queue(&call->waitq, &myself);
+ call->key = key;
+ call->reply = vnode;
+ call->reply2 = NULL; /* volsync */
+ call->reply3 = buffer;
+ call->service_id = FS_SERVICE;
+ call->port = htons(AFS_FS_PORT);
/* marshall the parameters */
- bp = rxrpc_call_alloc_scratch(call, 16);
- bp[0] = htonl(FSFETCHSTATUS);
+ bp = call->request;
+ bp[0] = htonl(FSFETCHDATA);
bp[1] = htonl(vnode->fid.vid);
bp[2] = htonl(vnode->fid.vnode);
bp[3] = htonl(vnode->fid.unique);
+ bp[4] = htonl(offset);
+ bp[5] = htonl(length);
- piov[0].iov_len = 16;
- piov[0].iov_base = bp;
-
- /* send the parameters to the server */
- ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
- 0, &sent);
- if (ret < 0)
- goto abort;
-
- /* wait for the reply to completely arrive */
- bp = rxrpc_call_alloc_scratch(call, 120);
-
- ret = rxrpc_call_read_data(call, bp, 120,
- RXRPC_CALL_READ_BLOCK |
- RXRPC_CALL_READ_ALL);
- if (ret < 0) {
- if (ret == -ECONNABORTED) {
- ret = call->app_errno;
- goto out_unwait;
- }
- goto abort;
- }
+ return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
- /* unmarshall the reply */
- vnode->status.if_version = ntohl(*bp++);
- vnode->status.type = ntohl(*bp++);
- vnode->status.nlink = ntohl(*bp++);
- vnode->status.size = ntohl(*bp++);
- vnode->status.version = ntohl(*bp++);
- vnode->status.author = ntohl(*bp++);
- vnode->status.owner = ntohl(*bp++);
- vnode->status.caller_access = ntohl(*bp++);
- vnode->status.anon_access = ntohl(*bp++);
- vnode->status.mode = ntohl(*bp++);
- vnode->status.parent.vid = vnode->fid.vid;
- vnode->status.parent.vnode = ntohl(*bp++);
- vnode->status.parent.unique = ntohl(*bp++);
- bp++; /* seg size */
- vnode->status.mtime_client = ntohl(*bp++);
- vnode->status.mtime_server = ntohl(*bp++);
- bp++; /* group */
- bp++; /* sync counter */
- vnode->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
- bp++; /* spare2 */
- bp++; /* spare3 */
- bp++; /* spare4 */
+/*
+ * deliver reply data to an FS.GiveUpCallBacks
+ */
+static int afs_deliver_fs_give_up_callbacks(struct afs_call *call,
+ struct sk_buff *skb, bool last)
+{
+ _enter(",{%u},%d", skb->len, last);
- vnode->cb_version = ntohl(*bp++);
- vnode->cb_expiry = ntohl(*bp++);
- vnode->cb_type = ntohl(*bp++);
-
- if (volsync) {
- volsync->creation = ntohl(*bp++);
- bp++; /* spare2 */
- bp++; /* spare3 */
- bp++; /* spare4 */
- bp++; /* spare5 */
- bp++; /* spare6 */
- }
+ if (skb->len > 0)
+ return -EBADMSG; /* shouldn't be any reply data */
+ return 0;
+}
- /* success */
- ret = 0;
-
- out_unwait:
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&call->waitq, &myself);
- rxrpc_put_call(call);
- out_put_conn:
- afs_server_release_callslot(server, &callslot);
- out:
- _leave("");
- return ret;
-
- abort:
- set_current_state(TASK_UNINTERRUPTIBLE);
- rxrpc_call_abort(call, ret);
- schedule();
- goto out_unwait;
-} /* end afs_rxfs_fetch_file_status() */
-
-/*****************************************************************************/
/*
- * fetch the contents of a file or directory
+ * FS.GiveUpCallBacks operation type
*/
-int afs_rxfs_fetch_file_data(struct afs_server *server,
- struct afs_vnode *vnode,
- struct afs_rxfs_fetch_descriptor *desc,
- struct afs_volsync *volsync)
+static const struct afs_call_type afs_RXFSGiveUpCallBacks = {
+ .name = "FS.GiveUpCallBacks",
+ .deliver = afs_deliver_fs_give_up_callbacks,
+ .abort_to_error = afs_abort_to_error,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * give up a set of callbacks
+ * - the callbacks are held in the server->cb_break ring
+ */
+int afs_fs_give_up_callbacks(struct afs_server *server,
+ const struct afs_wait_mode *wait_mode)
{
- struct afs_server_callslot callslot;
- struct rxrpc_call *call;
- struct kvec piov[1];
- size_t sent;
- int ret;
- __be32 *bp;
+ struct afs_call *call;
+ size_t ncallbacks;
+ __be32 *bp, *tp;
+ int loop;
- DECLARE_WAITQUEUE(myself, current);
-
- _enter("%p,{fid={%u,%u,%u},sz=%Zu,of=%lu}",
- server,
- desc->fid.vid,
- desc->fid.vnode,
- desc->fid.unique,
- desc->size,
- desc->offset);
-
- /* get hold of the fileserver connection */
- ret = afs_server_request_callslot(server, &callslot);
- if (ret < 0)
- goto out;
-
- /* create a call through that connection */
- ret = rxrpc_create_call(callslot.conn, NULL, NULL, afs_rxfs_aemap, &call);
- if (ret < 0) {
- printk("kAFS: Unable to create call: %d\n", ret);
- goto out_put_conn;
- }
- call->app_opcode = FSFETCHDATA;
+ ncallbacks = CIRC_CNT(server->cb_break_head, server->cb_break_tail,
+ ARRAY_SIZE(server->cb_break));
+
+ _enter("{%zu},", ncallbacks);
+
+ if (ncallbacks == 0)
+ return 0;
+ if (ncallbacks > AFSCBMAX)
+ ncallbacks = AFSCBMAX;
+
+ _debug("break %zu callbacks", ncallbacks);
- /* we want to get event notifications from the call */
- add_wait_queue(&call->waitq, &myself);
+ call = afs_alloc_flat_call(&afs_RXFSGiveUpCallBacks,
+ 12 + ncallbacks * 6 * 4, 0);
+ if (!call)
+ return -ENOMEM;
+
+ call->service_id = FS_SERVICE;
+ call->port = htons(AFS_FS_PORT);
/* marshall the parameters */
- bp = rxrpc_call_alloc_scratch(call, 24);
- bp[0] = htonl(FSFETCHDATA);
- bp[1] = htonl(desc->fid.vid);
- bp[2] = htonl(desc->fid.vnode);
- bp[3] = htonl(desc->fid.unique);
- bp[4] = htonl(desc->offset);
- bp[5] = htonl(desc->size);
-
- piov[0].iov_len = 24;
- piov[0].iov_base = bp;
-
- /* send the parameters to the server */
- ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
- 0, &sent);
- if (ret < 0)
- goto abort;
-
- /* wait for the data count to arrive */
- ret = rxrpc_call_read_data(call, bp, 4, RXRPC_CALL_READ_BLOCK);
- if (ret < 0)
- goto read_failed;
-
- desc->actual = ntohl(bp[0]);
- if (desc->actual != desc->size) {
- ret = -EBADMSG;
- goto abort;
+ bp = call->request;
+ tp = bp + 2 + ncallbacks * 3;
+ *bp++ = htonl(FSGIVEUPCALLBACKS);
+ *bp++ = htonl(ncallbacks);
+ *tp++ = htonl(ncallbacks);
+
+ atomic_sub(ncallbacks, &server->cb_break_n);
+ for (loop = ncallbacks; loop > 0; loop--) {
+ struct afs_callback *cb =
+ &server->cb_break[server->cb_break_tail];
+
+ *bp++ = htonl(cb->fid.vid);
+ *bp++ = htonl(cb->fid.vnode);
+ *bp++ = htonl(cb->fid.unique);
+ *tp++ = htonl(cb->version);
+ *tp++ = htonl(cb->expiry);
+ *tp++ = htonl(cb->type);
+ smp_mb();
+ server->cb_break_tail =
+ (server->cb_break_tail + 1) &
+ (ARRAY_SIZE(server->cb_break) - 1);
}
- /* call the app to read the actual data */
- rxrpc_call_reset_scratch(call);
-
- ret = rxrpc_call_read_data(call, desc->buffer, desc->actual,
- RXRPC_CALL_READ_BLOCK);
- if (ret < 0)
- goto read_failed;
-
- /* wait for the rest of the reply to completely arrive */
- rxrpc_call_reset_scratch(call);
- bp = rxrpc_call_alloc_scratch(call, 120);
-
- ret = rxrpc_call_read_data(call, bp, 120,
- RXRPC_CALL_READ_BLOCK |
- RXRPC_CALL_READ_ALL);
- if (ret < 0)
- goto read_failed;
-
- /* unmarshall the reply */
- vnode->status.if_version = ntohl(*bp++);
- vnode->status.type = ntohl(*bp++);
- vnode->status.nlink = ntohl(*bp++);
- vnode->status.size = ntohl(*bp++);
- vnode->status.version = ntohl(*bp++);
- vnode->status.author = ntohl(*bp++);
- vnode->status.owner = ntohl(*bp++);
- vnode->status.caller_access = ntohl(*bp++);
- vnode->status.anon_access = ntohl(*bp++);
- vnode->status.mode = ntohl(*bp++);
- vnode->status.parent.vid = desc->fid.vid;
- vnode->status.parent.vnode = ntohl(*bp++);
- vnode->status.parent.unique = ntohl(*bp++);
- bp++; /* seg size */
- vnode->status.mtime_client = ntohl(*bp++);
- vnode->status.mtime_server = ntohl(*bp++);
- bp++; /* group */
- bp++; /* sync counter */
- vnode->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
- bp++; /* spare2 */
- bp++; /* spare3 */
- bp++; /* spare4 */
+ ASSERT(ncallbacks > 0);
+ wake_up_nr(&server->cb_break_waitq, ncallbacks);
- vnode->cb_version = ntohl(*bp++);
- vnode->cb_expiry = ntohl(*bp++);
- vnode->cb_type = ntohl(*bp++);
-
- if (volsync) {
- volsync->creation = ntohl(*bp++);
- bp++; /* spare2 */
- bp++; /* spare3 */
- bp++; /* spare4 */
- bp++; /* spare5 */
- bp++; /* spare6 */
- }
+ return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
- /* success */
- ret = 0;
-
- out_unwait:
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&call->waitq,&myself);
- rxrpc_put_call(call);
- out_put_conn:
- afs_server_release_callslot(server, &callslot);
- out:
- _leave(" = %d", ret);
- return ret;
-
- read_failed:
- if (ret == -ECONNABORTED) {
- ret = call->app_errno;
- goto out_unwait;
- }
+/*
+ * deliver reply data to an FS.CreateFile or an FS.MakeDir
+ */
+static int afs_deliver_fs_create_vnode(struct afs_call *call,
+ struct sk_buff *skb, bool last)
+{
+ struct afs_vnode *vnode = call->reply;
+ const __be32 *bp;
+
+ _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
- abort:
- set_current_state(TASK_UNINTERRUPTIBLE);
- rxrpc_call_abort(call, ret);
- schedule();
- goto out_unwait;
+ afs_transfer_reply(call, skb);
+ if (!last)
+ return 0;
-} /* end afs_rxfs_fetch_file_data() */
+ if (call->reply_size != call->reply_max)
+ return -EBADMSG;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ xdr_decode_AFSFid(&bp, call->reply2);
+ xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL);
+ xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+ xdr_decode_AFSCallBack_raw(&bp, call->reply4);
+ /* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * FS.CreateFile and FS.MakeDir operation type
+ */
+static const struct afs_call_type afs_RXFSCreateXXXX = {
+ .name = "FS.CreateXXXX",
+ .deliver = afs_deliver_fs_create_vnode,
+ .abort_to_error = afs_abort_to_error,
+ .destructor = afs_flat_call_destructor,
+};
-/*****************************************************************************/
/*
- * ask the AFS fileserver to discard a callback request on a file
+ * create a file or make a directory
*/
-int afs_rxfs_give_up_callback(struct afs_server *server,
- struct afs_vnode *vnode)
+int afs_fs_create(struct afs_server *server,
+ struct key *key,
+ struct afs_vnode *vnode,
+ const char *name,
+ umode_t mode,
+ struct afs_fid *newfid,
+ struct afs_file_status *newstatus,
+ struct afs_callback *newcb,
+ const struct afs_wait_mode *wait_mode)
{
- struct afs_server_callslot callslot;
- struct rxrpc_call *call;
- struct kvec piov[1];
- size_t sent;
- int ret;
+ struct afs_call *call;
+ size_t namesz, reqsz, padsz;
__be32 *bp;
- DECLARE_WAITQUEUE(myself, current);
+ _enter("");
- _enter("%p,{%u,%u,%u}",
- server, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+ namesz = strlen(name);
+ padsz = (4 - (namesz & 3)) & 3;
+ reqsz = (5 * 4) + namesz + padsz + (6 * 4);
- /* get hold of the fileserver connection */
- ret = afs_server_request_callslot(server, &callslot);
- if (ret < 0)
- goto out;
+ call = afs_alloc_flat_call(&afs_RXFSCreateXXXX, reqsz,
+ (3 + 21 + 21 + 3 + 6) * 4);
+ if (!call)
+ return -ENOMEM;
- /* create a call through that connection */
- ret = rxrpc_create_call(callslot.conn, NULL, NULL, afs_rxfs_aemap, &call);
- if (ret < 0) {
- printk("kAFS: Unable to create call: %d\n", ret);
- goto out_put_conn;
+ call->key = key;
+ call->reply = vnode;
+ call->reply2 = newfid;
+ call->reply3 = newstatus;
+ call->reply4 = newcb;
+ call->service_id = FS_SERVICE;
+ call->port = htons(AFS_FS_PORT);
+
+ /* marshall the parameters */
+ bp = call->request;
+ *bp++ = htonl(S_ISDIR(mode) ? FSMAKEDIR : FSCREATEFILE);
+ *bp++ = htonl(vnode->fid.vid);
+ *bp++ = htonl(vnode->fid.vnode);
+ *bp++ = htonl(vnode->fid.unique);
+ *bp++ = htonl(namesz);
+ memcpy(bp, name, namesz);
+ bp = (void *) bp + namesz;
+ if (padsz > 0) {
+ memset(bp, 0, padsz);
+ bp = (void *) bp + padsz;
}
- call->app_opcode = FSGIVEUPCALLBACKS;
+ *bp++ = htonl(AFS_SET_MODE);
+ *bp++ = 0; /* mtime */
+ *bp++ = 0; /* owner */
+ *bp++ = 0; /* group */
+ *bp++ = htonl(mode & S_IALLUGO); /* unix mode */
+ *bp++ = 0; /* segment size */
- /* we want to get event notifications from the call */
- add_wait_queue(&call->waitq, &myself);
+ return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
- /* marshall the parameters */
- bp = rxrpc_call_alloc_scratch(call, (1 + 4 + 4) * 4);
+/*
+ * deliver reply data to an FS.RemoveFile or FS.RemoveDir
+ */
+static int afs_deliver_fs_remove(struct afs_call *call,
+ struct sk_buff *skb, bool last)
+{
+ struct afs_vnode *vnode = call->reply;
+ const __be32 *bp;
- piov[0].iov_len = (1 + 4 + 4) * 4;
- piov[0].iov_base = bp;
+ _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
- *bp++ = htonl(FSGIVEUPCALLBACKS);
- *bp++ = htonl(1);
+ afs_transfer_reply(call, skb);
+ if (!last)
+ return 0;
+
+ if (call->reply_size != call->reply_max)
+ return -EBADMSG;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+ /* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * FS.RemoveDir/FS.RemoveFile operation type
+ */
+static const struct afs_call_type afs_RXFSRemoveXXXX = {
+ .name = "FS.RemoveXXXX",
+ .deliver = afs_deliver_fs_remove,
+ .abort_to_error = afs_abort_to_error,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * remove a file or directory
+ */
+int afs_fs_remove(struct afs_server *server,
+ struct key *key,
+ struct afs_vnode *vnode,
+ const char *name,
+ bool isdir,
+ const struct afs_wait_mode *wait_mode)
+{
+ struct afs_call *call;
+ size_t namesz, reqsz, padsz;
+ __be32 *bp;
+
+ _enter("");
+
+ namesz = strlen(name);
+ padsz = (4 - (namesz & 3)) & 3;
+ reqsz = (5 * 4) + namesz + padsz;
+
+ call = afs_alloc_flat_call(&afs_RXFSRemoveXXXX, reqsz, (21 + 6) * 4);
+ if (!call)
+ return -ENOMEM;
+
+ call->key = key;
+ call->reply = vnode;
+ call->service_id = FS_SERVICE;
+ call->port = htons(AFS_FS_PORT);
+
+ /* marshall the parameters */
+ bp = call->request;
+ *bp++ = htonl(isdir ? FSREMOVEDIR : FSREMOVEFILE);
*bp++ = htonl(vnode->fid.vid);
*bp++ = htonl(vnode->fid.vnode);
*bp++ = htonl(vnode->fid.unique);
- *bp++ = htonl(1);
- *bp++ = htonl(vnode->cb_version);
- *bp++ = htonl(vnode->cb_expiry);
- *bp++ = htonl(vnode->cb_type);
-
- /* send the parameters to the server */
- ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
- 0, &sent);
- if (ret < 0)
- goto abort;
-
- /* wait for the reply to completely arrive */
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (call->app_call_state != RXRPC_CSTATE_CLNT_RCV_REPLY ||
- signal_pending(current))
- break;
- schedule();
+ *bp++ = htonl(namesz);
+ memcpy(bp, name, namesz);
+ bp = (void *) bp + namesz;
+ if (padsz > 0) {
+ memset(bp, 0, padsz);
+ bp = (void *) bp + padsz;
}
- set_current_state(TASK_RUNNING);
- ret = -EINTR;
- if (signal_pending(current))
- goto abort;
+ return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
- switch (call->app_call_state) {
- case RXRPC_CSTATE_ERROR:
- ret = call->app_errno;
- goto out_unwait;
+/*
+ * deliver reply data to an FS.Link
+ */
+static int afs_deliver_fs_link(struct afs_call *call,
+ struct sk_buff *skb, bool last)
+{
+ struct afs_vnode *dvnode = call->reply, *vnode = call->reply2;
+ const __be32 *bp;
- case RXRPC_CSTATE_CLNT_GOT_REPLY:
- ret = 0;
- goto out_unwait;
+ _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
- default:
- BUG();
- }
+ afs_transfer_reply(call, skb);
+ if (!last)
+ return 0;
+
+ if (call->reply_size != call->reply_max)
+ return -EBADMSG;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+ xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode);
+ /* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * FS.Link operation type
+ */
+static const struct afs_call_type afs_RXFSLink = {
+ .name = "FS.Link",
+ .deliver = afs_deliver_fs_link,
+ .abort_to_error = afs_abort_to_error,
+ .destructor = afs_flat_call_destructor,
+};
- out_unwait:
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&call->waitq, &myself);
- rxrpc_put_call(call);
- out_put_conn:
- afs_server_release_callslot(server, &callslot);
- out:
- _leave("");
- return ret;
-
- abort:
- set_current_state(TASK_UNINTERRUPTIBLE);
- rxrpc_call_abort(call, ret);
- schedule();
- goto out_unwait;
-} /* end afs_rxfs_give_up_callback() */
-
-/*****************************************************************************/
/*
- * look a filename up in a directory
- * - this operation doesn't seem to work correctly in OpenAFS server 1.2.2
+ * make a hard link
*/
-#if 0
-int afs_rxfs_lookup(struct afs_server *server,
- struct afs_vnode *dir,
- const char *filename,
- struct afs_vnode *vnode,
- struct afs_volsync *volsync)
+int afs_fs_link(struct afs_server *server,
+ struct key *key,
+ struct afs_vnode *dvnode,
+ struct afs_vnode *vnode,
+ const char *name,
+ const struct afs_wait_mode *wait_mode)
{
- struct rxrpc_connection *conn;
- struct rxrpc_call *call;
- struct kvec piov[3];
- size_t sent;
- int ret;
- u32 *bp, zero;
+ struct afs_call *call;
+ size_t namesz, reqsz, padsz;
+ __be32 *bp;
- DECLARE_WAITQUEUE(myself, current);
+ _enter("");
- kenter("%p,{%u,%u,%u},%s",
- server, fid->vid, fid->vnode, fid->unique, filename);
+ namesz = strlen(name);
+ padsz = (4 - (namesz & 3)) & 3;
+ reqsz = (5 * 4) + namesz + padsz + (3 * 4);
- /* get hold of the fileserver connection */
- ret = afs_server_get_fsconn(server, &conn);
- if (ret < 0)
- goto out;
+ call = afs_alloc_flat_call(&afs_RXFSLink, reqsz, (21 + 21 + 6) * 4);
+ if (!call)
+ return -ENOMEM;
- /* create a call through that connection */
- ret = rxrpc_create_call(conn, NULL, NULL, afs_rxfs_aemap, &call);
- if (ret < 0) {
- printk("kAFS: Unable to create call: %d\n", ret);
- goto out_put_conn;
+ call->key = key;
+ call->reply = dvnode;
+ call->reply2 = vnode;
+ call->service_id = FS_SERVICE;
+ call->port = htons(AFS_FS_PORT);
+
+ /* marshall the parameters */
+ bp = call->request;
+ *bp++ = htonl(FSLINK);
+ *bp++ = htonl(dvnode->fid.vid);
+ *bp++ = htonl(dvnode->fid.vnode);
+ *bp++ = htonl(dvnode->fid.unique);
+ *bp++ = htonl(namesz);
+ memcpy(bp, name, namesz);
+ bp = (void *) bp + namesz;
+ if (padsz > 0) {
+ memset(bp, 0, padsz);
+ bp = (void *) bp + padsz;
}
- call->app_opcode = FSLOOKUP;
+ *bp++ = htonl(vnode->fid.vid);
+ *bp++ = htonl(vnode->fid.vnode);
+ *bp++ = htonl(vnode->fid.unique);
+
+ return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.Symlink
+ */
+static int afs_deliver_fs_symlink(struct afs_call *call,
+ struct sk_buff *skb, bool last)
+{
+ struct afs_vnode *vnode = call->reply;
+ const __be32 *bp;
- /* we want to get event notifications from the call */
- add_wait_queue(&call->waitq,&myself);
+ _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+ afs_transfer_reply(call, skb);
+ if (!last)
+ return 0;
+
+ if (call->reply_size != call->reply_max)
+ return -EBADMSG;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ xdr_decode_AFSFid(&bp, call->reply2);
+ xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL);
+ xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+ /* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * FS.Symlink operation type
+ */
+static const struct afs_call_type afs_RXFSSymlink = {
+ .name = "FS.Symlink",
+ .deliver = afs_deliver_fs_symlink,
+ .abort_to_error = afs_abort_to_error,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * create a symbolic link
+ */
+int afs_fs_symlink(struct afs_server *server,
+ struct key *key,
+ struct afs_vnode *vnode,
+ const char *name,
+ const char *contents,
+ struct afs_fid *newfid,
+ struct afs_file_status *newstatus,
+ const struct afs_wait_mode *wait_mode)
+{
+ struct afs_call *call;
+ size_t namesz, reqsz, padsz, c_namesz, c_padsz;
+ __be32 *bp;
+
+ _enter("");
+
+ namesz = strlen(name);
+ padsz = (4 - (namesz & 3)) & 3;
+
+ c_namesz = strlen(contents);
+ c_padsz = (4 - (c_namesz & 3)) & 3;
+
+ reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4);
+
+ call = afs_alloc_flat_call(&afs_RXFSSymlink, reqsz,
+ (3 + 21 + 21 + 6) * 4);
+ if (!call)
+ return -ENOMEM;
+
+ call->key = key;
+ call->reply = vnode;
+ call->reply2 = newfid;
+ call->reply3 = newstatus;
+ call->service_id = FS_SERVICE;
+ call->port = htons(AFS_FS_PORT);
/* marshall the parameters */
- bp = rxrpc_call_alloc_scratch(call, 20);
-
- zero = 0;
-
- piov[0].iov_len = 20;
- piov[0].iov_base = bp;
- piov[1].iov_len = strlen(filename);
- piov[1].iov_base = (char *) filename;
- piov[2].iov_len = (4 - (piov[1].iov_len & 3)) & 3;
- piov[2].iov_base = &zero;
-
- *bp++ = htonl(FSLOOKUP);
- *bp++ = htonl(dirfid->vid);
- *bp++ = htonl(dirfid->vnode);
- *bp++ = htonl(dirfid->unique);
- *bp++ = htonl(piov[1].iov_len);
-
- /* send the parameters to the server */
- ret = rxrpc_call_write_data(call, 3, piov, RXRPC_LAST_PACKET, GFP_NOFS,
- 0, &sent);
- if (ret < 0)
- goto abort;
-
- /* wait for the reply to completely arrive */
- bp = rxrpc_call_alloc_scratch(call, 220);
-
- ret = rxrpc_call_read_data(call, bp, 220,
- RXRPC_CALL_READ_BLOCK |
- RXRPC_CALL_READ_ALL);
- if (ret < 0) {
- if (ret == -ECONNABORTED) {
- ret = call->app_errno;
- goto out_unwait;
- }
- goto abort;
+ bp = call->request;
+ *bp++ = htonl(FSSYMLINK);
+ *bp++ = htonl(vnode->fid.vid);
+ *bp++ = htonl(vnode->fid.vnode);
+ *bp++ = htonl(vnode->fid.unique);
+ *bp++ = htonl(namesz);
+ memcpy(bp, name, namesz);
+ bp = (void *) bp + namesz;
+ if (padsz > 0) {
+ memset(bp, 0, padsz);
+ bp = (void *) bp + padsz;
}
+ *bp++ = htonl(c_namesz);
+ memcpy(bp, contents, c_namesz);
+ bp = (void *) bp + c_namesz;
+ if (c_padsz > 0) {
+ memset(bp, 0, c_padsz);
+ bp = (void *) bp + c_padsz;
+ }
+ *bp++ = htonl(AFS_SET_MODE);
+ *bp++ = 0; /* mtime */
+ *bp++ = 0; /* owner */
+ *bp++ = 0; /* group */
+ *bp++ = htonl(S_IRWXUGO); /* unix mode */
+ *bp++ = 0; /* segment size */
- /* unmarshall the reply */
- fid->vid = ntohl(*bp++);
- fid->vnode = ntohl(*bp++);
- fid->unique = ntohl(*bp++);
+ return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
- vnode->status.if_version = ntohl(*bp++);
- vnode->status.type = ntohl(*bp++);
- vnode->status.nlink = ntohl(*bp++);
- vnode->status.size = ntohl(*bp++);
- vnode->status.version = ntohl(*bp++);
- vnode->status.author = ntohl(*bp++);
- vnode->status.owner = ntohl(*bp++);
- vnode->status.caller_access = ntohl(*bp++);
- vnode->status.anon_access = ntohl(*bp++);
- vnode->status.mode = ntohl(*bp++);
- vnode->status.parent.vid = dirfid->vid;
- vnode->status.parent.vnode = ntohl(*bp++);
- vnode->status.parent.unique = ntohl(*bp++);
- bp++; /* seg size */
- vnode->status.mtime_client = ntohl(*bp++);
- vnode->status.mtime_server = ntohl(*bp++);
- bp++; /* group */
- bp++; /* sync counter */
- vnode->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
- bp++; /* spare2 */
- bp++; /* spare3 */
- bp++; /* spare4 */
+/*
+ * deliver reply data to an FS.Rename
+ */
+static int afs_deliver_fs_rename(struct afs_call *call,
+ struct sk_buff *skb, bool last)
+{
+ struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
+ const __be32 *bp;
- dir->status.if_version = ntohl(*bp++);
- dir->status.type = ntohl(*bp++);
- dir->status.nlink = ntohl(*bp++);
- dir->status.size = ntohl(*bp++);
- dir->status.version = ntohl(*bp++);
- dir->status.author = ntohl(*bp++);
- dir->status.owner = ntohl(*bp++);
- dir->status.caller_access = ntohl(*bp++);
- dir->status.anon_access = ntohl(*bp++);
- dir->status.mode = ntohl(*bp++);
- dir->status.parent.vid = dirfid->vid;
- dir->status.parent.vnode = ntohl(*bp++);
- dir->status.parent.unique = ntohl(*bp++);
- bp++; /* seg size */
- dir->status.mtime_client = ntohl(*bp++);
- dir->status.mtime_server = ntohl(*bp++);
- bp++; /* group */
- bp++; /* sync counter */
- dir->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
- bp++; /* spare2 */
- bp++; /* spare3 */
- bp++; /* spare4 */
+ _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+ afs_transfer_reply(call, skb);
+ if (!last)
+ return 0;
+
+ if (call->reply_size != call->reply_max)
+ return -EBADMSG;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode);
+ if (new_dvnode != orig_dvnode)
+ xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode);
+ /* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * FS.Rename operation type
+ */
+static const struct afs_call_type afs_RXFSRename = {
+ .name = "FS.Rename",
+ .deliver = afs_deliver_fs_rename,
+ .abort_to_error = afs_abort_to_error,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * create a symbolic link
+ */
+int afs_fs_rename(struct afs_server *server,
+ struct key *key,
+ struct afs_vnode *orig_dvnode,
+ const char *orig_name,
+ struct afs_vnode *new_dvnode,
+ const char *new_name,
+ const struct afs_wait_mode *wait_mode)
+{
+ struct afs_call *call;
+ size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz;
+ __be32 *bp;
+
+ _enter("");
+
+ o_namesz = strlen(orig_name);
+ o_padsz = (4 - (o_namesz & 3)) & 3;
+
+ n_namesz = strlen(new_name);
+ n_padsz = (4 - (n_namesz & 3)) & 3;
+
+ reqsz = (4 * 4) +
+ 4 + o_namesz + o_padsz +
+ (3 * 4) +
+ 4 + n_namesz + n_padsz;
+
+ call = afs_alloc_flat_call(&afs_RXFSRename, reqsz, (21 + 21 + 6) * 4);
+ if (!call)
+ return -ENOMEM;
+
+ call->key = key;
+ call->reply = orig_dvnode;
+ call->reply2 = new_dvnode;
+ call->service_id = FS_SERVICE;
+ call->port = htons(AFS_FS_PORT);
+
+ /* marshall the parameters */
+ bp = call->request;
+ *bp++ = htonl(FSRENAME);
+ *bp++ = htonl(orig_dvnode->fid.vid);
+ *bp++ = htonl(orig_dvnode->fid.vnode);
+ *bp++ = htonl(orig_dvnode->fid.unique);
+ *bp++ = htonl(o_namesz);
+ memcpy(bp, orig_name, o_namesz);
+ bp = (void *) bp + o_namesz;
+ if (o_padsz > 0) {
+ memset(bp, 0, o_padsz);
+ bp = (void *) bp + o_padsz;
+ }
- callback->fid = *fid;
- callback->version = ntohl(*bp++);
- callback->expiry = ntohl(*bp++);
- callback->type = ntohl(*bp++);
-
- if (volsync) {
- volsync->creation = ntohl(*bp++);
- bp++; /* spare2 */
- bp++; /* spare3 */
- bp++; /* spare4 */
- bp++; /* spare5 */
- bp++; /* spare6 */
+ *bp++ = htonl(new_dvnode->fid.vid);
+ *bp++ = htonl(new_dvnode->fid.vnode);
+ *bp++ = htonl(new_dvnode->fid.unique);
+ *bp++ = htonl(n_namesz);
+ memcpy(bp, new_name, n_namesz);
+ bp = (void *) bp + n_namesz;
+ if (n_padsz > 0) {
+ memset(bp, 0, n_padsz);
+ bp = (void *) bp + n_padsz;
}
- /* success */
- ret = 0;
-
- out_unwait:
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&call->waitq, &myself);
- rxrpc_put_call(call);
- out_put_conn:
- afs_server_release_fsconn(server, conn);
- out:
- kleave("");
- return ret;
-
- abort:
- set_current_state(TASK_UNINTERRUPTIBLE);
- rxrpc_call_abort(call, ret);
- schedule();
- goto out_unwait;
-} /* end afs_rxfs_lookup() */
-#endif
+ return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
diff --git a/fs/afs/fsclient.h b/fs/afs/fsclient.h
deleted file mode 100644
index 8ba3e749ee3c..000000000000
--- a/fs/afs/fsclient.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* fsclient.h: AFS File Server client stub declarations
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_FSCLIENT_H
-#define _LINUX_AFS_FSCLIENT_H
-
-#include "server.h"
-
-extern int afs_rxfs_get_volume_info(struct afs_server *server,
- const char *name,
- struct afs_volume_info *vinfo);
-
-extern int afs_rxfs_fetch_file_status(struct afs_server *server,
- struct afs_vnode *vnode,
- struct afs_volsync *volsync);
-
-struct afs_rxfs_fetch_descriptor {
- struct afs_fid fid; /* file ID to fetch */
- size_t size; /* total number of bytes to fetch */
- off_t offset; /* offset in file to start from */
- void *buffer; /* read buffer */
- size_t actual; /* actual size sent back by server */
-};
-
-extern int afs_rxfs_fetch_file_data(struct afs_server *server,
- struct afs_vnode *vnode,
- struct afs_rxfs_fetch_descriptor *desc,
- struct afs_volsync *volsync);
-
-extern int afs_rxfs_give_up_callback(struct afs_server *server,
- struct afs_vnode *vnode);
-
-/* this doesn't appear to work in OpenAFS server */
-extern int afs_rxfs_lookup(struct afs_server *server,
- struct afs_vnode *dir,
- const char *filename,
- struct afs_vnode *vnode,
- struct afs_volsync *volsync);
-
-/* this is apparently mis-implemented in OpenAFS server */
-extern int afs_rxfs_get_root_volume(struct afs_server *server,
- char *buf,
- size_t *buflen);
-
-
-#endif /* _LINUX_AFS_FSCLIENT_H */
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 9d9bca6c28b5..c184a4ee5995 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -19,9 +19,6 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
-#include "volume.h"
-#include "vnode.h"
-#include "super.h"
#include "internal.h"
struct afs_iget_data {
@@ -29,26 +26,25 @@ struct afs_iget_data {
struct afs_volume *volume; /* volume on which resides */
};
-/*****************************************************************************/
/*
* map the AFS file status to the inode member variables
*/
-static int afs_inode_map_status(struct afs_vnode *vnode)
+static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
{
struct inode *inode = AFS_VNODE_TO_I(vnode);
- _debug("FS: ft=%d lk=%d sz=%Zu ver=%Lu mod=%hu",
+ _debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu",
vnode->status.type,
vnode->status.nlink,
- vnode->status.size,
- vnode->status.version,
+ (unsigned long long) vnode->status.size,
+ vnode->status.data_version,
vnode->status.mode);
switch (vnode->status.type) {
case AFS_FTYPE_FILE:
inode->i_mode = S_IFREG | vnode->status.mode;
inode->i_op = &afs_file_inode_operations;
- inode->i_fop = &generic_ro_fops;
+ inode->i_fop = &afs_file_operations;
break;
case AFS_FTYPE_DIR:
inode->i_mode = S_IFDIR | vnode->status.mode;
@@ -77,9 +73,9 @@ static int afs_inode_map_status(struct afs_vnode *vnode)
/* check to see whether a symbolic link is really a mountpoint */
if (vnode->status.type == AFS_FTYPE_SYMLINK) {
- afs_mntpt_check_symlink(vnode);
+ afs_mntpt_check_symlink(vnode, key);
- if (vnode->flags & AFS_VNODE_MOUNTPOINT) {
+ if (test_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags)) {
inode->i_mode = S_IFDIR | vnode->status.mode;
inode->i_op = &afs_mntpt_inode_operations;
inode->i_fop = &afs_mntpt_file_operations;
@@ -87,30 +83,8 @@ static int afs_inode_map_status(struct afs_vnode *vnode)
}
return 0;
-} /* end afs_inode_map_status() */
+}
-/*****************************************************************************/
-/*
- * attempt to fetch the status of an inode, coelescing multiple simultaneous
- * fetches
- */
-static int afs_inode_fetch_status(struct inode *inode)
-{
- struct afs_vnode *vnode;
- int ret;
-
- vnode = AFS_FS_I(inode);
-
- ret = afs_vnode_fetch_status(vnode);
-
- if (ret == 0)
- ret = afs_inode_map_status(vnode);
-
- return ret;
-
-} /* end afs_inode_fetch_status() */
-
-/*****************************************************************************/
/*
* iget5() comparator
*/
@@ -120,9 +94,8 @@ static int afs_iget5_test(struct inode *inode, void *opaque)
return inode->i_ino == data->fid.vnode &&
inode->i_version == data->fid.unique;
-} /* end afs_iget5_test() */
+}
-/*****************************************************************************/
/*
* iget5() inode initialiser
*/
@@ -137,14 +110,14 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
vnode->volume = data->volume;
return 0;
-} /* end afs_iget5_set() */
+}
-/*****************************************************************************/
/*
* inode retrieval
*/
-inline int afs_iget(struct super_block *sb, struct afs_fid *fid,
- struct inode **_inode)
+struct inode *afs_iget(struct super_block *sb, struct key *key,
+ struct afs_fid *fid, struct afs_file_status *status,
+ struct afs_callback *cb)
{
struct afs_iget_data data = { .fid = *fid };
struct afs_super_info *as;
@@ -161,20 +134,18 @@ inline int afs_iget(struct super_block *sb, struct afs_fid *fid,
&data);
if (!inode) {
_leave(" = -ENOMEM");
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
+ _debug("GOT INODE %p { vl=%x vn=%x, u=%x }",
+ inode, fid->vid, fid->vnode, fid->unique);
+
vnode = AFS_FS_I(inode);
/* deal with an existing inode */
if (!(inode->i_state & I_NEW)) {
- ret = afs_vnode_fetch_status(vnode);
- if (ret==0)
- *_inode = inode;
- else
- iput(inode);
- _leave(" = %d", ret);
- return ret;
+ _leave(" = %p", inode);
+ return inode;
}
#ifdef AFS_CACHING_SUPPORT
@@ -186,100 +157,185 @@ inline int afs_iget(struct super_block *sb, struct afs_fid *fid,
&vnode->cache);
#endif
- /* okay... it's a new inode */
- inode->i_flags |= S_NOATIME;
- vnode->flags |= AFS_VNODE_CHANGED;
- ret = afs_inode_fetch_status(inode);
- if (ret<0)
+ if (!status) {
+ /* it's a remotely extant inode */
+ set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+ ret = afs_vnode_fetch_status(vnode, NULL, key);
+ if (ret < 0)
+ goto bad_inode;
+ } else {
+ /* it's an inode we just created */
+ memcpy(&vnode->status, status, sizeof(vnode->status));
+
+ if (!cb) {
+ /* it's a symlink we just created (the fileserver
+ * didn't give us a callback) */
+ vnode->cb_version = 0;
+ vnode->cb_expiry = 0;
+ vnode->cb_type = 0;
+ vnode->cb_expires = get_seconds();
+ } else {
+ vnode->cb_version = cb->version;
+ vnode->cb_expiry = cb->expiry;
+ vnode->cb_type = cb->type;
+ vnode->cb_expires = vnode->cb_expiry + get_seconds();
+ }
+ }
+
+ ret = afs_inode_map_status(vnode, key);
+ if (ret < 0)
goto bad_inode;
/* success */
+ clear_bit(AFS_VNODE_UNSET, &vnode->flags);
+ inode->i_flags |= S_NOATIME;
unlock_new_inode(inode);
-
- *_inode = inode;
- _leave(" = 0 [CB { v=%u x=%lu t=%u }]",
- vnode->cb_version,
- vnode->cb_timeout.timo_jif,
- vnode->cb_type);
- return 0;
+ _leave(" = %p [CB { v=%u t=%u }]", inode, vnode->cb_version, vnode->cb_type);
+ return inode;
/* failure */
- bad_inode:
+bad_inode:
make_bad_inode(inode);
unlock_new_inode(inode);
iput(inode);
_leave(" = %d [bad]", ret);
+ return ERR_PTR(ret);
+}
+
+/*
+ * validate a vnode/inode
+ * - there are several things we need to check
+ * - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
+ * symlink)
+ * - parent dir metadata changed (security changes)
+ * - dentry data changed (write, truncate)
+ * - dentry metadata changed (security changes)
+ */
+int afs_validate(struct afs_vnode *vnode, struct key *key)
+{
+ int ret;
+
+ _enter("{v={%x:%u} fl=%lx},%x",
+ vnode->fid.vid, vnode->fid.vnode, vnode->flags,
+ key_serial(key));
+
+ if (vnode->cb_promised &&
+ !test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+ !test_bit(AFS_VNODE_MODIFIED, &vnode->flags) &&
+ !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+ if (vnode->cb_expires < get_seconds() + 10) {
+ _debug("callback expired");
+ set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+ } else {
+ goto valid;
+ }
+ }
+
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+ goto valid;
+
+ mutex_lock(&vnode->validate_lock);
+
+ /* if the promise has expired, we need to check the server again to get
+ * a new promise - note that if the (parent) directory's metadata was
+ * changed then the security may be different and we may no longer have
+ * access */
+ if (!vnode->cb_promised ||
+ test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+ _debug("not promised");
+ ret = afs_vnode_fetch_status(vnode, NULL, key);
+ if (ret < 0)
+ goto error_unlock;
+ _debug("new promise [fl=%lx]", vnode->flags);
+ }
+
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+ _debug("file already deleted");
+ ret = -ESTALE;
+ goto error_unlock;
+ }
+
+ /* if the vnode's data version number changed then its contents are
+ * different */
+ if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+ _debug("zap data {%x:%d}", vnode->fid.vid, vnode->fid.vnode);
+ invalidate_remote_inode(&vnode->vfs_inode);
+ }
+
+ clear_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+ mutex_unlock(&vnode->validate_lock);
+valid:
+ _leave(" = 0");
+ return 0;
+
+error_unlock:
+ mutex_unlock(&vnode->validate_lock);
+ _leave(" = %d", ret);
return ret;
-} /* end afs_iget() */
+}
-/*****************************************************************************/
/*
* read the attributes of an inode
*/
int afs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
- struct afs_vnode *vnode;
struct inode *inode;
- int ret;
inode = dentry->d_inode;
_enter("{ ino=%lu v=%lu }", inode->i_ino, inode->i_version);
- vnode = AFS_FS_I(inode);
-
- ret = afs_inode_fetch_status(inode);
- if (ret == -ENOENT) {
- _leave(" = %d [%d %p]",
- ret, atomic_read(&dentry->d_count), dentry->d_inode);
- return ret;
- }
- else if (ret < 0) {
- make_bad_inode(inode);
- _leave(" = %d", ret);
- return ret;
- }
-
- /* transfer attributes from the inode structure to the stat
- * structure */
generic_fillattr(inode, stat);
-
- _leave(" = 0 CB { v=%u x=%u t=%u }",
- vnode->cb_version,
- vnode->cb_expiry,
- vnode->cb_type);
-
return 0;
-} /* end afs_inode_getattr() */
+}
-/*****************************************************************************/
/*
* clear an AFS inode
*/
void afs_clear_inode(struct inode *inode)
{
+ struct afs_permits *permits;
struct afs_vnode *vnode;
vnode = AFS_FS_I(inode);
- _enter("ino=%lu { vn=%08x v=%u x=%u t=%u }",
- inode->i_ino,
+ _enter("{%x:%d.%d} v=%u x=%u t=%u }",
+ vnode->fid.vid,
vnode->fid.vnode,
+ vnode->fid.unique,
vnode->cb_version,
vnode->cb_expiry,
- vnode->cb_type
- );
+ vnode->cb_type);
- BUG_ON(inode->i_ino != vnode->fid.vnode);
+ _debug("CLEAR INODE %p", inode);
- afs_vnode_give_up_callback(vnode);
+ ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
+
+ afs_give_up_callback(vnode);
+
+ if (vnode->server) {
+ spin_lock(&vnode->server->fs_lock);
+ rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes);
+ spin_unlock(&vnode->server->fs_lock);
+ afs_put_server(vnode->server);
+ vnode->server = NULL;
+ }
+
+ ASSERT(!vnode->cb_promised);
#ifdef AFS_CACHING_SUPPORT
cachefs_relinquish_cookie(vnode->cache, 0);
vnode->cache = NULL;
#endif
+ mutex_lock(&vnode->permits_lock);
+ permits = vnode->permits;
+ rcu_assign_pointer(vnode->permits, NULL);
+ mutex_unlock(&vnode->permits_lock);
+ if (permits)
+ call_rcu(&permits->rcu, afs_zap_permits);
+
_leave("");
-} /* end afs_clear_inode() */
+}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5151d5da2c2f..6dd3197d1d8d 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1,6 +1,6 @@
-/* internal.h: internal AFS stuff
+/* internal AFS stuff
*
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
@@ -9,48 +9,391 @@
* 2 of the License, or (at your option) any later version.
*/
-#ifndef AFS_INTERNAL_H
-#define AFS_INTERNAL_H
-
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
+#include <linux/skbuff.h>
+#include <linux/rxrpc.h>
+#include <linux/key.h>
+#include "afs.h"
+#include "afs_vl.h"
+
+#define AFS_CELL_MAX_ADDRS 15
+
+struct afs_call;
+
+typedef enum {
+ AFS_VL_NEW, /* new, uninitialised record */
+ AFS_VL_CREATING, /* creating record */
+ AFS_VL_VALID, /* record is pending */
+ AFS_VL_NO_VOLUME, /* no such volume available */
+ AFS_VL_UPDATING, /* update in progress */
+ AFS_VL_VOLUME_DELETED, /* volume was deleted */
+ AFS_VL_UNCERTAIN, /* uncertain state (update failed) */
+} __attribute__((packed)) afs_vlocation_state_t;
+
+struct afs_mount_params {
+ bool rwpath; /* T if the parent should be considered R/W */
+ bool force; /* T to force cell type */
+ afs_voltype_t type; /* type of volume requested */
+ int volnamesz; /* size of volume name */
+ const char *volname; /* name of volume to mount */
+ struct afs_cell *cell; /* cell in which to find volume */
+ struct afs_volume *volume; /* volume record */
+ struct key *key; /* key to use for secure mounting */
+};
/*
- * debug tracing
+ * definition of how to wait for the completion of an operation
*/
-#define kenter(FMT, a...) printk("==> %s("FMT")\n",__FUNCTION__ , ## a)
-#define kleave(FMT, a...) printk("<== %s()"FMT"\n",__FUNCTION__ , ## a)
-#define kdebug(FMT, a...) printk(FMT"\n" , ## a)
-#define kproto(FMT, a...) printk("### "FMT"\n" , ## a)
-#define knet(FMT, a...) printk(FMT"\n" , ## a)
-
-#ifdef __KDEBUG
-#define _enter(FMT, a...) kenter(FMT , ## a)
-#define _leave(FMT, a...) kleave(FMT , ## a)
-#define _debug(FMT, a...) kdebug(FMT , ## a)
-#define _proto(FMT, a...) kproto(FMT , ## a)
-#define _net(FMT, a...) knet(FMT , ## a)
-#else
-#define _enter(FMT, a...) do { } while(0)
-#define _leave(FMT, a...) do { } while(0)
-#define _debug(FMT, a...) do { } while(0)
-#define _proto(FMT, a...) do { } while(0)
-#define _net(FMT, a...) do { } while(0)
-#endif
+struct afs_wait_mode {
+ /* RxRPC received message notification */
+ void (*rx_wakeup)(struct afs_call *call);
-static inline void afs_discard_my_signals(void)
-{
- while (signal_pending(current)) {
- siginfo_t sinfo;
+ /* synchronous call waiter and call dispatched notification */
+ int (*wait)(struct afs_call *call);
+
+ /* asynchronous call completion */
+ void (*async_complete)(void *reply, int error);
+};
+
+extern const struct afs_wait_mode afs_sync_call;
+extern const struct afs_wait_mode afs_async_call;
- spin_lock_irq(&current->sighand->siglock);
- dequeue_signal(current,&current->blocked, &sinfo);
- spin_unlock_irq(&current->sighand->siglock);
- }
+/*
+ * a record of an in-progress RxRPC call
+ */
+struct afs_call {
+ const struct afs_call_type *type; /* type of call */
+ const struct afs_wait_mode *wait_mode; /* completion wait mode */
+ wait_queue_head_t waitq; /* processes awaiting completion */
+ struct work_struct async_work; /* asynchronous work processor */
+ struct work_struct work; /* actual work processor */
+ struct sk_buff_head rx_queue; /* received packets */
+ struct rxrpc_call *rxcall; /* RxRPC call handle */
+ struct key *key; /* security for this call */
+ struct afs_server *server; /* server affected by incoming CM call */
+ void *request; /* request data (first part) */
+ void *request2; /* request data (second part) */
+ void *buffer; /* reply receive buffer */
+ void *reply; /* reply buffer (first part) */
+ void *reply2; /* reply buffer (second part) */
+ void *reply3; /* reply buffer (third part) */
+ void *reply4; /* reply buffer (fourth part) */
+ enum { /* call state */
+ AFS_CALL_REQUESTING, /* request is being sent for outgoing call */
+ AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */
+ AFS_CALL_AWAIT_OP_ID, /* awaiting op ID on incoming call */
+ AFS_CALL_AWAIT_REQUEST, /* awaiting request data on incoming call */
+ AFS_CALL_REPLYING, /* replying to incoming call */
+ AFS_CALL_AWAIT_ACK, /* awaiting final ACK of incoming call */
+ AFS_CALL_COMPLETE, /* successfully completed */
+ AFS_CALL_BUSY, /* server was busy */
+ AFS_CALL_ABORTED, /* call was aborted */
+ AFS_CALL_ERROR, /* call failed due to error */
+ } state;
+ int error; /* error code */
+ unsigned request_size; /* size of request data */
+ unsigned reply_max; /* maximum size of reply */
+ unsigned reply_size; /* current size of reply */
+ unsigned short offset; /* offset into received data store */
+ unsigned char unmarshall; /* unmarshalling phase */
+ bool incoming; /* T if incoming call */
+ u16 service_id; /* RxRPC service ID to call */
+ __be16 port; /* target UDP port */
+ __be32 operation_ID; /* operation ID for an incoming call */
+ u32 count; /* count for use in unmarshalling */
+ __be32 tmp; /* place to extract temporary data */
+};
+
+struct afs_call_type {
+ const char *name;
+
+ /* deliver request or reply data to an call
+ * - returning an error will cause the call to be aborted
+ */
+ int (*deliver)(struct afs_call *call, struct sk_buff *skb,
+ bool last);
+
+ /* map an abort code to an error number */
+ int (*abort_to_error)(u32 abort_code);
+
+ /* clean up a call */
+ void (*destructor)(struct afs_call *call);
+};
+
+/*
+ * AFS superblock private data
+ * - there's one superblock per volume
+ */
+struct afs_super_info {
+ struct afs_volume *volume; /* volume record */
+ char rwparent; /* T if parent is R/W AFS volume */
+};
+
+static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
+{
+ return sb->s_fs_info;
}
+extern struct file_system_type afs_fs_type;
+
+/*
+ * entry in the cached cell catalogue
+ */
+struct afs_cache_cell {
+ char name[AFS_MAXCELLNAME]; /* cell name (padded with NULs) */
+ struct in_addr vl_servers[15]; /* cached cell VL servers */
+};
+
+/*
+ * AFS cell record
+ */
+struct afs_cell {
+ atomic_t usage;
+ struct list_head link; /* main cell list link */
+ struct key *anonymous_key; /* anonymous user key for this cell */
+ struct list_head proc_link; /* /proc cell list link */
+ struct proc_dir_entry *proc_dir; /* /proc dir for this cell */
+#ifdef AFS_CACHING_SUPPORT
+ struct cachefs_cookie *cache; /* caching cookie */
+#endif
+
+ /* server record management */
+ rwlock_t servers_lock; /* active server list lock */
+ struct list_head servers; /* active server list */
+
+ /* volume location record management */
+ struct rw_semaphore vl_sem; /* volume management serialisation semaphore */
+ struct list_head vl_list; /* cell's active VL record list */
+ spinlock_t vl_lock; /* vl_list lock */
+ unsigned short vl_naddrs; /* number of VL servers in addr list */
+ unsigned short vl_curr_svix; /* current server index */
+ struct in_addr vl_addrs[AFS_CELL_MAX_ADDRS]; /* cell VL server addresses */
+
+ char name[0]; /* cell name - must go last */
+};
+
+/*
+ * entry in the cached volume location catalogue
+ */
+struct afs_cache_vlocation {
+ /* volume name (lowercase, padded with NULs) */
+ uint8_t name[AFS_MAXVOLNAME + 1];
+
+ uint8_t nservers; /* number of entries used in servers[] */
+ uint8_t vidmask; /* voltype mask for vid[] */
+ uint8_t srvtmask[8]; /* voltype masks for servers[] */
+#define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */
+#define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */
+#define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */
+
+ afs_volid_t vid[3]; /* volume IDs for R/W, R/O and Bak volumes */
+ struct in_addr servers[8]; /* fileserver addresses */
+ time_t rtime; /* last retrieval time */
+};
+
+/*
+ * volume -> vnode hash table entry
+ */
+struct afs_cache_vhash {
+ afs_voltype_t vtype; /* which volume variation */
+ uint8_t hash_bucket; /* which hash bucket this represents */
+} __attribute__((packed));
+
+/*
+ * AFS volume location record
+ */
+struct afs_vlocation {
+ atomic_t usage;
+ time_t time_of_death; /* time at which put reduced usage to 0 */
+ struct list_head link; /* link in cell volume location list */
+ struct list_head grave; /* link in master graveyard list */
+ struct list_head update; /* link in master update list */
+ struct afs_cell *cell; /* cell to which volume belongs */
+#ifdef AFS_CACHING_SUPPORT
+ struct cachefs_cookie *cache; /* caching cookie */
+#endif
+ struct afs_cache_vlocation vldb; /* volume information DB record */
+ struct afs_volume *vols[3]; /* volume access record pointer (index by type) */
+ wait_queue_head_t waitq; /* status change waitqueue */
+ time_t update_at; /* time at which record should be updated */
+ spinlock_t lock; /* access lock */
+ afs_vlocation_state_t state; /* volume location state */
+ unsigned short upd_rej_cnt; /* ENOMEDIUM count during update */
+ unsigned short upd_busy_cnt; /* EBUSY count during update */
+ bool valid; /* T if valid */
+};
+
+/*
+ * AFS fileserver record
+ */
+struct afs_server {
+ atomic_t usage;
+ time_t time_of_death; /* time at which put reduced usage to 0 */
+ struct in_addr addr; /* server address */
+ struct afs_cell *cell; /* cell in which server resides */
+ struct list_head link; /* link in cell's server list */
+ struct list_head grave; /* link in master graveyard list */
+ struct rb_node master_rb; /* link in master by-addr tree */
+ struct rw_semaphore sem; /* access lock */
+
+ /* file service access */
+ struct rb_root fs_vnodes; /* vnodes backed by this server (ordered by FID) */
+ unsigned long fs_act_jif; /* time at which last activity occurred */
+ unsigned long fs_dead_jif; /* time at which no longer to be considered dead */
+ spinlock_t fs_lock; /* access lock */
+ int fs_state; /* 0 or reason FS currently marked dead (-errno) */
+
+ /* callback promise management */
+ struct rb_root cb_promises; /* vnode expiration list (ordered earliest first) */
+ struct delayed_work cb_updater; /* callback updater */
+ struct delayed_work cb_break_work; /* collected break dispatcher */
+ wait_queue_head_t cb_break_waitq; /* space available in cb_break waitqueue */
+ spinlock_t cb_lock; /* access lock */
+ struct afs_callback cb_break[64]; /* ring of callbacks awaiting breaking */
+ atomic_t cb_break_n; /* number of pending breaks */
+ u8 cb_break_head; /* head of callback breaking ring */
+ u8 cb_break_tail; /* tail of callback breaking ring */
+};
+
+/*
+ * AFS volume access record
+ */
+struct afs_volume {
+ atomic_t usage;
+ struct afs_cell *cell; /* cell to which belongs (unrefd ptr) */
+ struct afs_vlocation *vlocation; /* volume location */
+#ifdef AFS_CACHING_SUPPORT
+ struct cachefs_cookie *cache; /* caching cookie */
+#endif
+ afs_volid_t vid; /* volume ID */
+ afs_voltype_t type; /* type of volume */
+ char type_force; /* force volume type (suppress R/O -> R/W) */
+ unsigned short nservers; /* number of server slots filled */
+ unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */
+ struct afs_server *servers[8]; /* servers on which volume resides (ordered) */
+ struct rw_semaphore server_sem; /* lock for accessing current server */
+};
+
+/*
+ * vnode catalogue entry
+ */
+struct afs_cache_vnode {
+ afs_vnodeid_t vnode_id; /* vnode ID */
+ unsigned vnode_unique; /* vnode ID uniquifier */
+ afs_dataversion_t data_version; /* data version */
+};
+
+/*
+ * AFS inode private data
+ */
+struct afs_vnode {
+ struct inode vfs_inode; /* the VFS's inode record */
+
+ struct afs_volume *volume; /* volume on which vnode resides */
+ struct afs_server *server; /* server currently supplying this file */
+ struct afs_fid fid; /* the file identifier for this inode */
+ struct afs_file_status status; /* AFS status info for this file */
+#ifdef AFS_CACHING_SUPPORT
+ struct cachefs_cookie *cache; /* caching cookie */
+#endif
+ struct afs_permits *permits; /* cache of permits so far obtained */
+ struct mutex permits_lock; /* lock for altering permits list */
+ struct mutex validate_lock; /* lock for validating this vnode */
+ wait_queue_head_t update_waitq; /* status fetch waitqueue */
+ int update_cnt; /* number of outstanding ops that will update the
+ * status */
+ spinlock_t lock; /* waitqueue/flags lock */
+ unsigned long flags;
+#define AFS_VNODE_CB_BROKEN 0 /* set if vnode's callback was broken */
+#define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */
+#define AFS_VNODE_MODIFIED 2 /* set if vnode's data modified */
+#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */
+#define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */
+#define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */
+
+ long acl_order; /* ACL check count (callback break count) */
+
+ /* outstanding callback notification on this file */
+ struct rb_node server_rb; /* link in server->fs_vnodes */
+ struct rb_node cb_promise; /* link in server->cb_promises */
+ struct work_struct cb_broken_work; /* work to be done on callback break */
+ time_t cb_expires; /* time at which callback expires */
+ time_t cb_expires_at; /* time used to order cb_promise */
+ unsigned cb_version; /* callback version */
+ unsigned cb_expiry; /* callback expiry time */
+ afs_callback_type_t cb_type; /* type of callback */
+ bool cb_promised; /* true if promise still holds */
+};
+
+/*
+ * cached security record for one user's attempt to access a vnode
+ */
+struct afs_permit {
+ struct key *key; /* RxRPC ticket holding a security context */
+ afs_access_t access_mask; /* access mask for this key */
+};
+
+/*
+ * cache of security records from attempts to access a vnode
+ */
+struct afs_permits {
+ struct rcu_head rcu; /* disposal procedure */
+ int count; /* number of records */
+ struct afs_permit permits[0]; /* the permits so far examined */
+};
+
+/*
+ * record of one of a system's set of network interfaces
+ */
+struct afs_interface {
+ unsigned index; /* interface index */
+ struct in_addr address; /* IPv4 address bound to interface */
+ struct in_addr netmask; /* netmask applied to address */
+ unsigned mtu; /* MTU of interface */
+};
+
+/*
+ * UUID definition [internet draft]
+ * - the timestamp is a 60-bit value, split 32/16/12, and goes in 100ns
+ * increments since midnight 15th October 1582
+ * - add AFS_UUID_TO_UNIX_TIME to convert unix time in 100ns units to UUID
+ * time
+ * - the clock sequence is a 14-bit counter to avoid duplicate times
+ */
+struct afs_uuid {
+ u32 time_low; /* low part of timestamp */
+ u16 time_mid; /* mid part of timestamp */
+ u16 time_hi_and_version; /* high part of timestamp and version */
+#define AFS_UUID_TO_UNIX_TIME 0x01b21dd213814000
+#define AFS_UUID_TIMEHI_MASK 0x0fff
+#define AFS_UUID_VERSION_TIME 0x1000 /* time-based UUID */
+#define AFS_UUID_VERSION_NAME 0x3000 /* name-based UUID */
+#define AFS_UUID_VERSION_RANDOM 0x4000 /* (pseudo-)random generated UUID */
+ u8 clock_seq_hi_and_reserved; /* clock seq hi and variant */
+#define AFS_UUID_CLOCKHI_MASK 0x3f
+#define AFS_UUID_VARIANT_STD 0x80
+ u8 clock_seq_low; /* clock seq low */
+ u8 node[6]; /* spatially unique node ID (MAC addr) */
+};
+
+/*****************************************************************************/
+/*
+ * callback.c
+ */
+extern void afs_init_callback_state(struct afs_server *);
+extern void afs_broken_callback_work(struct work_struct *);
+extern void afs_break_callbacks(struct afs_server *, size_t,
+ struct afs_callback[]);
+extern void afs_discard_callback_on_delete(struct afs_vnode *);
+extern void afs_give_up_callback(struct afs_vnode *);
+extern void afs_dispatch_give_up_callbacks(struct work_struct *);
+extern void afs_flush_callback_breaks(struct afs_server *);
+extern int __init afs_callback_update_init(void);
+extern void __exit afs_callback_update_kill(void);
+
/*
* cell.c
*/
@@ -60,57 +403,156 @@ extern struct list_head afs_proc_cells;
extern struct cachefs_index_def afs_cache_cell_index_def;
#endif
+#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
+extern int afs_cell_init(char *);
+extern struct afs_cell *afs_cell_create(const char *, char *);
+extern struct afs_cell *afs_cell_lookup(const char *, unsigned);
+extern struct afs_cell *afs_grab_cell(struct afs_cell *);
+extern void afs_put_cell(struct afs_cell *);
+extern void afs_cell_purge(void);
+
+/*
+ * cmservice.c
+ */
+extern bool afs_cm_incoming_call(struct afs_call *);
+
/*
* dir.c
*/
extern const struct inode_operations afs_dir_inode_operations;
extern const struct file_operations afs_dir_file_operations;
+extern int afs_permission(struct inode *, int, struct nameidata *);
+
/*
* file.c
*/
extern const struct address_space_operations afs_fs_aops;
extern const struct inode_operations afs_file_inode_operations;
+extern const struct file_operations afs_file_operations;
+
+extern int afs_open(struct inode *, struct file *);
+extern int afs_release(struct inode *, struct file *);
#ifdef AFS_CACHING_SUPPORT
-extern int afs_cache_get_page_cookie(struct page *page,
- struct cachefs_page **_page_cookie);
+extern int afs_cache_get_page_cookie(struct page *, struct cachefs_page **);
#endif
/*
- * inode.c
+ * fsclient.c
*/
-extern int afs_iget(struct super_block *sb, struct afs_fid *fid,
- struct inode **_inode);
-extern int afs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat);
-extern void afs_clear_inode(struct inode *inode);
+extern int afs_fs_fetch_file_status(struct afs_server *, struct key *,
+ struct afs_vnode *, struct afs_volsync *,
+ const struct afs_wait_mode *);
+extern int afs_fs_give_up_callbacks(struct afs_server *,
+ const struct afs_wait_mode *);
+extern int afs_fs_fetch_data(struct afs_server *, struct key *,
+ struct afs_vnode *, off_t, size_t, struct page *,
+ const struct afs_wait_mode *);
+extern int afs_fs_create(struct afs_server *, struct key *,
+ struct afs_vnode *, const char *, umode_t,
+ struct afs_fid *, struct afs_file_status *,
+ struct afs_callback *,
+ const struct afs_wait_mode *);
+extern int afs_fs_remove(struct afs_server *, struct key *,
+ struct afs_vnode *, const char *, bool,
+ const struct afs_wait_mode *);
+extern int afs_fs_link(struct afs_server *, struct key *, struct afs_vnode *,
+ struct afs_vnode *, const char *,
+ const struct afs_wait_mode *);
+extern int afs_fs_symlink(struct afs_server *, struct key *,
+ struct afs_vnode *, const char *, const char *,
+ struct afs_fid *, struct afs_file_status *,
+ const struct afs_wait_mode *);
+extern int afs_fs_rename(struct afs_server *, struct key *,
+ struct afs_vnode *, const char *,
+ struct afs_vnode *, const char *,
+ const struct afs_wait_mode *);
/*
- * key_afs.c
+ * inode.c
*/
-#ifdef CONFIG_KEYS
-extern int afs_key_register(void);
-extern void afs_key_unregister(void);
-#endif
+extern struct inode *afs_iget(struct super_block *, struct key *,
+ struct afs_fid *, struct afs_file_status *,
+ struct afs_callback *);
+extern int afs_validate(struct afs_vnode *, struct key *);
+extern int afs_inode_getattr(struct vfsmount *, struct dentry *,
+ struct kstat *);
+extern void afs_zap_permits(struct rcu_head *);
+extern void afs_clear_inode(struct inode *);
/*
* main.c
*/
+extern struct afs_uuid afs_uuid;
#ifdef AFS_CACHING_SUPPORT
extern struct cachefs_netfs afs_cache_netfs;
#endif
/*
+ * misc.c
+ */
+extern int afs_abort_to_error(u32);
+
+/*
* mntpt.c
*/
extern const struct inode_operations afs_mntpt_inode_operations;
extern const struct file_operations afs_mntpt_file_operations;
-extern struct afs_timer afs_mntpt_expiry_timer;
-extern struct afs_timer_ops afs_mntpt_expiry_timer_ops;
extern unsigned long afs_mntpt_expiry_timeout;
-extern int afs_mntpt_check_symlink(struct afs_vnode *vnode);
+extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
+extern void afs_mntpt_kill_timer(void);
+extern void afs_umount_begin(struct vfsmount *, int);
+
+/*
+ * proc.c
+ */
+extern int afs_proc_init(void);
+extern void afs_proc_cleanup(void);
+extern int afs_proc_cell_setup(struct afs_cell *);
+extern void afs_proc_cell_remove(struct afs_cell *);
+
+/*
+ * rxrpc.c
+ */
+extern int afs_open_socket(void);
+extern void afs_close_socket(void);
+extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t,
+ const struct afs_wait_mode *);
+extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *,
+ size_t, size_t);
+extern void afs_flat_call_destructor(struct afs_call *);
+extern void afs_transfer_reply(struct afs_call *, struct sk_buff *);
+extern void afs_send_empty_reply(struct afs_call *);
+extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
+extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *,
+ size_t);
+
+/*
+ * security.c
+ */
+extern void afs_clear_permits(struct afs_vnode *);
+extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
+extern struct key *afs_request_key(struct afs_cell *);
+extern int afs_permission(struct inode *, int, struct nameidata *);
+
+/*
+ * server.c
+ */
+extern spinlock_t afs_server_peer_lock;
+
+#define afs_get_server(S) \
+do { \
+ _debug("GET SERVER %d", atomic_read(&(S)->usage)); \
+ atomic_inc(&(S)->usage); \
+} while(0)
+
+extern struct afs_server *afs_lookup_server(struct afs_cell *,
+ const struct in_addr *);
+extern struct afs_server *afs_find_server(const struct in_addr *);
+extern void afs_put_server(struct afs_server *);
+extern void __exit afs_purge_servers(void);
/*
* super.c
@@ -118,22 +560,211 @@ extern int afs_mntpt_check_symlink(struct afs_vnode *vnode);
extern int afs_fs_init(void);
extern void afs_fs_exit(void);
-#define AFS_CB_HASH_COUNT (PAGE_SIZE / sizeof(struct list_head))
+/*
+ * use-rtnetlink.c
+ */
+extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool);
+extern int afs_get_MAC_address(u8 [6]);
-extern struct list_head afs_cb_hash_tbl[];
-extern spinlock_t afs_cb_hash_lock;
+/*
+ * vlclient.c
+ */
+#ifdef AFS_CACHING_SUPPORT
+extern struct cachefs_index_def afs_vlocation_cache_index_def;
+#endif
-#define afs_cb_hash(SRV,FID) \
- afs_cb_hash_tbl[((unsigned long)(SRV) + \
- (FID)->vid + (FID)->vnode + (FID)->unique) % \
- AFS_CB_HASH_COUNT]
+extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
+ const char *, struct afs_cache_vlocation *,
+ const struct afs_wait_mode *);
+extern int afs_vl_get_entry_by_id(struct in_addr *, struct key *,
+ afs_volid_t, afs_voltype_t,
+ struct afs_cache_vlocation *,
+ const struct afs_wait_mode *);
/*
- * proc.c
+ * vlocation.c
*/
-extern int afs_proc_init(void);
-extern void afs_proc_cleanup(void);
-extern int afs_proc_cell_setup(struct afs_cell *cell);
-extern void afs_proc_cell_remove(struct afs_cell *cell);
+#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0)
+
+extern int __init afs_vlocation_update_init(void);
+extern struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *,
+ struct key *,
+ const char *, size_t);
+extern void afs_put_vlocation(struct afs_vlocation *);
+extern void __exit afs_vlocation_purge(void);
+
+/*
+ * vnode.c
+ */
+#ifdef AFS_CACHING_SUPPORT
+extern struct cachefs_index_def afs_vnode_cache_index_def;
+#endif
+
+extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
+
+static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
+{
+ return container_of(inode, struct afs_vnode, vfs_inode);
+}
+
+static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
+{
+ return &vnode->vfs_inode;
+}
+
+extern void afs_vnode_finalise_status_update(struct afs_vnode *,
+ struct afs_server *);
+extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *,
+ struct key *);
+extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *,
+ off_t, size_t, struct page *);
+extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *,
+ umode_t, struct afs_fid *, struct afs_file_status *,
+ struct afs_callback *, struct afs_server **);
+extern int afs_vnode_remove(struct afs_vnode *, struct key *, const char *,
+ bool);
+extern int afs_vnode_link(struct afs_vnode *, struct afs_vnode *, struct key *,
+ const char *);
+extern int afs_vnode_symlink(struct afs_vnode *, struct key *, const char *,
+ const char *, struct afs_fid *,
+ struct afs_file_status *, struct afs_server **);
+extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *,
+ struct key *, const char *, const char *);
+
+/*
+ * volume.c
+ */
+#ifdef AFS_CACHING_SUPPORT
+extern struct cachefs_index_def afs_volume_cache_index_def;
+#endif
+
+#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
+
+extern void afs_put_volume(struct afs_volume *);
+extern struct afs_volume *afs_volume_lookup(struct afs_mount_params *);
+extern struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *);
+extern int afs_volume_release_fileserver(struct afs_vnode *,
+ struct afs_server *, int);
+
+/*****************************************************************************/
+/*
+ * debug tracing
+ */
+extern unsigned afs_debug;
+
+#define dbgprintk(FMT,...) \
+ printk("[%x%-6.6s] "FMT"\n", smp_processor_id(), current->comm ,##__VA_ARGS__)
+
+/* make sure we maintain the format strings, even when debugging is disabled */
+static inline __attribute__((format(printf,1,2)))
+void _dbprintk(const char *fmt, ...)
+{
+}
+
+#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__)
+#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__)
+#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__)
+
+
+#if defined(__KDEBUG)
+#define _enter(FMT,...) kenter(FMT,##__VA_ARGS__)
+#define _leave(FMT,...) kleave(FMT,##__VA_ARGS__)
+#define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__)
+
+#elif defined(CONFIG_AFS_DEBUG)
+#define AFS_DEBUG_KENTER 0x01
+#define AFS_DEBUG_KLEAVE 0x02
+#define AFS_DEBUG_KDEBUG 0x04
+
+#define _enter(FMT,...) \
+do { \
+ if (unlikely(afs_debug & AFS_DEBUG_KENTER)) \
+ kenter(FMT,##__VA_ARGS__); \
+} while (0)
+
+#define _leave(FMT,...) \
+do { \
+ if (unlikely(afs_debug & AFS_DEBUG_KLEAVE)) \
+ kleave(FMT,##__VA_ARGS__); \
+} while (0)
+
+#define _debug(FMT,...) \
+do { \
+ if (unlikely(afs_debug & AFS_DEBUG_KDEBUG)) \
+ kdebug(FMT,##__VA_ARGS__); \
+} while (0)
+
+#else
+#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__)
+#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__)
+#define _debug(FMT,...) _dbprintk(" "FMT ,##__VA_ARGS__)
+#endif
+
+/*
+ * debug assertion checking
+ */
+#if 1 // defined(__KDEBUGALL)
+
+#define ASSERT(X) \
+do { \
+ if (unlikely(!(X))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "AFS: Assertion failed\n"); \
+ BUG(); \
+ } \
+} while(0)
+
+#define ASSERTCMP(X, OP, Y) \
+do { \
+ if (unlikely(!((X) OP (Y)))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "AFS: Assertion failed\n"); \
+ printk(KERN_ERR "%lu " #OP " %lu is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while(0)
+
+#define ASSERTIF(C, X) \
+do { \
+ if (unlikely((C) && !(X))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "AFS: Assertion failed\n"); \
+ BUG(); \
+ } \
+} while(0)
+
+#define ASSERTIFCMP(C, X, OP, Y) \
+do { \
+ if (unlikely((C) && !((X) OP (Y)))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "AFS: Assertion failed\n"); \
+ printk(KERN_ERR "%lu " #OP " %lu is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while(0)
+
+#else
+
+#define ASSERT(X) \
+do { \
+} while(0)
+
+#define ASSERTCMP(X, OP, Y) \
+do { \
+} while(0)
+
+#define ASSERTIF(C, X) \
+do { \
+} while(0)
+
+#define ASSERTIFCMP(C, X, OP, Y) \
+do { \
+} while(0)
-#endif /* AFS_INTERNAL_H */
+#endif /* __KDEBUGALL */
diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c
deleted file mode 100644
index 615df2407cb2..000000000000
--- a/fs/afs/kafsasyncd.c
+++ /dev/null
@@ -1,255 +0,0 @@
-/* kafsasyncd.c: AFS asynchronous operation daemon
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- *
- * The AFS async daemon is used to the following:
- * - probe "dead" servers to see whether they've come back to life yet.
- * - probe "live" servers that we haven't talked to for a while to see if they are better
- * candidates for serving than what we're currently using
- * - poll volume location servers to keep up to date volume location lists
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/completion.h>
-#include <linux/freezer.h>
-#include "cell.h"
-#include "server.h"
-#include "volume.h"
-#include "kafsasyncd.h"
-#include "kafstimod.h"
-#include <rxrpc/call.h>
-#include <asm/errno.h>
-#include "internal.h"
-
-static DECLARE_COMPLETION(kafsasyncd_alive);
-static DECLARE_COMPLETION(kafsasyncd_dead);
-static DECLARE_WAIT_QUEUE_HEAD(kafsasyncd_sleepq);
-static struct task_struct *kafsasyncd_task;
-static int kafsasyncd_die;
-
-static int kafsasyncd(void *arg);
-
-static LIST_HEAD(kafsasyncd_async_attnq);
-static LIST_HEAD(kafsasyncd_async_busyq);
-static DEFINE_SPINLOCK(kafsasyncd_async_lock);
-
-static void kafsasyncd_null_call_attn_func(struct rxrpc_call *call)
-{
-}
-
-static void kafsasyncd_null_call_error_func(struct rxrpc_call *call)
-{
-}
-
-/*****************************************************************************/
-/*
- * start the async daemon
- */
-int afs_kafsasyncd_start(void)
-{
- int ret;
-
- ret = kernel_thread(kafsasyncd, NULL, 0);
- if (ret < 0)
- return ret;
-
- wait_for_completion(&kafsasyncd_alive);
-
- return ret;
-} /* end afs_kafsasyncd_start() */
-
-/*****************************************************************************/
-/*
- * stop the async daemon
- */
-void afs_kafsasyncd_stop(void)
-{
- /* get rid of my daemon */
- kafsasyncd_die = 1;
- wake_up(&kafsasyncd_sleepq);
- wait_for_completion(&kafsasyncd_dead);
-
-} /* end afs_kafsasyncd_stop() */
-
-/*****************************************************************************/
-/*
- * probing daemon
- */
-static int kafsasyncd(void *arg)
-{
- struct afs_async_op *op;
- int die;
-
- DECLARE_WAITQUEUE(myself, current);
-
- kafsasyncd_task = current;
-
- printk("kAFS: Started kafsasyncd %d\n", current->pid);
-
- daemonize("kafsasyncd");
-
- complete(&kafsasyncd_alive);
-
- /* loop around looking for things to attend to */
- do {
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kafsasyncd_sleepq, &myself);
-
- for (;;) {
- if (!list_empty(&kafsasyncd_async_attnq) ||
- signal_pending(current) ||
- kafsasyncd_die)
- break;
-
- schedule();
- set_current_state(TASK_INTERRUPTIBLE);
- }
-
- remove_wait_queue(&kafsasyncd_sleepq, &myself);
- set_current_state(TASK_RUNNING);
-
- try_to_freeze();
-
- /* discard pending signals */
- afs_discard_my_signals();
-
- die = kafsasyncd_die;
-
- /* deal with the next asynchronous operation requiring
- * attention */
- if (!list_empty(&kafsasyncd_async_attnq)) {
- struct afs_async_op *op;
-
- _debug("@@@ Begin Asynchronous Operation");
-
- op = NULL;
- spin_lock(&kafsasyncd_async_lock);
-
- if (!list_empty(&kafsasyncd_async_attnq)) {
- op = list_entry(kafsasyncd_async_attnq.next,
- struct afs_async_op, link);
- list_move_tail(&op->link,
- &kafsasyncd_async_busyq);
- }
-
- spin_unlock(&kafsasyncd_async_lock);
-
- _debug("@@@ Operation %p {%p}\n",
- op, op ? op->ops : NULL);
-
- if (op)
- op->ops->attend(op);
-
- _debug("@@@ End Asynchronous Operation");
- }
-
- } while(!die);
-
- /* need to kill all outstanding asynchronous operations before
- * exiting */
- kafsasyncd_task = NULL;
- spin_lock(&kafsasyncd_async_lock);
-
- /* fold the busy and attention queues together */
- list_splice_init(&kafsasyncd_async_busyq,
- &kafsasyncd_async_attnq);
-
- /* dequeue kafsasyncd from all their wait queues */
- list_for_each_entry(op, &kafsasyncd_async_attnq, link) {
- op->call->app_attn_func = kafsasyncd_null_call_attn_func;
- op->call->app_error_func = kafsasyncd_null_call_error_func;
- remove_wait_queue(&op->call->waitq, &op->waiter);
- }
-
- spin_unlock(&kafsasyncd_async_lock);
-
- /* abort all the operations */
- while (!list_empty(&kafsasyncd_async_attnq)) {
- op = list_entry(kafsasyncd_async_attnq.next, struct afs_async_op, link);
- list_del_init(&op->link);
-
- rxrpc_call_abort(op->call, -EIO);
- rxrpc_put_call(op->call);
- op->call = NULL;
-
- op->ops->discard(op);
- }
-
- /* and that's all */
- _leave("");
- complete_and_exit(&kafsasyncd_dead, 0);
-
-} /* end kafsasyncd() */
-
-/*****************************************************************************/
-/*
- * begin an operation
- * - place operation on busy queue
- */
-void afs_kafsasyncd_begin_op(struct afs_async_op *op)
-{
- _enter("");
-
- spin_lock(&kafsasyncd_async_lock);
-
- init_waitqueue_entry(&op->waiter, kafsasyncd_task);
- add_wait_queue(&op->call->waitq, &op->waiter);
-
- list_move_tail(&op->link, &kafsasyncd_async_busyq);
-
- spin_unlock(&kafsasyncd_async_lock);
-
- _leave("");
-} /* end afs_kafsasyncd_begin_op() */
-
-/*****************************************************************************/
-/*
- * request attention for an operation
- * - move to attention queue
- */
-void afs_kafsasyncd_attend_op(struct afs_async_op *op)
-{
- _enter("");
-
- spin_lock(&kafsasyncd_async_lock);
-
- list_move_tail(&op->link, &kafsasyncd_async_attnq);
-
- spin_unlock(&kafsasyncd_async_lock);
-
- wake_up(&kafsasyncd_sleepq);
-
- _leave("");
-} /* end afs_kafsasyncd_attend_op() */
-
-/*****************************************************************************/
-/*
- * terminate an operation
- * - remove from either queue
- */
-void afs_kafsasyncd_terminate_op(struct afs_async_op *op)
-{
- _enter("");
-
- spin_lock(&kafsasyncd_async_lock);
-
- if (!list_empty(&op->link)) {
- list_del_init(&op->link);
- remove_wait_queue(&op->call->waitq, &op->waiter);
- }
-
- spin_unlock(&kafsasyncd_async_lock);
-
- wake_up(&kafsasyncd_sleepq);
-
- _leave("");
-} /* end afs_kafsasyncd_terminate_op() */
diff --git a/fs/afs/kafsasyncd.h b/fs/afs/kafsasyncd.h
deleted file mode 100644
index 791803f9a6fb..000000000000
--- a/fs/afs/kafsasyncd.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* kafsasyncd.h: AFS asynchronous operation daemon
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_KAFSASYNCD_H
-#define _LINUX_AFS_KAFSASYNCD_H
-
-#include "types.h"
-
-struct afs_async_op;
-
-struct afs_async_op_ops {
- void (*attend)(struct afs_async_op *op);
- void (*discard)(struct afs_async_op *op);
-};
-
-/*****************************************************************************/
-/*
- * asynchronous operation record
- */
-struct afs_async_op
-{
- struct list_head link;
- struct afs_server *server; /* server being contacted */
- struct rxrpc_call *call; /* RxRPC call performing op */
- wait_queue_t waiter; /* wait queue for kafsasyncd */
- const struct afs_async_op_ops *ops; /* operations */
-};
-
-static inline void afs_async_op_init(struct afs_async_op *op,
- const struct afs_async_op_ops *ops)
-{
- INIT_LIST_HEAD(&op->link);
- op->call = NULL;
- op->ops = ops;
-}
-
-extern int afs_kafsasyncd_start(void);
-extern void afs_kafsasyncd_stop(void);
-
-extern void afs_kafsasyncd_begin_op(struct afs_async_op *op);
-extern void afs_kafsasyncd_attend_op(struct afs_async_op *op);
-extern void afs_kafsasyncd_terminate_op(struct afs_async_op *op);
-
-#endif /* _LINUX_AFS_KAFSASYNCD_H */
diff --git a/fs/afs/kafstimod.c b/fs/afs/kafstimod.c
deleted file mode 100644
index 694344e4d3c7..000000000000
--- a/fs/afs/kafstimod.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/* kafstimod.c: AFS timeout daemon
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/completion.h>
-#include <linux/freezer.h>
-#include "cell.h"
-#include "volume.h"
-#include "kafstimod.h"
-#include <asm/errno.h>
-#include "internal.h"
-
-static DECLARE_COMPLETION(kafstimod_alive);
-static DECLARE_COMPLETION(kafstimod_dead);
-static DECLARE_WAIT_QUEUE_HEAD(kafstimod_sleepq);
-static int kafstimod_die;
-
-static LIST_HEAD(kafstimod_list);
-static DEFINE_SPINLOCK(kafstimod_lock);
-
-static int kafstimod(void *arg);
-
-/*****************************************************************************/
-/*
- * start the timeout daemon
- */
-int afs_kafstimod_start(void)
-{
- int ret;
-
- ret = kernel_thread(kafstimod, NULL, 0);
- if (ret < 0)
- return ret;
-
- wait_for_completion(&kafstimod_alive);
-
- return ret;
-} /* end afs_kafstimod_start() */
-
-/*****************************************************************************/
-/*
- * stop the timeout daemon
- */
-void afs_kafstimod_stop(void)
-{
- /* get rid of my daemon */
- kafstimod_die = 1;
- wake_up(&kafstimod_sleepq);
- wait_for_completion(&kafstimod_dead);
-
-} /* end afs_kafstimod_stop() */
-
-/*****************************************************************************/
-/*
- * timeout processing daemon
- */
-static int kafstimod(void *arg)
-{
- struct afs_timer *timer;
-
- DECLARE_WAITQUEUE(myself, current);
-
- printk("kAFS: Started kafstimod %d\n", current->pid);
-
- daemonize("kafstimod");
-
- complete(&kafstimod_alive);
-
- /* loop around looking for things to attend to */
- loop:
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kafstimod_sleepq, &myself);
-
- for (;;) {
- unsigned long jif;
- signed long timeout;
-
- /* deal with the server being asked to die */
- if (kafstimod_die) {
- remove_wait_queue(&kafstimod_sleepq, &myself);
- _leave("");
- complete_and_exit(&kafstimod_dead, 0);
- }
-
- try_to_freeze();
-
- /* discard pending signals */
- afs_discard_my_signals();
-
- /* work out the time to elapse before the next event */
- spin_lock(&kafstimod_lock);
- if (list_empty(&kafstimod_list)) {
- timeout = MAX_SCHEDULE_TIMEOUT;
- }
- else {
- timer = list_entry(kafstimod_list.next,
- struct afs_timer, link);
- timeout = timer->timo_jif;
- jif = jiffies;
-
- if (time_before_eq((unsigned long) timeout, jif))
- goto immediate;
-
- else {
- timeout = (long) timeout - (long) jiffies;
- }
- }
- spin_unlock(&kafstimod_lock);
-
- schedule_timeout(timeout);
-
- set_current_state(TASK_INTERRUPTIBLE);
- }
-
- /* the thing on the front of the queue needs processing
- * - we come here with the lock held and timer pointing to the expired
- * entry
- */
- immediate:
- remove_wait_queue(&kafstimod_sleepq, &myself);
- set_current_state(TASK_RUNNING);
-
- _debug("@@@ Begin Timeout of %p", timer);
-
- /* dequeue the timer */
- list_del_init(&timer->link);
- spin_unlock(&kafstimod_lock);
-
- /* call the timeout function */
- timer->ops->timed_out(timer);
-
- _debug("@@@ End Timeout");
- goto loop;
-
-} /* end kafstimod() */
-
-/*****************************************************************************/
-/*
- * (re-)queue a timer
- */
-void afs_kafstimod_add_timer(struct afs_timer *timer, unsigned long timeout)
-{
- struct afs_timer *ptimer;
- struct list_head *_p;
-
- _enter("%p,%lu", timer, timeout);
-
- spin_lock(&kafstimod_lock);
-
- list_del(&timer->link);
-
- /* the timer was deferred or reset - put it back in the queue at the
- * right place */
- timer->timo_jif = jiffies + timeout;
-
- list_for_each(_p, &kafstimod_list) {
- ptimer = list_entry(_p, struct afs_timer, link);
- if (time_before(timer->timo_jif, ptimer->timo_jif))
- break;
- }
-
- list_add_tail(&timer->link, _p); /* insert before stopping point */
-
- spin_unlock(&kafstimod_lock);
-
- wake_up(&kafstimod_sleepq);
-
- _leave("");
-} /* end afs_kafstimod_add_timer() */
-
-/*****************************************************************************/
-/*
- * dequeue a timer
- * - returns 0 if the timer was deleted or -ENOENT if it wasn't queued
- */
-int afs_kafstimod_del_timer(struct afs_timer *timer)
-{
- int ret = 0;
-
- _enter("%p", timer);
-
- spin_lock(&kafstimod_lock);
-
- if (list_empty(&timer->link))
- ret = -ENOENT;
- else
- list_del_init(&timer->link);
-
- spin_unlock(&kafstimod_lock);
-
- wake_up(&kafstimod_sleepq);
-
- _leave(" = %d", ret);
- return ret;
-} /* end afs_kafstimod_del_timer() */
diff --git a/fs/afs/kafstimod.h b/fs/afs/kafstimod.h
deleted file mode 100644
index e312f1a61a7f..000000000000
--- a/fs/afs/kafstimod.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* kafstimod.h: AFS timeout daemon
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_KAFSTIMOD_H
-#define _LINUX_AFS_KAFSTIMOD_H
-
-#include "types.h"
-
-struct afs_timer;
-
-struct afs_timer_ops {
- /* called when the front of the timer queue has timed out */
- void (*timed_out)(struct afs_timer *timer);
-};
-
-/*****************************************************************************/
-/*
- * AFS timer/timeout record
- */
-struct afs_timer
-{
- struct list_head link; /* link in timer queue */
- unsigned long timo_jif; /* timeout time */
- const struct afs_timer_ops *ops; /* timeout expiry function */
-};
-
-static inline void afs_timer_init(struct afs_timer *timer,
- const struct afs_timer_ops *ops)
-{
- INIT_LIST_HEAD(&timer->link);
- timer->ops = ops;
-}
-
-extern int afs_kafstimod_start(void);
-extern void afs_kafstimod_stop(void);
-
-extern void afs_kafstimod_add_timer(struct afs_timer *timer,
- unsigned long timeout);
-extern int afs_kafstimod_del_timer(struct afs_timer *timer);
-
-#endif /* _LINUX_AFS_KAFSTIMOD_H */
diff --git a/fs/afs/main.c b/fs/afs/main.c
index f2704ba53857..40c2704e7557 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -1,4 +1,4 @@
-/* main.c: AFS client file system
+/* AFS client file system
*
* Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -13,43 +13,21 @@
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/completion.h>
-#include <rxrpc/rxrpc.h>
-#include <rxrpc/transport.h>
-#include <rxrpc/call.h>
-#include <rxrpc/peer.h>
-#include "cache.h"
-#include "cell.h"
-#include "server.h"
-#include "fsclient.h"
-#include "cmservice.h"
-#include "kafstimod.h"
-#include "kafsasyncd.h"
#include "internal.h"
-struct rxrpc_transport *afs_transport;
-
-static int afs_adding_peer(struct rxrpc_peer *peer);
-static void afs_discarding_peer(struct rxrpc_peer *peer);
-
-
MODULE_DESCRIPTION("AFS Client File System");
MODULE_AUTHOR("Red Hat, Inc.");
MODULE_LICENSE("GPL");
+unsigned afs_debug;
+module_param_named(debug, afs_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(afs_debug, "AFS debugging mask");
+
static char *rootcell;
module_param(rootcell, charp, 0);
MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
-
-static struct rxrpc_peer_ops afs_peer_ops = {
- .adding = afs_adding_peer,
- .discarding = afs_discarding_peer,
-};
-
-struct list_head afs_cb_hash_tbl[AFS_CB_HASH_COUNT];
-DEFINE_SPINLOCK(afs_cb_hash_lock);
-
#ifdef AFS_CACHING_SUPPORT
static struct cachefs_netfs_operations afs_cache_ops = {
.get_page_cookie = afs_cache_get_page_cookie,
@@ -62,20 +40,63 @@ struct cachefs_netfs afs_cache_netfs = {
};
#endif
-/*****************************************************************************/
+struct afs_uuid afs_uuid;
+
+/*
+ * get a client UUID
+ */
+static int __init afs_get_client_UUID(void)
+{
+ struct timespec ts;
+ u64 uuidtime;
+ u16 clockseq;
+ int ret;
+
+ /* read the MAC address of one of the external interfaces and construct
+ * a UUID from it */
+ ret = afs_get_MAC_address(afs_uuid.node);
+ if (ret < 0)
+ return ret;
+
+ getnstimeofday(&ts);
+ uuidtime = (u64) ts.tv_sec * 1000 * 1000 * 10;
+ uuidtime += ts.tv_nsec / 100;
+ uuidtime += AFS_UUID_TO_UNIX_TIME;
+ afs_uuid.time_low = uuidtime;
+ afs_uuid.time_mid = uuidtime >> 32;
+ afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK;
+ afs_uuid.time_hi_and_version = AFS_UUID_VERSION_TIME;
+
+ get_random_bytes(&clockseq, 2);
+ afs_uuid.clock_seq_low = clockseq;
+ afs_uuid.clock_seq_hi_and_reserved =
+ (clockseq >> 8) & AFS_UUID_CLOCKHI_MASK;
+ afs_uuid.clock_seq_hi_and_reserved = AFS_UUID_VARIANT_STD;
+
+ _debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+ afs_uuid.time_low,
+ afs_uuid.time_mid,
+ afs_uuid.time_hi_and_version,
+ afs_uuid.clock_seq_hi_and_reserved,
+ afs_uuid.clock_seq_low,
+ afs_uuid.node[0], afs_uuid.node[1], afs_uuid.node[2],
+ afs_uuid.node[3], afs_uuid.node[4], afs_uuid.node[5]);
+
+ return 0;
+}
+
/*
* initialise the AFS client FS module
*/
static int __init afs_init(void)
{
- int loop, ret;
+ int ret;
printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n");
- /* initialise the callback hash table */
- spin_lock_init(&afs_cb_hash_lock);
- for (loop = AFS_CB_HASH_COUNT - 1; loop >= 0; loop--)
- INIT_LIST_HEAD(&afs_cb_hash_tbl[loop]);
+ ret = afs_get_client_UUID();
+ if (ret < 0)
+ return ret;
/* register the /proc stuff */
ret = afs_proc_init();
@@ -87,70 +108,56 @@ static int __init afs_init(void)
ret = cachefs_register_netfs(&afs_cache_netfs,
&afs_cache_cell_index_def);
if (ret < 0)
- goto error;
-#endif
-
-#ifdef CONFIG_KEYS_TURNED_OFF
- ret = afs_key_register();
- if (ret < 0)
goto error_cache;
#endif
/* initialise the cell DB */
ret = afs_cell_init(rootcell);
if (ret < 0)
- goto error_keys;
+ goto error_cell_init;
- /* start the timeout daemon */
- ret = afs_kafstimod_start();
+ /* initialise the VL update process */
+ ret = afs_vlocation_update_init();
if (ret < 0)
- goto error_keys;
+ goto error_vl_update_init;
- /* start the async operation daemon */
- ret = afs_kafsasyncd_start();
- if (ret < 0)
- goto error_kafstimod;
+ /* initialise the callback update process */
+ ret = afs_callback_update_init();
/* create the RxRPC transport */
- ret = rxrpc_create_transport(7001, &afs_transport);
+ ret = afs_open_socket();
if (ret < 0)
- goto error_kafsasyncd;
-
- afs_transport->peer_ops = &afs_peer_ops;
+ goto error_open_socket;
/* register the filesystems */
ret = afs_fs_init();
if (ret < 0)
- goto error_transport;
+ goto error_fs;
return ret;
- error_transport:
- rxrpc_put_transport(afs_transport);
- error_kafsasyncd:
- afs_kafsasyncd_stop();
- error_kafstimod:
- afs_kafstimod_stop();
- error_keys:
-#ifdef CONFIG_KEYS_TURNED_OFF
- afs_key_unregister();
- error_cache:
-#endif
+error_fs:
+ afs_close_socket();
+error_open_socket:
+error_vl_update_init:
+error_cell_init:
#ifdef AFS_CACHING_SUPPORT
cachefs_unregister_netfs(&afs_cache_netfs);
- error:
+error_cache:
#endif
+ afs_callback_update_kill();
+ afs_vlocation_purge();
afs_cell_purge();
afs_proc_cleanup();
printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
return ret;
-} /* end afs_init() */
+}
/* XXX late_initcall is kludgy, but the only alternative seems to create
* a transport upon the first mount, which is worse. Or is it?
*/
late_initcall(afs_init); /* must be called after net/ to create socket */
-/*****************************************************************************/
+
/*
* clean up on module removal
*/
@@ -159,127 +166,16 @@ static void __exit afs_exit(void)
printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n");
afs_fs_exit();
- rxrpc_put_transport(afs_transport);
- afs_kafstimod_stop();
- afs_kafsasyncd_stop();
+ afs_close_socket();
+ afs_purge_servers();
+ afs_callback_update_kill();
+ afs_vlocation_purge();
+ flush_scheduled_work();
afs_cell_purge();
-#ifdef CONFIG_KEYS_TURNED_OFF
- afs_key_unregister();
-#endif
#ifdef AFS_CACHING_SUPPORT
cachefs_unregister_netfs(&afs_cache_netfs);
#endif
afs_proc_cleanup();
-
-} /* end afs_exit() */
-
-module_exit(afs_exit);
-
-/*****************************************************************************/
-/*
- * notification that new peer record is being added
- * - called from krxsecd
- * - return an error to induce an abort
- * - mustn't sleep (caller holds an rwlock)
- */
-static int afs_adding_peer(struct rxrpc_peer *peer)
-{
- struct afs_server *server;
- int ret;
-
- _debug("kAFS: Adding new peer %08x\n", ntohl(peer->addr.s_addr));
-
- /* determine which server the peer resides in (if any) */
- ret = afs_server_find_by_peer(peer, &server);
- if (ret < 0)
- return ret; /* none that we recognise, so abort */
-
- _debug("Server %p{u=%d}\n", server, atomic_read(&server->usage));
-
- _debug("Cell %p{u=%d}\n",
- server->cell, atomic_read(&server->cell->usage));
-
- /* cross-point the structs under a global lock */
- spin_lock(&afs_server_peer_lock);
- peer->user = server;
- server->peer = peer;
- spin_unlock(&afs_server_peer_lock);
-
- afs_put_server(server);
-
- return 0;
-} /* end afs_adding_peer() */
-
-/*****************************************************************************/
-/*
- * notification that a peer record is being discarded
- * - called from krxiod or krxsecd
- */
-static void afs_discarding_peer(struct rxrpc_peer *peer)
-{
- struct afs_server *server;
-
- _enter("%p",peer);
-
- _debug("Discarding peer %08x (rtt=%lu.%lumS)\n",
- ntohl(peer->addr.s_addr),
- (long) (peer->rtt / 1000),
- (long) (peer->rtt % 1000));
-
- /* uncross-point the structs under a global lock */
- spin_lock(&afs_server_peer_lock);
- server = peer->user;
- if (server) {
- peer->user = NULL;
- server->peer = NULL;
- }
- spin_unlock(&afs_server_peer_lock);
-
- _leave("");
-
-} /* end afs_discarding_peer() */
-
-/*****************************************************************************/
-/*
- * clear the dead space between task_struct and kernel stack
- * - called by supplying -finstrument-functions to gcc
- */
-#if 0
-void __cyg_profile_func_enter (void *this_fn, void *call_site)
-__attribute__((no_instrument_function));
-
-void __cyg_profile_func_enter (void *this_fn, void *call_site)
-{
- asm volatile(" movl %%esp,%%edi \n"
- " andl %0,%%edi \n"
- " addl %1,%%edi \n"
- " movl %%esp,%%ecx \n"
- " subl %%edi,%%ecx \n"
- " shrl $2,%%ecx \n"
- " movl $0xedededed,%%eax \n"
- " rep stosl \n"
- :
- : "i"(~(THREAD_SIZE - 1)), "i"(sizeof(struct thread_info))
- : "eax", "ecx", "edi", "memory", "cc"
- );
}
-void __cyg_profile_func_exit(void *this_fn, void *call_site)
-__attribute__((no_instrument_function));
-
-void __cyg_profile_func_exit(void *this_fn, void *call_site)
-{
- asm volatile(" movl %%esp,%%edi \n"
- " andl %0,%%edi \n"
- " addl %1,%%edi \n"
- " movl %%esp,%%ecx \n"
- " subl %%edi,%%ecx \n"
- " shrl $2,%%ecx \n"
- " movl $0xdadadada,%%eax \n"
- " rep stosl \n"
- :
- : "i"(~(THREAD_SIZE - 1)), "i"(sizeof(struct thread_info))
- : "eax", "ecx", "edi", "memory", "cc"
- );
-}
-#endif
+module_exit(afs_exit);
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index e4fce66d76e0..cdb9792d8161 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -1,6 +1,6 @@
-/* misc.c: miscellaneous bits
+/* miscellaneous bits
*
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
@@ -12,19 +12,20 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
-#include "errors.h"
#include "internal.h"
+#include "afs_fs.h"
-/*****************************************************************************/
/*
* convert an AFS abort code to a Linux error number
*/
-int afs_abort_to_error(int abortcode)
+int afs_abort_to_error(u32 abort_code)
{
- switch (abortcode) {
+ switch (abort_code) {
+ case 13: return -EACCES;
+ case 30: return -EROFS;
case VSALVAGE: return -EIO;
case VNOVNODE: return -ENOENT;
- case VNOVOL: return -ENXIO;
+ case VNOVOL: return -ENOMEDIUM;
case VVOLEXISTS: return -EEXIST;
case VNOSERVICE: return -EIO;
case VOFFLINE: return -ENOENT;
@@ -33,7 +34,24 @@ int afs_abort_to_error(int abortcode)
case VOVERQUOTA: return -EDQUOT;
case VBUSY: return -EBUSY;
case VMOVED: return -ENXIO;
- default: return -EIO;
+ case 0x2f6df0c: return -EACCES;
+ case 0x2f6df0f: return -EBUSY;
+ case 0x2f6df10: return -EEXIST;
+ case 0x2f6df11: return -EXDEV;
+ case 0x2f6df13: return -ENOTDIR;
+ case 0x2f6df14: return -EISDIR;
+ case 0x2f6df15: return -EINVAL;
+ case 0x2f6df1a: return -EFBIG;
+ case 0x2f6df1b: return -ENOSPC;
+ case 0x2f6df1d: return -EROFS;
+ case 0x2f6df1e: return -EMLINK;
+ case 0x2f6df20: return -EDOM;
+ case 0x2f6df21: return -ERANGE;
+ case 0x2f6df22: return -EDEADLK;
+ case 0x2f6df23: return -ENAMETOOLONG;
+ case 0x2f6df24: return -ENOLCK;
+ case 0x2f6df26: return -ENOTEMPTY;
+ case 0x2f6df78: return -EDQUOT;
+ default: return -EREMOTEIO;
}
-
-} /* end afs_abort_to_error() */
+}
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 68495f0de7b3..b905ae37f912 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -1,4 +1,4 @@
-/* mntpt.c: mountpoint management
+/* mountpoint management
*
* Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -18,10 +18,6 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/mnt_namespace.h>
-#include "super.h"
-#include "cell.h"
-#include "volume.h"
-#include "vnode.h"
#include "internal.h"
@@ -30,6 +26,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
struct nameidata *nd);
static int afs_mntpt_open(struct inode *inode, struct file *file);
static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd);
+static void afs_mntpt_expiry_timed_out(struct work_struct *work);
const struct file_operations afs_mntpt_file_operations = {
.open = afs_mntpt_open,
@@ -43,24 +40,19 @@ const struct inode_operations afs_mntpt_inode_operations = {
};
static LIST_HEAD(afs_vfsmounts);
+static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
-static void afs_mntpt_expiry_timed_out(struct afs_timer *timer);
+unsigned long afs_mntpt_expiry_timeout = 10 * 60;
-struct afs_timer_ops afs_mntpt_expiry_timer_ops = {
- .timed_out = afs_mntpt_expiry_timed_out,
-};
-
-struct afs_timer afs_mntpt_expiry_timer;
-
-unsigned long afs_mntpt_expiry_timeout = 20;
-
-/*****************************************************************************/
/*
* check a symbolic link to see whether it actually encodes a mountpoint
* - sets the AFS_VNODE_MOUNTPOINT flag on the vnode appropriately
*/
-int afs_mntpt_check_symlink(struct afs_vnode *vnode)
+int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
{
+ struct file file = {
+ .private_data = key,
+ };
struct page *page;
size_t size;
char *buf;
@@ -69,7 +61,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
_enter("{%u,%u}", vnode->fid.vnode, vnode->fid.unique);
/* read the contents of the symlink into the pagecache */
- page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, NULL);
+ page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
goto out;
@@ -85,7 +77,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
/* examine the symlink's contents */
size = vnode->status.size;
- _debug("symlink to %*.*s", size, (int) size, buf);
+ _debug("symlink to %*.*s", (int) size, (int) size, buf);
if (size > 2 &&
(buf[0] == '%' || buf[0] == '#') &&
@@ -93,22 +85,20 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
) {
_debug("symlink is a mountpoint");
spin_lock(&vnode->lock);
- vnode->flags |= AFS_VNODE_MOUNTPOINT;
+ set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
spin_unlock(&vnode->lock);
}
ret = 0;
- out_free:
+out_free:
kunmap(page);
page_cache_release(page);
- out:
+out:
_leave(" = %d", ret);
return ret;
+}
-} /* end afs_mntpt_check_symlink() */
-
-/*****************************************************************************/
/*
* no valid lookup procedure on this sort of dir
*/
@@ -116,7 +106,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
struct dentry *dentry,
struct nameidata *nd)
{
- kenter("%p,%p{%p{%s},%s}",
+ _enter("%p,%p{%p{%s},%s}",
dir,
dentry,
dentry->d_parent,
@@ -125,15 +115,14 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
dentry->d_name.name);
return ERR_PTR(-EREMOTE);
-} /* end afs_mntpt_lookup() */
+}
-/*****************************************************************************/
/*
* no valid open procedure on this sort of dir
*/
static int afs_mntpt_open(struct inode *inode, struct file *file)
{
- kenter("%p,%p{%p{%s},%s}",
+ _enter("%p,%p{%p{%s},%s}",
inode, file,
file->f_path.dentry->d_parent,
file->f_path.dentry->d_parent ?
@@ -142,9 +131,8 @@ static int afs_mntpt_open(struct inode *inode, struct file *file)
file->f_path.dentry->d_name.name);
return -EREMOTE;
-} /* end afs_mntpt_open() */
+}
-/*****************************************************************************/
/*
* create a vfsmount to be automounted
*/
@@ -157,7 +145,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
char *buf, *devname = NULL, *options = NULL;
int ret;
- kenter("{%s}", mntpt->d_name.name);
+ _enter("{%s}", mntpt->d_name.name);
BUG_ON(!mntpt->d_inode);
@@ -201,79 +189,108 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
strcat(options, ",rwpath");
/* try and do the mount */
- kdebug("--- attempting mount %s -o %s ---", devname, options);
+ _debug("--- attempting mount %s -o %s ---", devname, options);
mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
- kdebug("--- mount result %p ---", mnt);
+ _debug("--- mount result %p ---", mnt);
free_page((unsigned long) devname);
free_page((unsigned long) options);
- kleave(" = %p", mnt);
+ _leave(" = %p", mnt);
return mnt;
- error:
+error:
if (page)
page_cache_release(page);
if (devname)
free_page((unsigned long) devname);
if (options)
free_page((unsigned long) options);
- kleave(" = %d", ret);
+ _leave(" = %d", ret);
return ERR_PTR(ret);
-} /* end afs_mntpt_do_automount() */
+}
-/*****************************************************************************/
/*
* follow a link from a mountpoint directory, thus causing it to be mounted
*/
static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct vfsmount *newmnt;
- struct dentry *old_dentry;
int err;
- kenter("%p{%s},{%s:%p{%s}}",
+ _enter("%p{%s},{%s:%p{%s},}",
dentry,
dentry->d_name.name,
nd->mnt->mnt_devname,
dentry,
nd->dentry->d_name.name);
- newmnt = afs_mntpt_do_automount(dentry);
+ dput(nd->dentry);
+ nd->dentry = dget(dentry);
+
+ newmnt = afs_mntpt_do_automount(nd->dentry);
if (IS_ERR(newmnt)) {
path_release(nd);
return (void *)newmnt;
}
- old_dentry = nd->dentry;
- nd->dentry = dentry;
- err = do_add_mount(newmnt, nd, 0, &afs_vfsmounts);
- nd->dentry = old_dentry;
-
- path_release(nd);
-
- if (!err) {
- mntget(newmnt);
+ mntget(newmnt);
+ err = do_add_mount(newmnt, nd, MNT_SHRINKABLE, &afs_vfsmounts);
+ switch (err) {
+ case 0:
+ mntput(nd->mnt);
+ dput(nd->dentry);
nd->mnt = newmnt;
- dget(newmnt->mnt_root);
- nd->dentry = newmnt->mnt_root;
+ nd->dentry = dget(newmnt->mnt_root);
+ schedule_delayed_work(&afs_mntpt_expiry_timer,
+ afs_mntpt_expiry_timeout * HZ);
+ break;
+ case -EBUSY:
+ /* someone else made a mount here whilst we were busy */
+ while (d_mountpoint(nd->dentry) &&
+ follow_down(&nd->mnt, &nd->dentry))
+ ;
+ err = 0;
+ default:
+ mntput(newmnt);
+ break;
}
- kleave(" = %d", err);
+ _leave(" = %d", err);
return ERR_PTR(err);
-} /* end afs_mntpt_follow_link() */
+}
-/*****************************************************************************/
/*
* handle mountpoint expiry timer going off
*/
-static void afs_mntpt_expiry_timed_out(struct afs_timer *timer)
+static void afs_mntpt_expiry_timed_out(struct work_struct *work)
{
- kenter("");
+ _enter("");
- mark_mounts_for_expiry(&afs_vfsmounts);
+ if (!list_empty(&afs_vfsmounts)) {
+ mark_mounts_for_expiry(&afs_vfsmounts);
+ schedule_delayed_work(&afs_mntpt_expiry_timer,
+ afs_mntpt_expiry_timeout * HZ);
+ }
+
+ _leave("");
+}
- afs_kafstimod_add_timer(&afs_mntpt_expiry_timer,
- afs_mntpt_expiry_timeout * HZ);
+/*
+ * kill the AFS mountpoint timer if it's still running
+ */
+void afs_mntpt_kill_timer(void)
+{
+ _enter("");
- kleave("");
-} /* end afs_mntpt_expiry_timed_out() */
+ ASSERT(list_empty(&afs_vfsmounts));
+ cancel_delayed_work(&afs_mntpt_expiry_timer);
+ flush_scheduled_work();
+}
+
+/*
+ * begin unmount by attempting to remove all automounted mountpoints we added
+ */
+void afs_umount_begin(struct vfsmount *vfsmnt, int flags)
+{
+ shrink_submounts(vfsmnt, &afs_vfsmounts);
+}
diff --git a/fs/afs/mount.h b/fs/afs/mount.h
deleted file mode 100644
index 9d2f46ec549f..000000000000
--- a/fs/afs/mount.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* mount.h: mount parameters
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_MOUNT_H
-#define _LINUX_AFS_MOUNT_H
-
-struct afs_mountdata {
- const char *volume; /* name of volume */
- const char *cell; /* name of cell containing volume */
- const char *cache; /* name of cache block device */
- size_t nservers; /* number of server addresses listed */
- uint32_t servers[10]; /* IP addresses of servers in this cell */
-};
-
-#endif /* _LINUX_AFS_MOUNT_H */
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index ae6b85b1e484..d5601f617cdb 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -1,4 +1,4 @@
-/* proc.c: /proc interface for AFS
+/* /proc interface for AFS
*
* Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -13,8 +13,6 @@
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include "cell.h"
-#include "volume.h"
#include <asm/uaccess.h>
#include "internal.h"
@@ -130,7 +128,6 @@ static const struct file_operations afs_proc_cell_servers_fops = {
.release = afs_proc_cell_servers_release,
};
-/*****************************************************************************/
/*
* initialise the /proc/fs/afs/ directory
*/
@@ -142,47 +139,43 @@ int afs_proc_init(void)
proc_afs = proc_mkdir("fs/afs", NULL);
if (!proc_afs)
- goto error;
+ goto error_dir;
proc_afs->owner = THIS_MODULE;
p = create_proc_entry("cells", 0, proc_afs);
if (!p)
- goto error_proc;
+ goto error_cells;
p->proc_fops = &afs_proc_cells_fops;
p->owner = THIS_MODULE;
p = create_proc_entry("rootcell", 0, proc_afs);
if (!p)
- goto error_cells;
+ goto error_rootcell;
p->proc_fops = &afs_proc_rootcell_fops;
p->owner = THIS_MODULE;
_leave(" = 0");
return 0;
- error_cells:
+error_rootcell:
remove_proc_entry("cells", proc_afs);
- error_proc:
+error_cells:
remove_proc_entry("fs/afs", NULL);
- error:
+error_dir:
_leave(" = -ENOMEM");
return -ENOMEM;
+}
-} /* end afs_proc_init() */
-
-/*****************************************************************************/
/*
* clean up the /proc/fs/afs/ directory
*/
void afs_proc_cleanup(void)
{
+ remove_proc_entry("rootcell", proc_afs);
remove_proc_entry("cells", proc_afs);
-
remove_proc_entry("fs/afs", NULL);
+}
-} /* end afs_proc_cleanup() */
-
-/*****************************************************************************/
/*
* open "/proc/fs/afs/cells" which provides a summary of extant cells
*/
@@ -199,9 +192,8 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file)
m->private = PDE(inode)->data;
return 0;
-} /* end afs_proc_cells_open() */
+}
-/*****************************************************************************/
/*
* set up the iterator to start reading from the cells list and return the
* first item
@@ -225,9 +217,8 @@ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
break;
return _p != &afs_proc_cells ? _p : NULL;
-} /* end afs_proc_cells_start() */
+}
-/*****************************************************************************/
/*
* move to next cell in cells list
*/
@@ -241,19 +232,16 @@ static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos)
_p = v == (void *) 1 ? afs_proc_cells.next : _p->next;
return _p != &afs_proc_cells ? _p : NULL;
-} /* end afs_proc_cells_next() */
+}
-/*****************************************************************************/
/*
* clean up after reading from the cells list
*/
static void afs_proc_cells_stop(struct seq_file *p, void *v)
{
up_read(&afs_proc_cells_sem);
+}
-} /* end afs_proc_cells_stop() */
-
-/*****************************************************************************/
/*
* display a header line followed by a load of cell lines
*/
@@ -261,19 +249,18 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
{
struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link);
- /* display header on line 1 */
if (v == (void *) 1) {
+ /* display header on line 1 */
seq_puts(m, "USE NAME\n");
return 0;
}
/* display one cell per line on subsequent lines */
- seq_printf(m, "%3d %s\n", atomic_read(&cell->usage), cell->name);
-
+ seq_printf(m, "%3d %s\n",
+ atomic_read(&cell->usage), cell->name);
return 0;
-} /* end afs_proc_cells_show() */
+}
-/*****************************************************************************/
/*
* handle writes to /proc/fs/afs/cells
* - to add cells: echo "add <cellname> <IP>[:<IP>][:<IP>]"
@@ -326,30 +313,32 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
if (strcmp(kbuf, "add") == 0) {
struct afs_cell *cell;
- ret = afs_cell_create(name, args, &cell);
- if (ret < 0)
+
+ cell = afs_cell_create(name, args);
+ if (IS_ERR(cell)) {
+ ret = PTR_ERR(cell);
goto done;
+ }
+ afs_put_cell(cell);
printk("kAFS: Added new cell '%s'\n", name);
- }
- else {
+ } else {
goto inval;
}
ret = size;
- done:
+done:
kfree(kbuf);
_leave(" = %d", ret);
return ret;
- inval:
+inval:
ret = -EINVAL;
printk("kAFS: Invalid Command on /proc/fs/afs/cells file\n");
goto done;
-} /* end afs_proc_cells_write() */
+}
-/*****************************************************************************/
/*
* Stubs for /proc/fs/afs/rootcell
*/
@@ -369,7 +358,6 @@ static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf,
return 0;
}
-/*****************************************************************************/
/*
* handle writes to /proc/fs/afs/rootcell
* - to initialize rootcell: echo "cell.name:192.168.231.14"
@@ -407,14 +395,13 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
if (ret >= 0)
ret = size; /* consume everything, always */
- infault:
+infault:
kfree(kbuf);
- nomem:
+nomem:
_leave(" = %d", ret);
return ret;
-} /* end afs_proc_rootcell_write() */
+}
-/*****************************************************************************/
/*
* initialise /proc/fs/afs/<cell>/
*/
@@ -426,25 +413,25 @@ int afs_proc_cell_setup(struct afs_cell *cell)
cell->proc_dir = proc_mkdir(cell->name, proc_afs);
if (!cell->proc_dir)
- return -ENOMEM;
+ goto error_dir;
p = create_proc_entry("servers", 0, cell->proc_dir);
if (!p)
- goto error_proc;
+ goto error_servers;
p->proc_fops = &afs_proc_cell_servers_fops;
p->owner = THIS_MODULE;
p->data = cell;
p = create_proc_entry("vlservers", 0, cell->proc_dir);
if (!p)
- goto error_servers;
+ goto error_vlservers;
p->proc_fops = &afs_proc_cell_vlservers_fops;
p->owner = THIS_MODULE;
p->data = cell;
p = create_proc_entry("volumes", 0, cell->proc_dir);
if (!p)
- goto error_vlservers;
+ goto error_volumes;
p->proc_fops = &afs_proc_cell_volumes_fops;
p->owner = THIS_MODULE;
p->data = cell;
@@ -452,17 +439,17 @@ int afs_proc_cell_setup(struct afs_cell *cell)
_leave(" = 0");
return 0;
- error_vlservers:
+error_volumes:
remove_proc_entry("vlservers", cell->proc_dir);
- error_servers:
+error_vlservers:
remove_proc_entry("servers", cell->proc_dir);
- error_proc:
+error_servers:
remove_proc_entry(cell->name, proc_afs);
+error_dir:
_leave(" = -ENOMEM");
return -ENOMEM;
-} /* end afs_proc_cell_setup() */
+}
-/*****************************************************************************/
/*
* remove /proc/fs/afs/<cell>/
*/
@@ -476,9 +463,8 @@ void afs_proc_cell_remove(struct afs_cell *cell)
remove_proc_entry(cell->name, proc_afs);
_leave("");
-} /* end afs_proc_cell_remove() */
+}
-/*****************************************************************************/
/*
* open "/proc/fs/afs/<cell>/volumes" which provides a summary of extant cells
*/
@@ -488,7 +474,7 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
struct seq_file *m;
int ret;
- cell = afs_get_cell_maybe((struct afs_cell **) &PDE(inode)->data);
+ cell = PDE(inode)->data;
if (!cell)
return -ENOENT;
@@ -500,25 +486,16 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
m->private = cell;
return 0;
-} /* end afs_proc_cell_volumes_open() */
+}
-/*****************************************************************************/
/*
* close the file and release the ref to the cell
*/
static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file)
{
- struct afs_cell *cell = PDE(inode)->data;
- int ret;
-
- ret = seq_release(inode,file);
-
- afs_put_cell(cell);
-
- return ret;
-} /* end afs_proc_cell_volumes_release() */
+ return seq_release(inode, file);
+}
-/*****************************************************************************/
/*
* set up the iterator to start reading from the cells list and return the
* first item
@@ -545,9 +522,8 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
break;
return _p != &cell->vl_list ? _p : NULL;
-} /* end afs_proc_cell_volumes_start() */
+}
-/*****************************************************************************/
/*
* move to next cell in cells list
*/
@@ -562,12 +538,11 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
(*_pos)++;
_p = v;
- _p = v == (void *) 1 ? cell->vl_list.next : _p->next;
+ _p = (v == (void *) 1) ? cell->vl_list.next : _p->next;
- return _p != &cell->vl_list ? _p : NULL;
-} /* end afs_proc_cell_volumes_next() */
+ return (_p != &cell->vl_list) ? _p : NULL;
+}
-/*****************************************************************************/
/*
* clean up after reading from the cells list
*/
@@ -576,10 +551,18 @@ static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v)
struct afs_cell *cell = p->private;
up_read(&cell->vl_sem);
+}
-} /* end afs_proc_cell_volumes_stop() */
+const char afs_vlocation_states[][4] = {
+ [AFS_VL_NEW] = "New",
+ [AFS_VL_CREATING] = "Crt",
+ [AFS_VL_VALID] = "Val",
+ [AFS_VL_NO_VOLUME] = "NoV",
+ [AFS_VL_UPDATING] = "Upd",
+ [AFS_VL_VOLUME_DELETED] = "Del",
+ [AFS_VL_UNCERTAIN] = "Unc",
+};
-/*****************************************************************************/
/*
* display a header line followed by a load of volume lines
*/
@@ -590,23 +573,22 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
/* display header on line 1 */
if (v == (void *) 1) {
- seq_puts(m, "USE VLID[0] VLID[1] VLID[2] NAME\n");
+ seq_puts(m, "USE STT VLID[0] VLID[1] VLID[2] NAME\n");
return 0;
}
/* display one cell per line on subsequent lines */
- seq_printf(m, "%3d %08x %08x %08x %s\n",
+ seq_printf(m, "%3d %s %08x %08x %08x %s\n",
atomic_read(&vlocation->usage),
+ afs_vlocation_states[vlocation->state],
vlocation->vldb.vid[0],
vlocation->vldb.vid[1],
vlocation->vldb.vid[2],
- vlocation->vldb.name
- );
+ vlocation->vldb.name);
return 0;
-} /* end afs_proc_cell_volumes_show() */
+}
-/*****************************************************************************/
/*
* open "/proc/fs/afs/<cell>/vlservers" which provides a list of volume
* location server
@@ -617,11 +599,11 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
struct seq_file *m;
int ret;
- cell = afs_get_cell_maybe((struct afs_cell**)&PDE(inode)->data);
+ cell = PDE(inode)->data;
if (!cell)
return -ENOENT;
- ret = seq_open(file,&afs_proc_cell_vlservers_ops);
+ ret = seq_open(file, &afs_proc_cell_vlservers_ops);
if (ret<0)
return ret;
@@ -629,26 +611,17 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
m->private = cell;
return 0;
-} /* end afs_proc_cell_vlservers_open() */
+}
-/*****************************************************************************/
/*
* close the file and release the ref to the cell
*/
static int afs_proc_cell_vlservers_release(struct inode *inode,
struct file *file)
{
- struct afs_cell *cell = PDE(inode)->data;
- int ret;
-
- ret = seq_release(inode,file);
-
- afs_put_cell(cell);
-
- return ret;
-} /* end afs_proc_cell_vlservers_release() */
+ return seq_release(inode, file);
+}
-/*****************************************************************************/
/*
* set up the iterator to start reading from the cells list and return the
* first item
@@ -672,9 +645,8 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
return NULL;
return &cell->vl_addrs[pos];
-} /* end afs_proc_cell_vlservers_start() */
+}
-/*****************************************************************************/
/*
* move to next cell in cells list
*/
@@ -692,9 +664,8 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
return NULL;
return &cell->vl_addrs[pos];
-} /* end afs_proc_cell_vlservers_next() */
+}
-/*****************************************************************************/
/*
* clean up after reading from the cells list
*/
@@ -703,10 +674,8 @@ static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v)
struct afs_cell *cell = p->private;
up_read(&cell->vl_sem);
+}
-} /* end afs_proc_cell_vlservers_stop() */
-
-/*****************************************************************************/
/*
* display a header line followed by a load of volume lines
*/
@@ -722,11 +691,9 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
/* display one cell per line on subsequent lines */
seq_printf(m, "%u.%u.%u.%u\n", NIPQUAD(addr->s_addr));
-
return 0;
-} /* end afs_proc_cell_vlservers_show() */
+}
-/*****************************************************************************/
/*
* open "/proc/fs/afs/<cell>/servers" which provides a summary of active
* servers
@@ -737,7 +704,7 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
struct seq_file *m;
int ret;
- cell = afs_get_cell_maybe((struct afs_cell **) &PDE(inode)->data);
+ cell = PDE(inode)->data;
if (!cell)
return -ENOENT;
@@ -747,34 +714,24 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
m = file->private_data;
m->private = cell;
-
return 0;
-} /* end afs_proc_cell_servers_open() */
+}
-/*****************************************************************************/
/*
* close the file and release the ref to the cell
*/
static int afs_proc_cell_servers_release(struct inode *inode,
struct file *file)
{
- struct afs_cell *cell = PDE(inode)->data;
- int ret;
-
- ret = seq_release(inode, file);
-
- afs_put_cell(cell);
-
- return ret;
-} /* end afs_proc_cell_servers_release() */
+ return seq_release(inode, file);
+}
-/*****************************************************************************/
/*
* set up the iterator to start reading from the cells list and return the
* first item
*/
static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
- __acquires(m->private->sv_lock)
+ __acquires(m->private->servers_lock)
{
struct list_head *_p;
struct afs_cell *cell = m->private;
@@ -783,7 +740,7 @@ static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
_enter("cell=%p pos=%Ld", cell, *_pos);
/* lock the list against modification */
- read_lock(&cell->sv_lock);
+ read_lock(&cell->servers_lock);
/* allow for the header line */
if (!pos)
@@ -791,14 +748,13 @@ static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
pos--;
/* find the n'th element in the list */
- list_for_each(_p, &cell->sv_list)
+ list_for_each(_p, &cell->servers)
if (!pos--)
break;
- return _p != &cell->sv_list ? _p : NULL;
-} /* end afs_proc_cell_servers_start() */
+ return _p != &cell->servers ? _p : NULL;
+}
-/*****************************************************************************/
/*
* move to next cell in cells list
*/
@@ -813,25 +769,22 @@ static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
(*_pos)++;
_p = v;
- _p = v == (void *) 1 ? cell->sv_list.next : _p->next;
+ _p = v == (void *) 1 ? cell->servers.next : _p->next;
- return _p != &cell->sv_list ? _p : NULL;
-} /* end afs_proc_cell_servers_next() */
+ return _p != &cell->servers ? _p : NULL;
+}
-/*****************************************************************************/
/*
* clean up after reading from the cells list
*/
static void afs_proc_cell_servers_stop(struct seq_file *p, void *v)
- __releases(p->private->sv_lock)
+ __releases(p->private->servers_lock)
{
struct afs_cell *cell = p->private;
- read_unlock(&cell->sv_lock);
-
-} /* end afs_proc_cell_servers_stop() */
+ read_unlock(&cell->servers_lock);
+}
-/*****************************************************************************/
/*
* display a header line followed by a load of volume lines
*/
@@ -849,10 +802,7 @@ static int afs_proc_cell_servers_show(struct seq_file *m, void *v)
/* display one cell per line on subsequent lines */
sprintf(ipaddr, "%u.%u.%u.%u", NIPQUAD(server->addr));
seq_printf(m, "%3d %-15.15s %5d\n",
- atomic_read(&server->usage),
- ipaddr,
- server->fs_state
- );
+ atomic_read(&server->usage), ipaddr, server->fs_state);
return 0;
-} /* end afs_proc_cell_servers_show() */
+}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
new file mode 100644
index 000000000000..e7b047328a39
--- /dev/null
+++ b/fs/afs/rxrpc.c
@@ -0,0 +1,782 @@
+/* Maintain an RxRPC server socket to do AFS communications through
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <rxrpc/packet.h>
+#include "internal.h"
+#include "afs_cm.h"
+
+static struct socket *afs_socket; /* my RxRPC socket */
+static struct workqueue_struct *afs_async_calls;
+static atomic_t afs_outstanding_calls;
+static atomic_t afs_outstanding_skbs;
+
+static void afs_wake_up_call_waiter(struct afs_call *);
+static int afs_wait_for_call_to_complete(struct afs_call *);
+static void afs_wake_up_async_call(struct afs_call *);
+static int afs_dont_wait_for_call_to_complete(struct afs_call *);
+static void afs_process_async_call(struct work_struct *);
+static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *);
+static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool);
+
+/* synchronous call management */
+const struct afs_wait_mode afs_sync_call = {
+ .rx_wakeup = afs_wake_up_call_waiter,
+ .wait = afs_wait_for_call_to_complete,
+};
+
+/* asynchronous call management */
+const struct afs_wait_mode afs_async_call = {
+ .rx_wakeup = afs_wake_up_async_call,
+ .wait = afs_dont_wait_for_call_to_complete,
+};
+
+/* asynchronous incoming call management */
+static const struct afs_wait_mode afs_async_incoming_call = {
+ .rx_wakeup = afs_wake_up_async_call,
+};
+
+/* asynchronous incoming call initial processing */
+static const struct afs_call_type afs_RXCMxxxx = {
+ .name = "CB.xxxx",
+ .deliver = afs_deliver_cm_op_id,
+ .abort_to_error = afs_abort_to_error,
+};
+
+static void afs_collect_incoming_call(struct work_struct *);
+
+static struct sk_buff_head afs_incoming_calls;
+static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call);
+
+/*
+ * open an RxRPC socket and bind it to be a server for callback notifications
+ * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
+ */
+int afs_open_socket(void)
+{
+ struct sockaddr_rxrpc srx;
+ struct socket *socket;
+ int ret;
+
+ _enter("");
+
+ skb_queue_head_init(&afs_incoming_calls);
+
+ afs_async_calls = create_singlethread_workqueue("kafsd");
+ if (!afs_async_calls) {
+ _leave(" = -ENOMEM [wq]");
+ return -ENOMEM;
+ }
+
+ ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
+ if (ret < 0) {
+ destroy_workqueue(afs_async_calls);
+ _leave(" = %d [socket]", ret);
+ return ret;
+ }
+
+ socket->sk->sk_allocation = GFP_NOFS;
+
+ /* bind the callback manager's address to make this a server socket */
+ srx.srx_family = AF_RXRPC;
+ srx.srx_service = CM_SERVICE;
+ srx.transport_type = SOCK_DGRAM;
+ srx.transport_len = sizeof(srx.transport.sin);
+ srx.transport.sin.sin_family = AF_INET;
+ srx.transport.sin.sin_port = htons(AFS_CM_PORT);
+ memset(&srx.transport.sin.sin_addr, 0,
+ sizeof(srx.transport.sin.sin_addr));
+
+ ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+ if (ret < 0) {
+ sock_release(socket);
+ _leave(" = %d [bind]", ret);
+ return ret;
+ }
+
+ rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor);
+
+ afs_socket = socket;
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * close the RxRPC socket AFS was using
+ */
+void afs_close_socket(void)
+{
+ _enter("");
+
+ sock_release(afs_socket);
+
+ _debug("dework");
+ destroy_workqueue(afs_async_calls);
+
+ ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0);
+ ASSERTCMP(atomic_read(&afs_outstanding_calls), ==, 0);
+ _leave("");
+}
+
+/*
+ * note that the data in a socket buffer is now delivered and that the buffer
+ * should be freed
+ */
+static void afs_data_delivered(struct sk_buff *skb)
+{
+ if (!skb) {
+ _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs));
+ dump_stack();
+ } else {
+ _debug("DLVR %p{%u} [%d]",
+ skb, skb->mark, atomic_read(&afs_outstanding_skbs));
+ if (atomic_dec_return(&afs_outstanding_skbs) == -1)
+ BUG();
+ rxrpc_kernel_data_delivered(skb);
+ }
+}
+
+/*
+ * free a socket buffer
+ */
+static void afs_free_skb(struct sk_buff *skb)
+{
+ if (!skb) {
+ _debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs));
+ dump_stack();
+ } else {
+ _debug("FREE %p{%u} [%d]",
+ skb, skb->mark, atomic_read(&afs_outstanding_skbs));
+ if (atomic_dec_return(&afs_outstanding_skbs) == -1)
+ BUG();
+ rxrpc_kernel_free_skb(skb);
+ }
+}
+
+/*
+ * free a call
+ */
+static void afs_free_call(struct afs_call *call)
+{
+ _debug("DONE %p{%s} [%d]",
+ call, call->type->name, atomic_read(&afs_outstanding_calls));
+ if (atomic_dec_return(&afs_outstanding_calls) == -1)
+ BUG();
+
+ ASSERTCMP(call->rxcall, ==, NULL);
+ ASSERT(!work_pending(&call->async_work));
+ ASSERT(skb_queue_empty(&call->rx_queue));
+ ASSERT(call->type->name != NULL);
+
+ kfree(call->request);
+ kfree(call);
+}
+
+/*
+ * allocate a call with flat request and reply buffers
+ */
+struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
+ size_t request_size, size_t reply_size)
+{
+ struct afs_call *call;
+
+ call = kzalloc(sizeof(*call), GFP_NOFS);
+ if (!call)
+ goto nomem_call;
+
+ _debug("CALL %p{%s} [%d]",
+ call, type->name, atomic_read(&afs_outstanding_calls));
+ atomic_inc(&afs_outstanding_calls);
+
+ call->type = type;
+ call->request_size = request_size;
+ call->reply_max = reply_size;
+
+ if (request_size) {
+ call->request = kmalloc(request_size, GFP_NOFS);
+ if (!call->request)
+ goto nomem_free;
+ }
+
+ if (reply_size) {
+ call->buffer = kmalloc(reply_size, GFP_NOFS);
+ if (!call->buffer)
+ goto nomem_free;
+ }
+
+ init_waitqueue_head(&call->waitq);
+ skb_queue_head_init(&call->rx_queue);
+ return call;
+
+nomem_free:
+ afs_free_call(call);
+nomem_call:
+ return NULL;
+}
+
+/*
+ * clean up a call with flat buffer
+ */
+void afs_flat_call_destructor(struct afs_call *call)
+{
+ _enter("");
+
+ kfree(call->request);
+ call->request = NULL;
+ kfree(call->buffer);
+ call->buffer = NULL;
+}
+
+/*
+ * initiate a call
+ */
+int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
+ const struct afs_wait_mode *wait_mode)
+{
+ struct sockaddr_rxrpc srx;
+ struct rxrpc_call *rxcall;
+ struct msghdr msg;
+ struct kvec iov[1];
+ int ret;
+
+ _enter("%x,{%d},", addr->s_addr, ntohs(call->port));
+
+ ASSERT(call->type != NULL);
+ ASSERT(call->type->name != NULL);
+
+ _debug("MAKE %p{%s} [%d]",
+ call, call->type->name, atomic_read(&afs_outstanding_calls));
+
+ call->wait_mode = wait_mode;
+ INIT_WORK(&call->async_work, afs_process_async_call);
+
+ memset(&srx, 0, sizeof(srx));
+ srx.srx_family = AF_RXRPC;
+ srx.srx_service = call->service_id;
+ srx.transport_type = SOCK_DGRAM;
+ srx.transport_len = sizeof(srx.transport.sin);
+ srx.transport.sin.sin_family = AF_INET;
+ srx.transport.sin.sin_port = call->port;
+ memcpy(&srx.transport.sin.sin_addr, addr, 4);
+
+ /* create a call */
+ rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key,
+ (unsigned long) call, gfp);
+ call->key = NULL;
+ if (IS_ERR(rxcall)) {
+ ret = PTR_ERR(rxcall);
+ goto error_kill_call;
+ }
+
+ call->rxcall = rxcall;
+
+ /* send the request */
+ iov[0].iov_base = call->request;
+ iov[0].iov_len = call->request_size;
+
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = (struct iovec *) iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ /* have to change the state *before* sending the last packet as RxRPC
+ * might give us the reply before it returns from sending the
+ * request */
+ call->state = AFS_CALL_AWAIT_REPLY;
+ ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size);
+ if (ret < 0)
+ goto error_do_abort;
+
+ /* at this point, an async call may no longer exist as it may have
+ * already completed */
+ return wait_mode->wait(call);
+
+error_do_abort:
+ rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT);
+ rxrpc_kernel_end_call(rxcall);
+ call->rxcall = NULL;
+error_kill_call:
+ call->type->destructor(call);
+ afs_free_call(call);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * handles intercepted messages that were arriving in the socket's Rx queue
+ * - called with the socket receive queue lock held to ensure message ordering
+ * - called with softirqs disabled
+ */
+static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
+ struct sk_buff *skb)
+{
+ struct afs_call *call = (struct afs_call *) user_call_ID;
+
+ _enter("%p,,%u", call, skb->mark);
+
+ _debug("ICPT %p{%u} [%d]",
+ skb, skb->mark, atomic_read(&afs_outstanding_skbs));
+
+ ASSERTCMP(sk, ==, afs_socket->sk);
+ atomic_inc(&afs_outstanding_skbs);
+
+ if (!call) {
+ /* its an incoming call for our callback service */
+ skb_queue_tail(&afs_incoming_calls, skb);
+ schedule_work(&afs_collect_incoming_call_work);
+ } else {
+ /* route the messages directly to the appropriate call */
+ skb_queue_tail(&call->rx_queue, skb);
+ call->wait_mode->rx_wakeup(call);
+ }
+
+ _leave("");
+}
+
+/*
+ * deliver messages to a call
+ */
+static void afs_deliver_to_call(struct afs_call *call)
+{
+ struct sk_buff *skb;
+ bool last;
+ u32 abort_code;
+ int ret;
+
+ _enter("");
+
+ while ((call->state == AFS_CALL_AWAIT_REPLY ||
+ call->state == AFS_CALL_AWAIT_OP_ID ||
+ call->state == AFS_CALL_AWAIT_REQUEST ||
+ call->state == AFS_CALL_AWAIT_ACK) &&
+ (skb = skb_dequeue(&call->rx_queue))) {
+ switch (skb->mark) {
+ case RXRPC_SKB_MARK_DATA:
+ _debug("Rcv DATA");
+ last = rxrpc_kernel_is_data_last(skb);
+ ret = call->type->deliver(call, skb, last);
+ switch (ret) {
+ case 0:
+ if (last &&
+ call->state == AFS_CALL_AWAIT_REPLY)
+ call->state = AFS_CALL_COMPLETE;
+ break;
+ case -ENOTCONN:
+ abort_code = RX_CALL_DEAD;
+ goto do_abort;
+ case -ENOTSUPP:
+ abort_code = RX_INVALID_OPERATION;
+ goto do_abort;
+ default:
+ abort_code = RXGEN_CC_UNMARSHAL;
+ if (call->state != AFS_CALL_AWAIT_REPLY)
+ abort_code = RXGEN_SS_UNMARSHAL;
+ do_abort:
+ rxrpc_kernel_abort_call(call->rxcall,
+ abort_code);
+ call->error = ret;
+ call->state = AFS_CALL_ERROR;
+ break;
+ }
+ afs_data_delivered(skb);
+ skb = NULL;
+ continue;
+ case RXRPC_SKB_MARK_FINAL_ACK:
+ _debug("Rcv ACK");
+ call->state = AFS_CALL_COMPLETE;
+ break;
+ case RXRPC_SKB_MARK_BUSY:
+ _debug("Rcv BUSY");
+ call->error = -EBUSY;
+ call->state = AFS_CALL_BUSY;
+ break;
+ case RXRPC_SKB_MARK_REMOTE_ABORT:
+ abort_code = rxrpc_kernel_get_abort_code(skb);
+ call->error = call->type->abort_to_error(abort_code);
+ call->state = AFS_CALL_ABORTED;
+ _debug("Rcv ABORT %u -> %d", abort_code, call->error);
+ break;
+ case RXRPC_SKB_MARK_NET_ERROR:
+ call->error = -rxrpc_kernel_get_error_number(skb);
+ call->state = AFS_CALL_ERROR;
+ _debug("Rcv NET ERROR %d", call->error);
+ break;
+ case RXRPC_SKB_MARK_LOCAL_ERROR:
+ call->error = -rxrpc_kernel_get_error_number(skb);
+ call->state = AFS_CALL_ERROR;
+ _debug("Rcv LOCAL ERROR %d", call->error);
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ afs_free_skb(skb);
+ }
+
+ /* make sure the queue is empty if the call is done with (we might have
+ * aborted the call early because of an unmarshalling error) */
+ if (call->state >= AFS_CALL_COMPLETE) {
+ while ((skb = skb_dequeue(&call->rx_queue)))
+ afs_free_skb(skb);
+ if (call->incoming) {
+ rxrpc_kernel_end_call(call->rxcall);
+ call->rxcall = NULL;
+ call->type->destructor(call);
+ afs_free_call(call);
+ }
+ }
+
+ _leave("");
+}
+
+/*
+ * wait synchronously for a call to complete
+ */
+static int afs_wait_for_call_to_complete(struct afs_call *call)
+{
+ struct sk_buff *skb;
+ int ret;
+
+ DECLARE_WAITQUEUE(myself, current);
+
+ _enter("");
+
+ add_wait_queue(&call->waitq, &myself);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /* deliver any messages that are in the queue */
+ if (!skb_queue_empty(&call->rx_queue)) {
+ __set_current_state(TASK_RUNNING);
+ afs_deliver_to_call(call);
+ continue;
+ }
+
+ ret = call->error;
+ if (call->state >= AFS_CALL_COMPLETE)
+ break;
+ ret = -EINTR;
+ if (signal_pending(current))
+ break;
+ schedule();
+ }
+
+ remove_wait_queue(&call->waitq, &myself);
+ __set_current_state(TASK_RUNNING);
+
+ /* kill the call */
+ if (call->state < AFS_CALL_COMPLETE) {
+ _debug("call incomplete");
+ rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD);
+ while ((skb = skb_dequeue(&call->rx_queue)))
+ afs_free_skb(skb);
+ }
+
+ _debug("call complete");
+ rxrpc_kernel_end_call(call->rxcall);
+ call->rxcall = NULL;
+ call->type->destructor(call);
+ afs_free_call(call);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * wake up a waiting call
+ */
+static void afs_wake_up_call_waiter(struct afs_call *call)
+{
+ wake_up(&call->waitq);
+}
+
+/*
+ * wake up an asynchronous call
+ */
+static void afs_wake_up_async_call(struct afs_call *call)
+{
+ _enter("");
+ queue_work(afs_async_calls, &call->async_work);
+}
+
+/*
+ * put a call into asynchronous mode
+ * - mustn't touch the call descriptor as the call my have completed by the
+ * time we get here
+ */
+static int afs_dont_wait_for_call_to_complete(struct afs_call *call)
+{
+ _enter("");
+ return -EINPROGRESS;
+}
+
+/*
+ * delete an asynchronous call
+ */
+static void afs_delete_async_call(struct work_struct *work)
+{
+ struct afs_call *call =
+ container_of(work, struct afs_call, async_work);
+
+ _enter("");
+
+ afs_free_call(call);
+
+ _leave("");
+}
+
+/*
+ * perform processing on an asynchronous call
+ * - on a multiple-thread workqueue this work item may try to run on several
+ * CPUs at the same time
+ */
+static void afs_process_async_call(struct work_struct *work)
+{
+ struct afs_call *call =
+ container_of(work, struct afs_call, async_work);
+
+ _enter("");
+
+ if (!skb_queue_empty(&call->rx_queue))
+ afs_deliver_to_call(call);
+
+ if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) {
+ if (call->wait_mode->async_complete)
+ call->wait_mode->async_complete(call->reply,
+ call->error);
+ call->reply = NULL;
+
+ /* kill the call */
+ rxrpc_kernel_end_call(call->rxcall);
+ call->rxcall = NULL;
+ if (call->type->destructor)
+ call->type->destructor(call);
+
+ /* we can't just delete the call because the work item may be
+ * queued */
+ PREPARE_WORK(&call->async_work, afs_delete_async_call);
+ queue_work(afs_async_calls, &call->async_work);
+ }
+
+ _leave("");
+}
+
+/*
+ * empty a socket buffer into a flat reply buffer
+ */
+void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb)
+{
+ size_t len = skb->len;
+
+ if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0)
+ BUG();
+ call->reply_size += len;
+}
+
+/*
+ * accept the backlog of incoming calls
+ */
+static void afs_collect_incoming_call(struct work_struct *work)
+{
+ struct rxrpc_call *rxcall;
+ struct afs_call *call = NULL;
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&afs_incoming_calls))) {
+ _debug("new call");
+
+ /* don't need the notification */
+ afs_free_skb(skb);
+
+ if (!call) {
+ call = kzalloc(sizeof(struct afs_call), GFP_KERNEL);
+ if (!call) {
+ rxrpc_kernel_reject_call(afs_socket);
+ return;
+ }
+
+ INIT_WORK(&call->async_work, afs_process_async_call);
+ call->wait_mode = &afs_async_incoming_call;
+ call->type = &afs_RXCMxxxx;
+ init_waitqueue_head(&call->waitq);
+ skb_queue_head_init(&call->rx_queue);
+ call->state = AFS_CALL_AWAIT_OP_ID;
+
+ _debug("CALL %p{%s} [%d]",
+ call, call->type->name,
+ atomic_read(&afs_outstanding_calls));
+ atomic_inc(&afs_outstanding_calls);
+ }
+
+ rxcall = rxrpc_kernel_accept_call(afs_socket,
+ (unsigned long) call);
+ if (!IS_ERR(rxcall)) {
+ call->rxcall = rxcall;
+ call = NULL;
+ }
+ }
+
+ if (call)
+ afs_free_call(call);
+}
+
+/*
+ * grab the operation ID from an incoming cache manager call
+ */
+static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
+ bool last)
+{
+ size_t len = skb->len;
+ void *oibuf = (void *) &call->operation_ID;
+
+ _enter("{%u},{%zu},%d", call->offset, len, last);
+
+ ASSERTCMP(call->offset, <, 4);
+
+ /* the operation ID forms the first four bytes of the request data */
+ len = min_t(size_t, len, 4 - call->offset);
+ if (skb_copy_bits(skb, 0, oibuf + call->offset, len) < 0)
+ BUG();
+ if (!pskb_pull(skb, len))
+ BUG();
+ call->offset += len;
+
+ if (call->offset < 4) {
+ if (last) {
+ _leave(" = -EBADMSG [op ID short]");
+ return -EBADMSG;
+ }
+ _leave(" = 0 [incomplete]");
+ return 0;
+ }
+
+ call->state = AFS_CALL_AWAIT_REQUEST;
+
+ /* ask the cache manager to route the call (it'll change the call type
+ * if successful) */
+ if (!afs_cm_incoming_call(call))
+ return -ENOTSUPP;
+
+ /* pass responsibility for the remainer of this message off to the
+ * cache manager op */
+ return call->type->deliver(call, skb, last);
+}
+
+/*
+ * send an empty reply
+ */
+void afs_send_empty_reply(struct afs_call *call)
+{
+ struct msghdr msg;
+ struct iovec iov[1];
+
+ _enter("");
+
+ iov[0].iov_base = NULL;
+ iov[0].iov_len = 0;
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = iov;
+ msg.msg_iovlen = 0;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ call->state = AFS_CALL_AWAIT_ACK;
+ switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) {
+ case 0:
+ _leave(" [replied]");
+ return;
+
+ case -ENOMEM:
+ _debug("oom");
+ rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
+ default:
+ rxrpc_kernel_end_call(call->rxcall);
+ call->rxcall = NULL;
+ call->type->destructor(call);
+ afs_free_call(call);
+ _leave(" [error]");
+ return;
+ }
+}
+
+/*
+ * send a simple reply
+ */
+void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
+{
+ struct msghdr msg;
+ struct iovec iov[1];
+
+ _enter("");
+
+ iov[0].iov_base = (void *) buf;
+ iov[0].iov_len = len;
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ call->state = AFS_CALL_AWAIT_ACK;
+ switch (rxrpc_kernel_send_data(call->rxcall, &msg, len)) {
+ case 0:
+ _leave(" [replied]");
+ return;
+
+ case -ENOMEM:
+ _debug("oom");
+ rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
+ default:
+ rxrpc_kernel_end_call(call->rxcall);
+ call->rxcall = NULL;
+ call->type->destructor(call);
+ afs_free_call(call);
+ _leave(" [error]");
+ return;
+ }
+}
+
+/*
+ * extract a piece of data from the received data socket buffers
+ */
+int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
+ bool last, void *buf, size_t count)
+{
+ size_t len = skb->len;
+
+ _enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count);
+
+ ASSERTCMP(call->offset, <, count);
+
+ len = min_t(size_t, len, count - call->offset);
+ if (skb_copy_bits(skb, 0, buf + call->offset, len) < 0 ||
+ !pskb_pull(skb, len))
+ BUG();
+ call->offset += len;
+
+ if (call->offset < count) {
+ if (last) {
+ _leave(" = -EBADMSG [%d < %lu]", call->offset, count);
+ return -EBADMSG;
+ }
+ _leave(" = -EAGAIN");
+ return -EAGAIN;
+ }
+ return 0;
+}
diff --git a/fs/afs/security.c b/fs/afs/security.c
new file mode 100644
index 000000000000..f9f424d80458
--- /dev/null
+++ b/fs/afs/security.c
@@ -0,0 +1,356 @@
+/* AFS security handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/ctype.h>
+#include <keys/rxrpc-type.h>
+#include "internal.h"
+
+/*
+ * get a key
+ */
+struct key *afs_request_key(struct afs_cell *cell)
+{
+ struct key *key;
+
+ _enter("{%x}", key_serial(cell->anonymous_key));
+
+ _debug("key %s", cell->anonymous_key->description);
+ key = request_key(&key_type_rxrpc, cell->anonymous_key->description,
+ NULL);
+ if (IS_ERR(key)) {
+ if (PTR_ERR(key) != -ENOKEY) {
+ _leave(" = %ld", PTR_ERR(key));
+ return key;
+ }
+
+ /* act as anonymous user */
+ _leave(" = {%x} [anon]", key_serial(cell->anonymous_key));
+ return key_get(cell->anonymous_key);
+ } else {
+ /* act as authorised user */
+ _leave(" = {%x} [auth]", key_serial(key));
+ return key;
+ }
+}
+
+/*
+ * dispose of a permits list
+ */
+void afs_zap_permits(struct rcu_head *rcu)
+{
+ struct afs_permits *permits =
+ container_of(rcu, struct afs_permits, rcu);
+ int loop;
+
+ _enter("{%d}", permits->count);
+
+ for (loop = permits->count - 1; loop >= 0; loop--)
+ key_put(permits->permits[loop].key);
+ kfree(permits);
+}
+
+/*
+ * dispose of a permits list in which all the key pointers have been copied
+ */
+static void afs_dispose_of_permits(struct rcu_head *rcu)
+{
+ struct afs_permits *permits =
+ container_of(rcu, struct afs_permits, rcu);
+
+ _enter("{%d}", permits->count);
+
+ kfree(permits);
+}
+
+/*
+ * get the authorising vnode - this is the specified inode itself if it's a
+ * directory or it's the parent directory if the specified inode is a file or
+ * symlink
+ * - the caller must release the ref on the inode
+ */
+static struct afs_vnode *afs_get_auth_inode(struct afs_vnode *vnode,
+ struct key *key)
+{
+ struct afs_vnode *auth_vnode;
+ struct inode *auth_inode;
+
+ _enter("");
+
+ if (S_ISDIR(vnode->vfs_inode.i_mode)) {
+ auth_inode = igrab(&vnode->vfs_inode);
+ ASSERT(auth_inode != NULL);
+ } else {
+ auth_inode = afs_iget(vnode->vfs_inode.i_sb, key,
+ &vnode->status.parent, NULL, NULL);
+ if (IS_ERR(auth_inode))
+ return ERR_PTR(PTR_ERR(auth_inode));
+ }
+
+ auth_vnode = AFS_FS_I(auth_inode);
+ _leave(" = {%x}", auth_vnode->fid.vnode);
+ return auth_vnode;
+}
+
+/*
+ * clear the permit cache on a directory vnode
+ */
+void afs_clear_permits(struct afs_vnode *vnode)
+{
+ struct afs_permits *permits;
+
+ _enter("{%x}", vnode->fid.vnode);
+
+ mutex_lock(&vnode->permits_lock);
+ permits = vnode->permits;
+ rcu_assign_pointer(vnode->permits, NULL);
+ mutex_unlock(&vnode->permits_lock);
+
+ if (permits)
+ call_rcu(&permits->rcu, afs_zap_permits);
+ _leave("");
+}
+
+/*
+ * add the result obtained for a vnode to its or its parent directory's cache
+ * for the key used to access it
+ */
+void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
+{
+ struct afs_permits *permits, *xpermits;
+ struct afs_permit *permit;
+ struct afs_vnode *auth_vnode;
+ int count, loop;
+
+ _enter("{%x},%x,%lx", vnode->fid.vnode, key_serial(key), acl_order);
+
+ auth_vnode = afs_get_auth_inode(vnode, key);
+ if (IS_ERR(auth_vnode)) {
+ _leave(" [get error %ld]", PTR_ERR(auth_vnode));
+ return;
+ }
+
+ mutex_lock(&auth_vnode->permits_lock);
+
+ /* guard against a rename being detected whilst we waited for the
+ * lock */
+ if (memcmp(&auth_vnode->fid, &vnode->status.parent,
+ sizeof(struct afs_fid)) != 0) {
+ _debug("renamed");
+ goto out_unlock;
+ }
+
+ /* have to be careful as the directory's callback may be broken between
+ * us receiving the status we're trying to cache and us getting the
+ * lock to update the cache for the status */
+ if (auth_vnode->acl_order - acl_order > 0) {
+ _debug("ACL changed?");
+ goto out_unlock;
+ }
+
+ /* always update the anonymous mask */
+ _debug("anon access %x", vnode->status.anon_access);
+ auth_vnode->status.anon_access = vnode->status.anon_access;
+ if (key == vnode->volume->cell->anonymous_key)
+ goto out_unlock;
+
+ xpermits = auth_vnode->permits;
+ count = 0;
+ if (xpermits) {
+ /* see if the permit is already in the list
+ * - if it is then we just amend the list
+ */
+ count = xpermits->count;
+ permit = xpermits->permits;
+ for (loop = count; loop > 0; loop--) {
+ if (permit->key == key) {
+ permit->access_mask =
+ vnode->status.caller_access;
+ goto out_unlock;
+ }
+ permit++;
+ }
+ }
+
+ permits = kmalloc(sizeof(*permits) + sizeof(*permit) * (count + 1),
+ GFP_NOFS);
+ if (!permits)
+ goto out_unlock;
+
+ memcpy(permits->permits, xpermits->permits,
+ count * sizeof(struct afs_permit));
+
+ _debug("key %x access %x",
+ key_serial(key), vnode->status.caller_access);
+ permits->permits[count].access_mask = vnode->status.caller_access;
+ permits->permits[count].key = key_get(key);
+ permits->count = count + 1;
+
+ rcu_assign_pointer(auth_vnode->permits, permits);
+ if (xpermits)
+ call_rcu(&xpermits->rcu, afs_dispose_of_permits);
+
+out_unlock:
+ mutex_unlock(&auth_vnode->permits_lock);
+ iput(&auth_vnode->vfs_inode);
+ _leave("");
+}
+
+/*
+ * check with the fileserver to see if the directory or parent directory is
+ * permitted to be accessed with this authorisation, and if so, what access it
+ * is granted
+ */
+static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
+ afs_access_t *_access)
+{
+ struct afs_permits *permits;
+ struct afs_permit *permit;
+ struct afs_vnode *auth_vnode;
+ bool valid;
+ int loop, ret;
+
+ _enter("");
+
+ auth_vnode = afs_get_auth_inode(vnode, key);
+ if (IS_ERR(auth_vnode)) {
+ *_access = 0;
+ _leave(" = %ld", PTR_ERR(auth_vnode));
+ return PTR_ERR(auth_vnode);
+ }
+
+ ASSERT(S_ISDIR(auth_vnode->vfs_inode.i_mode));
+
+ /* check the permits to see if we've got one yet */
+ if (key == auth_vnode->volume->cell->anonymous_key) {
+ _debug("anon");
+ *_access = auth_vnode->status.anon_access;
+ valid = true;
+ } else {
+ valid = false;
+ rcu_read_lock();
+ permits = rcu_dereference(auth_vnode->permits);
+ if (permits) {
+ permit = permits->permits;
+ for (loop = permits->count; loop > 0; loop--) {
+ if (permit->key == key) {
+ _debug("found in cache");
+ *_access = permit->access_mask;
+ valid = true;
+ break;
+ }
+ permit++;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ if (!valid) {
+ /* check the status on the file we're actually interested in
+ * (the post-processing will cache the result on auth_vnode) */
+ _debug("no valid permit");
+
+ set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+ ret = afs_vnode_fetch_status(vnode, auth_vnode, key);
+ if (ret < 0) {
+ iput(&auth_vnode->vfs_inode);
+ *_access = 0;
+ _leave(" = %d", ret);
+ return ret;
+ }
+ }
+
+ *_access = vnode->status.caller_access;
+ iput(&auth_vnode->vfs_inode);
+ _leave(" = 0 [access %x]", *_access);
+ return 0;
+}
+
+/*
+ * check the permissions on an AFS file
+ * - AFS ACLs are attached to directories only, and a file is controlled by its
+ * parent directory's ACL
+ */
+int afs_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+ afs_access_t access;
+ struct key *key;
+ int ret;
+
+ _enter("{{%x:%x},%lx},%x,",
+ vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
+
+ key = afs_request_key(vnode->volume->cell);
+ if (IS_ERR(key)) {
+ _leave(" = %ld [key]", PTR_ERR(key));
+ return PTR_ERR(key);
+ }
+
+ /* if the promise has expired, we need to check the server again */
+ if (!vnode->cb_promised) {
+ _debug("not promised");
+ ret = afs_vnode_fetch_status(vnode, NULL, key);
+ if (ret < 0)
+ goto error;
+ _debug("new promise [fl=%lx]", vnode->flags);
+ }
+
+ /* check the permits to see if we've got one yet */
+ ret = afs_check_permit(vnode, key, &access);
+ if (ret < 0)
+ goto error;
+
+ /* interpret the access mask */
+ _debug("REQ %x ACC %x on %s",
+ mask, access, S_ISDIR(inode->i_mode) ? "dir" : "file");
+
+ if (S_ISDIR(inode->i_mode)) {
+ if (mask & MAY_EXEC) {
+ if (!(access & AFS_ACE_LOOKUP))
+ goto permission_denied;
+ } else if (mask & MAY_READ) {
+ if (!(access & AFS_ACE_READ))
+ goto permission_denied;
+ } else if (mask & MAY_WRITE) {
+ if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */
+ AFS_ACE_INSERT | /* create, mkdir, symlink, rename to */
+ AFS_ACE_WRITE))) /* chmod */
+ goto permission_denied;
+ } else {
+ BUG();
+ }
+ } else {
+ if (!(access & AFS_ACE_LOOKUP))
+ goto permission_denied;
+ if (mask & (MAY_EXEC | MAY_READ)) {
+ if (!(access & AFS_ACE_READ))
+ goto permission_denied;
+ } else if (mask & MAY_WRITE) {
+ if (!(access & AFS_ACE_WRITE))
+ goto permission_denied;
+ }
+ }
+
+ key_put(key);
+ ret = generic_permission(inode, mask, NULL);
+ _leave(" = %d", ret);
+ return ret;
+
+permission_denied:
+ ret = -EACCES;
+error:
+ key_put(key);
+ _leave(" = %d", ret);
+ return ret;
+}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 44aff81dc6a7..96bb23b476a2 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -1,6 +1,6 @@
-/* server.c: AFS server record management
+/* AFS server record management
*
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
@@ -11,489 +11,314 @@
#include <linux/sched.h>
#include <linux/slab.h>
-#include <rxrpc/peer.h>
-#include <rxrpc/connection.h>
-#include "volume.h"
-#include "cell.h"
-#include "server.h"
-#include "transport.h"
-#include "vlclient.h"
-#include "kafstimod.h"
#include "internal.h"
-DEFINE_SPINLOCK(afs_server_peer_lock);
+unsigned afs_server_timeout = 10; /* server timeout in seconds */
-#define FS_SERVICE_ID 1 /* AFS Volume Location Service ID */
-#define VL_SERVICE_ID 52 /* AFS Volume Location Service ID */
+static void afs_reap_server(struct work_struct *);
-static void __afs_server_timeout(struct afs_timer *timer)
+/* tree of all the servers, indexed by IP address */
+static struct rb_root afs_servers = RB_ROOT;
+static DEFINE_RWLOCK(afs_servers_lock);
+
+/* LRU list of all the servers not currently in use */
+static LIST_HEAD(afs_server_graveyard);
+static DEFINE_SPINLOCK(afs_server_graveyard_lock);
+static DECLARE_DELAYED_WORK(afs_server_reaper, afs_reap_server);
+
+/*
+ * install a server record in the master tree
+ */
+static int afs_install_server(struct afs_server *server)
{
- struct afs_server *server =
- list_entry(timer, struct afs_server, timeout);
+ struct afs_server *xserver;
+ struct rb_node **pp, *p;
+ int ret;
- _debug("SERVER TIMEOUT [%p{u=%d}]",
- server, atomic_read(&server->usage));
+ _enter("%p", server);
- afs_server_do_timeout(server);
-}
+ write_lock(&afs_servers_lock);
+
+ ret = -EEXIST;
+ pp = &afs_servers.rb_node;
+ p = NULL;
+ while (*pp) {
+ p = *pp;
+ _debug("- consider %p", p);
+ xserver = rb_entry(p, struct afs_server, master_rb);
+ if (server->addr.s_addr < xserver->addr.s_addr)
+ pp = &(*pp)->rb_left;
+ else if (server->addr.s_addr > xserver->addr.s_addr)
+ pp = &(*pp)->rb_right;
+ else
+ goto error;
+ }
-static const struct afs_timer_ops afs_server_timer_ops = {
- .timed_out = __afs_server_timeout,
-};
+ rb_link_node(&server->master_rb, p, pp);
+ rb_insert_color(&server->master_rb, &afs_servers);
+ ret = 0;
+
+error:
+ write_unlock(&afs_servers_lock);
+ return ret;
+}
-/*****************************************************************************/
/*
- * lookup a server record in a cell
- * - TODO: search the cell's server list
+ * allocate a new server record
*/
-int afs_server_lookup(struct afs_cell *cell, const struct in_addr *addr,
- struct afs_server **_server)
+static struct afs_server *afs_alloc_server(struct afs_cell *cell,
+ const struct in_addr *addr)
{
- struct afs_server *server, *active, *zombie;
- int loop;
+ struct afs_server *server;
- _enter("%p,%08x,", cell, ntohl(addr->s_addr));
+ _enter("");
- /* allocate and initialise a server record */
server = kzalloc(sizeof(struct afs_server), GFP_KERNEL);
- if (!server) {
- _leave(" = -ENOMEM");
- return -ENOMEM;
+ if (server) {
+ atomic_set(&server->usage, 1);
+ server->cell = cell;
+
+ INIT_LIST_HEAD(&server->link);
+ INIT_LIST_HEAD(&server->grave);
+ init_rwsem(&server->sem);
+ spin_lock_init(&server->fs_lock);
+ server->fs_vnodes = RB_ROOT;
+ server->cb_promises = RB_ROOT;
+ spin_lock_init(&server->cb_lock);
+ init_waitqueue_head(&server->cb_break_waitq);
+ INIT_DELAYED_WORK(&server->cb_break_work,
+ afs_dispatch_give_up_callbacks);
+
+ memcpy(&server->addr, addr, sizeof(struct in_addr));
+ server->addr.s_addr = addr->s_addr;
}
- atomic_set(&server->usage, 1);
-
- INIT_LIST_HEAD(&server->link);
- init_rwsem(&server->sem);
- INIT_LIST_HEAD(&server->fs_callq);
- spin_lock_init(&server->fs_lock);
- INIT_LIST_HEAD(&server->cb_promises);
- spin_lock_init(&server->cb_lock);
-
- for (loop = 0; loop < AFS_SERVER_CONN_LIST_SIZE; loop++)
- server->fs_conn_cnt[loop] = 4;
+ _leave(" = %p{%d}", server, atomic_read(&server->usage));
+ return server;
+}
- memcpy(&server->addr, addr, sizeof(struct in_addr));
- server->addr.s_addr = addr->s_addr;
+/*
+ * get an FS-server record for a cell
+ */
+struct afs_server *afs_lookup_server(struct afs_cell *cell,
+ const struct in_addr *addr)
+{
+ struct afs_server *server, *candidate;
- afs_timer_init(&server->timeout, &afs_server_timer_ops);
+ _enter("%p,"NIPQUAD_FMT, cell, NIPQUAD(addr->s_addr));
- /* add to the cell */
- write_lock(&cell->sv_lock);
+ /* quick scan of the list to see if we already have the server */
+ read_lock(&cell->servers_lock);
- /* check the active list */
- list_for_each_entry(active, &cell->sv_list, link) {
- if (active->addr.s_addr == addr->s_addr)
- goto use_active_server;
+ list_for_each_entry(server, &cell->servers, link) {
+ if (server->addr.s_addr == addr->s_addr)
+ goto found_server_quickly;
}
+ read_unlock(&cell->servers_lock);
- /* check the inactive list */
- spin_lock(&cell->sv_gylock);
- list_for_each_entry(zombie, &cell->sv_graveyard, link) {
- if (zombie->addr.s_addr == addr->s_addr)
- goto resurrect_server;
+ candidate = afs_alloc_server(cell, addr);
+ if (!candidate) {
+ _leave(" = -ENOMEM");
+ return ERR_PTR(-ENOMEM);
}
- spin_unlock(&cell->sv_gylock);
- afs_get_cell(cell);
- server->cell = cell;
- list_add_tail(&server->link, &cell->sv_list);
+ write_lock(&cell->servers_lock);
- write_unlock(&cell->sv_lock);
+ /* check the cell's server list again */
+ list_for_each_entry(server, &cell->servers, link) {
+ if (server->addr.s_addr == addr->s_addr)
+ goto found_server;
+ }
- *_server = server;
- _leave(" = 0 (%p)", server);
- return 0;
+ _debug("new");
+ server = candidate;
+ if (afs_install_server(server) < 0)
+ goto server_in_two_cells;
- /* found a matching active server */
- use_active_server:
- _debug("active server");
- afs_get_server(active);
- write_unlock(&cell->sv_lock);
+ afs_get_cell(cell);
+ list_add_tail(&server->link, &cell->servers);
+
+ write_unlock(&cell->servers_lock);
+ _leave(" = %p{%d}", server, atomic_read(&server->usage));
+ return server;
+
+ /* found a matching server quickly */
+found_server_quickly:
+ _debug("found quickly");
+ afs_get_server(server);
+ read_unlock(&cell->servers_lock);
+no_longer_unused:
+ if (!list_empty(&server->grave)) {
+ spin_lock(&afs_server_graveyard_lock);
+ list_del_init(&server->grave);
+ spin_unlock(&afs_server_graveyard_lock);
+ }
+ _leave(" = %p{%d}", server, atomic_read(&server->usage));
+ return server;
+
+ /* found a matching server on the second pass */
+found_server:
+ _debug("found");
+ afs_get_server(server);
+ write_unlock(&cell->servers_lock);
+ kfree(candidate);
+ goto no_longer_unused;
+
+ /* found a server that seems to be in two cells */
+server_in_two_cells:
+ write_unlock(&cell->servers_lock);
+ kfree(candidate);
+ printk(KERN_NOTICE "kAFS:"
+ " Server "NIPQUAD_FMT" appears to be in two cells\n",
+ NIPQUAD(*addr));
+ _leave(" = -EEXIST");
+ return ERR_PTR(-EEXIST);
+}
- kfree(server);
+/*
+ * look up a server by its IP address
+ */
+struct afs_server *afs_find_server(const struct in_addr *_addr)
+{
+ struct afs_server *server = NULL;
+ struct rb_node *p;
+ struct in_addr addr = *_addr;
- *_server = active;
- _leave(" = 0 (%p)", active);
- return 0;
+ _enter(NIPQUAD_FMT, NIPQUAD(addr.s_addr));
- /* found a matching server in the graveyard, so resurrect it and
- * dispose of the new record */
- resurrect_server:
- _debug("resurrecting server");
+ read_lock(&afs_servers_lock);
- list_move_tail(&zombie->link, &cell->sv_list);
- afs_get_server(zombie);
- afs_kafstimod_del_timer(&zombie->timeout);
- spin_unlock(&cell->sv_gylock);
- write_unlock(&cell->sv_lock);
+ p = afs_servers.rb_node;
+ while (p) {
+ server = rb_entry(p, struct afs_server, master_rb);
- kfree(server);
+ _debug("- consider %p", p);
- *_server = zombie;
- _leave(" = 0 (%p)", zombie);
- return 0;
+ if (addr.s_addr < server->addr.s_addr) {
+ p = p->rb_left;
+ } else if (addr.s_addr > server->addr.s_addr) {
+ p = p->rb_right;
+ } else {
+ afs_get_server(server);
+ goto found;
+ }
+ }
-} /* end afs_server_lookup() */
+ server = NULL;
+found:
+ read_unlock(&afs_servers_lock);
+ ASSERTIFCMP(server, server->addr.s_addr, ==, addr.s_addr);
+ _leave(" = %p", server);
+ return server;
+}
-/*****************************************************************************/
/*
* destroy a server record
* - removes from the cell list
*/
void afs_put_server(struct afs_server *server)
{
- struct afs_cell *cell;
-
if (!server)
return;
- _enter("%p", server);
-
- cell = server->cell;
+ _enter("%p{%d}", server, atomic_read(&server->usage));
- /* sanity check */
- BUG_ON(atomic_read(&server->usage) <= 0);
+ _debug("PUT SERVER %d", atomic_read(&server->usage));
- /* to prevent a race, the decrement and the dequeue must be effectively
- * atomic */
- write_lock(&cell->sv_lock);
+ ASSERTCMP(atomic_read(&server->usage), >, 0);
if (likely(!atomic_dec_and_test(&server->usage))) {
- write_unlock(&cell->sv_lock);
_leave("");
return;
}
- spin_lock(&cell->sv_gylock);
- list_move_tail(&server->link, &cell->sv_graveyard);
+ afs_flush_callback_breaks(server);
- /* time out in 10 secs */
- afs_kafstimod_add_timer(&server->timeout, 10 * HZ);
-
- spin_unlock(&cell->sv_gylock);
- write_unlock(&cell->sv_lock);
-
- _leave(" [killed]");
-} /* end afs_put_server() */
+ spin_lock(&afs_server_graveyard_lock);
+ if (atomic_read(&server->usage) == 0) {
+ list_move_tail(&server->grave, &afs_server_graveyard);
+ server->time_of_death = get_seconds();
+ schedule_delayed_work(&afs_server_reaper,
+ afs_server_timeout * HZ);
+ }
+ spin_unlock(&afs_server_graveyard_lock);
+ _leave(" [dead]");
+}
-/*****************************************************************************/
/*
- * timeout server record
- * - removes from the cell's graveyard if the usage count is zero
+ * destroy a dead server
*/
-void afs_server_do_timeout(struct afs_server *server)
+static void afs_destroy_server(struct afs_server *server)
{
- struct rxrpc_peer *peer;
- struct afs_cell *cell;
- int loop;
-
_enter("%p", server);
- cell = server->cell;
-
- BUG_ON(atomic_read(&server->usage) < 0);
-
- /* remove from graveyard if still dead */
- spin_lock(&cell->vl_gylock);
- if (atomic_read(&server->usage) == 0)
- list_del_init(&server->link);
- else
- server = NULL;
- spin_unlock(&cell->vl_gylock);
-
- if (!server) {
- _leave("");
- return; /* resurrected */
- }
-
- /* we can now destroy it properly */
- afs_put_cell(cell);
-
- /* uncross-point the structs under a global lock */
- spin_lock(&afs_server_peer_lock);
- peer = server->peer;
- if (peer) {
- server->peer = NULL;
- peer->user = NULL;
- }
- spin_unlock(&afs_server_peer_lock);
-
- /* finish cleaning up the server */
- for (loop = AFS_SERVER_CONN_LIST_SIZE - 1; loop >= 0; loop--)
- if (server->fs_conn[loop])
- rxrpc_put_connection(server->fs_conn[loop]);
-
- if (server->vlserver)
- rxrpc_put_connection(server->vlserver);
+ ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL);
+ ASSERTCMP(server->cb_promises.rb_node, ==, NULL);
+ ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail);
+ ASSERTCMP(atomic_read(&server->cb_break_n), ==, 0);
+ afs_put_cell(server->cell);
kfree(server);
+}
- _leave(" [destroyed]");
-} /* end afs_server_do_timeout() */
-
-/*****************************************************************************/
/*
- * get a callslot on a connection to the fileserver on the specified server
+ * reap dead server records
*/
-int afs_server_request_callslot(struct afs_server *server,
- struct afs_server_callslot *callslot)
+static void afs_reap_server(struct work_struct *work)
{
- struct afs_server_callslot *pcallslot;
- struct rxrpc_connection *conn;
- int nconn, ret;
-
- _enter("%p,",server);
-
- INIT_LIST_HEAD(&callslot->link);
- callslot->task = current;
- callslot->conn = NULL;
- callslot->nconn = -1;
- callslot->ready = 0;
-
- ret = 0;
- conn = NULL;
-
- /* get hold of a callslot first */
- spin_lock(&server->fs_lock);
-
- /* resurrect the server if it's death timeout has expired */
- if (server->fs_state) {
- if (time_before(jiffies, server->fs_dead_jif)) {
- ret = server->fs_state;
- spin_unlock(&server->fs_lock);
- _leave(" = %d [still dead]", ret);
- return ret;
+ LIST_HEAD(corpses);
+ struct afs_server *server;
+ unsigned long delay, expiry;
+ time_t now;
+
+ now = get_seconds();
+ spin_lock(&afs_server_graveyard_lock);
+
+ while (!list_empty(&afs_server_graveyard)) {
+ server = list_entry(afs_server_graveyard.next,
+ struct afs_server, grave);
+
+ /* the queue is ordered most dead first */
+ expiry = server->time_of_death + afs_server_timeout;
+ if (expiry > now) {
+ delay = (expiry - now) * HZ;
+ if (!schedule_delayed_work(&afs_server_reaper, delay)) {
+ cancel_delayed_work(&afs_server_reaper);
+ schedule_delayed_work(&afs_server_reaper,
+ delay);
+ }
+ break;
}
- server->fs_state = 0;
- }
-
- /* try and find a connection that has spare callslots */
- for (nconn = 0; nconn < AFS_SERVER_CONN_LIST_SIZE; nconn++) {
- if (server->fs_conn_cnt[nconn] > 0) {
- server->fs_conn_cnt[nconn]--;
- spin_unlock(&server->fs_lock);
- callslot->nconn = nconn;
- goto obtained_slot;
+ write_lock(&server->cell->servers_lock);
+ write_lock(&afs_servers_lock);
+ if (atomic_read(&server->usage) > 0) {
+ list_del_init(&server->grave);
+ } else {
+ list_move_tail(&server->grave, &corpses);
+ list_del_init(&server->link);
+ rb_erase(&server->master_rb, &afs_servers);
}
+ write_unlock(&afs_servers_lock);
+ write_unlock(&server->cell->servers_lock);
}
- /* none were available - wait interruptibly for one to become
- * available */
- set_current_state(TASK_INTERRUPTIBLE);
- list_add_tail(&callslot->link, &server->fs_callq);
- spin_unlock(&server->fs_lock);
-
- while (!callslot->ready && !signal_pending(current)) {
- schedule();
- set_current_state(TASK_INTERRUPTIBLE);
- }
-
- set_current_state(TASK_RUNNING);
-
- /* even if we were interrupted we may still be queued */
- if (!callslot->ready) {
- spin_lock(&server->fs_lock);
- list_del_init(&callslot->link);
- spin_unlock(&server->fs_lock);
- }
-
- nconn = callslot->nconn;
+ spin_unlock(&afs_server_graveyard_lock);
- /* if interrupted, we must release any slot we also got before
- * returning an error */
- if (signal_pending(current)) {
- ret = -EINTR;
- goto error_release;
+ /* now reap the corpses we've extracted */
+ while (!list_empty(&corpses)) {
+ server = list_entry(corpses.next, struct afs_server, grave);
+ list_del(&server->grave);
+ afs_destroy_server(server);
}
+}
- /* if we were woken up with an error, then pass that error back to the
- * called */
- if (nconn < 0) {
- _leave(" = %d", callslot->errno);
- return callslot->errno;
- }
-
- /* were we given a connection directly? */
- if (callslot->conn) {
- /* yes - use it */
- _leave(" = 0 (nc=%d)", nconn);
- return 0;
- }
-
- /* got a callslot, but no connection */
- obtained_slot:
-
- /* need to get hold of the RxRPC connection */
- down_write(&server->sem);
-
- /* quick check to see if there's an outstanding error */
- ret = server->fs_state;
- if (ret)
- goto error_release_upw;
-
- if (server->fs_conn[nconn]) {
- /* reuse an existing connection */
- rxrpc_get_connection(server->fs_conn[nconn]);
- callslot->conn = server->fs_conn[nconn];
- }
- else {
- /* create a new connection */
- ret = rxrpc_create_connection(afs_transport,
- htons(7000),
- server->addr.s_addr,
- FS_SERVICE_ID,
- NULL,
- &server->fs_conn[nconn]);
-
- if (ret < 0)
- goto error_release_upw;
-
- callslot->conn = server->fs_conn[0];
- rxrpc_get_connection(callslot->conn);
- }
-
- up_write(&server->sem);
-
- _leave(" = 0");
- return 0;
-
- /* handle an error occurring */
- error_release_upw:
- up_write(&server->sem);
-
- error_release:
- /* either release the callslot or pass it along to another deserving
- * task */
- spin_lock(&server->fs_lock);
-
- if (nconn < 0) {
- /* no callslot allocated */
- }
- else if (list_empty(&server->fs_callq)) {
- /* no one waiting */
- server->fs_conn_cnt[nconn]++;
- spin_unlock(&server->fs_lock);
- }
- else {
- /* someone's waiting - dequeue them and wake them up */
- pcallslot = list_entry(server->fs_callq.next,
- struct afs_server_callslot, link);
- list_del_init(&pcallslot->link);
-
- pcallslot->errno = server->fs_state;
- if (!pcallslot->errno) {
- /* pass them out callslot details */
- callslot->conn = xchg(&pcallslot->conn,
- callslot->conn);
- pcallslot->nconn = nconn;
- callslot->nconn = nconn = -1;
- }
- pcallslot->ready = 1;
- wake_up_process(pcallslot->task);
- spin_unlock(&server->fs_lock);
- }
-
- rxrpc_put_connection(callslot->conn);
- callslot->conn = NULL;
-
- _leave(" = %d", ret);
- return ret;
-
-} /* end afs_server_request_callslot() */
-
-/*****************************************************************************/
-/*
- * release a callslot back to the server
- * - transfers the RxRPC connection to the next pending callslot if possible
- */
-void afs_server_release_callslot(struct afs_server *server,
- struct afs_server_callslot *callslot)
-{
- struct afs_server_callslot *pcallslot;
-
- _enter("{ad=%08x,cnt=%u},{%d}",
- ntohl(server->addr.s_addr),
- server->fs_conn_cnt[callslot->nconn],
- callslot->nconn);
-
- BUG_ON(callslot->nconn < 0);
-
- spin_lock(&server->fs_lock);
-
- if (list_empty(&server->fs_callq)) {
- /* no one waiting */
- server->fs_conn_cnt[callslot->nconn]++;
- spin_unlock(&server->fs_lock);
- }
- else {
- /* someone's waiting - dequeue them and wake them up */
- pcallslot = list_entry(server->fs_callq.next,
- struct afs_server_callslot, link);
- list_del_init(&pcallslot->link);
-
- pcallslot->errno = server->fs_state;
- if (!pcallslot->errno) {
- /* pass them out callslot details */
- callslot->conn = xchg(&pcallslot->conn, callslot->conn);
- pcallslot->nconn = callslot->nconn;
- callslot->nconn = -1;
- }
-
- pcallslot->ready = 1;
- wake_up_process(pcallslot->task);
- spin_unlock(&server->fs_lock);
- }
-
- rxrpc_put_connection(callslot->conn);
-
- _leave("");
-} /* end afs_server_release_callslot() */
-
-/*****************************************************************************/
/*
- * get a handle to a connection to the vlserver (volume location) on the
- * specified server
+ * discard all the server records for rmmod
*/
-int afs_server_get_vlconn(struct afs_server *server,
- struct rxrpc_connection **_conn)
+void __exit afs_purge_servers(void)
{
- struct rxrpc_connection *conn;
- int ret;
-
- _enter("%p,", server);
-
- ret = 0;
- conn = NULL;
- down_read(&server->sem);
-
- if (server->vlserver) {
- /* reuse an existing connection */
- rxrpc_get_connection(server->vlserver);
- conn = server->vlserver;
- up_read(&server->sem);
- }
- else {
- /* create a new connection */
- up_read(&server->sem);
- down_write(&server->sem);
- if (!server->vlserver) {
- ret = rxrpc_create_connection(afs_transport,
- htons(7003),
- server->addr.s_addr,
- VL_SERVICE_ID,
- NULL,
- &server->vlserver);
- }
- if (ret == 0) {
- rxrpc_get_connection(server->vlserver);
- conn = server->vlserver;
- }
- up_write(&server->sem);
- }
-
- *_conn = conn;
- _leave(" = %d", ret);
- return ret;
-} /* end afs_server_get_vlconn() */
+ afs_server_timeout = 0;
+ cancel_delayed_work(&afs_server_reaper);
+ schedule_delayed_work(&afs_server_reaper, 0);
+}
diff --git a/fs/afs/server.h b/fs/afs/server.h
deleted file mode 100644
index c3d24115578f..000000000000
--- a/fs/afs/server.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* server.h: AFS server record
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_SERVER_H
-#define _LINUX_AFS_SERVER_H
-
-#include "types.h"
-#include "kafstimod.h"
-#include <rxrpc/peer.h>
-#include <linux/rwsem.h>
-
-extern spinlock_t afs_server_peer_lock;
-
-/*****************************************************************************/
-/*
- * AFS server record
- */
-struct afs_server
-{
- atomic_t usage;
- struct afs_cell *cell; /* cell in which server resides */
- struct list_head link; /* link in cell's server list */
- struct rw_semaphore sem; /* access lock */
- struct afs_timer timeout; /* graveyard timeout */
- struct in_addr addr; /* server address */
- struct rxrpc_peer *peer; /* peer record for this server */
- struct rxrpc_connection *vlserver; /* connection to the volume location service */
-
- /* file service access */
-#define AFS_SERVER_CONN_LIST_SIZE 2
- struct rxrpc_connection *fs_conn[AFS_SERVER_CONN_LIST_SIZE]; /* FS connections */
- unsigned fs_conn_cnt[AFS_SERVER_CONN_LIST_SIZE]; /* per conn call count */
- struct list_head fs_callq; /* queue of processes waiting to make a call */
- spinlock_t fs_lock; /* access lock */
- int fs_state; /* 0 or reason FS currently marked dead (-errno) */
- unsigned fs_rtt; /* FS round trip time */
- unsigned long fs_act_jif; /* time at which last activity occurred */
- unsigned long fs_dead_jif; /* time at which no longer to be considered dead */
-
- /* callback promise management */
- struct list_head cb_promises; /* as yet unbroken promises from this server */
- spinlock_t cb_lock; /* access lock */
-};
-
-extern int afs_server_lookup(struct afs_cell *cell,
- const struct in_addr *addr,
- struct afs_server **_server);
-
-#define afs_get_server(S) do { atomic_inc(&(S)->usage); } while(0)
-
-extern void afs_put_server(struct afs_server *server);
-extern void afs_server_do_timeout(struct afs_server *server);
-
-extern int afs_server_find_by_peer(const struct rxrpc_peer *peer,
- struct afs_server **_server);
-
-extern int afs_server_get_vlconn(struct afs_server *server,
- struct rxrpc_connection **_conn);
-
-static inline
-struct afs_server *afs_server_get_from_peer(struct rxrpc_peer *peer)
-{
- struct afs_server *server;
-
- spin_lock(&afs_server_peer_lock);
- server = peer->user;
- if (server)
- afs_get_server(server);
- spin_unlock(&afs_server_peer_lock);
-
- return server;
-}
-
-/*****************************************************************************/
-/*
- * AFS server callslot grant record
- */
-struct afs_server_callslot
-{
- struct list_head link; /* link in server's list */
- struct task_struct *task; /* process waiting to make call */
- struct rxrpc_connection *conn; /* connection to use (or NULL on error) */
- short nconn; /* connection slot number (-1 on error) */
- char ready; /* T when ready */
- int errno; /* error number if nconn==-1 */
-};
-
-extern int afs_server_request_callslot(struct afs_server *server,
- struct afs_server_callslot *callslot);
-
-extern void afs_server_release_callslot(struct afs_server *server,
- struct afs_server_callslot *callslot);
-
-#endif /* _LINUX_AFS_SERVER_H */
diff --git a/fs/afs/super.c b/fs/afs/super.c
index eb7e32349da3..cebd03c91f57 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -1,5 +1,6 @@
-/*
- * Copyright (c) 2002 Red Hat, Inc. All rights reserved.
+/* AFS superblock handling
+ *
+ * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
*
* This software may be freely redistributed under the terms of the
* GNU General Public License.
@@ -9,7 +10,7 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Authors: David Howells <dhowells@redhat.com>
- * David Woodhouse <dwmw2@cambridge.redhat.com>
+ * David Woodhouse <dwmw2@redhat.com>
*
*/
@@ -19,22 +20,10 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
-#include "vnode.h"
-#include "volume.h"
-#include "cell.h"
-#include "cmservice.h"
-#include "fsclient.h"
-#include "super.h"
#include "internal.h"
#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
-struct afs_mount_params {
- int rwpath;
- struct afs_cell *default_cell;
- struct afs_volume *volume;
-};
-
static void afs_i_init_once(void *foo, struct kmem_cache *cachep,
unsigned long flags);
@@ -62,13 +51,13 @@ static const struct super_operations afs_super_ops = {
.drop_inode = generic_delete_inode,
.destroy_inode = afs_destroy_inode,
.clear_inode = afs_clear_inode,
+ .umount_begin = afs_umount_begin,
.put_super = afs_put_super,
};
static struct kmem_cache *afs_inode_cachep;
static atomic_t afs_count_active_inodes;
-/*****************************************************************************/
/*
* initialise the filesystem
*/
@@ -78,8 +67,6 @@ int __init afs_fs_init(void)
_enter("");
- afs_timer_init(&afs_mntpt_expiry_timer, &afs_mntpt_expiry_timer_ops);
-
/* create ourselves an inode cache */
atomic_set(&afs_count_active_inodes, 0);
@@ -99,20 +86,22 @@ int __init afs_fs_init(void)
ret = register_filesystem(&afs_fs_type);
if (ret < 0) {
kmem_cache_destroy(afs_inode_cachep);
- kleave(" = %d", ret);
+ _leave(" = %d", ret);
return ret;
}
- kleave(" = 0");
+ _leave(" = 0");
return 0;
-} /* end afs_fs_init() */
+}
-/*****************************************************************************/
/*
* clean up the filesystem
*/
void __exit afs_fs_exit(void)
{
+ _enter("");
+
+ afs_mntpt_kill_timer();
unregister_filesystem(&afs_fs_type);
if (atomic_read(&afs_count_active_inodes) != 0) {
@@ -122,10 +111,9 @@ void __exit afs_fs_exit(void)
}
kmem_cache_destroy(afs_inode_cachep);
+ _leave("");
+}
-} /* end afs_fs_exit() */
-
-/*****************************************************************************/
/*
* check that an argument has a value
*/
@@ -136,9 +124,8 @@ static int want_arg(char **_value, const char *option)
return 0;
}
return 1;
-} /* end want_arg() */
+}
-/*****************************************************************************/
/*
* check that there's no subsequent value
*/
@@ -150,18 +137,17 @@ static int want_no_value(char *const *_value, const char *option)
return 0;
}
return 1;
-} /* end want_no_value() */
+}
-/*****************************************************************************/
/*
* parse the mount options
* - this function has been shamelessly adapted from the ext3 fs which
* shamelessly adapted it from the msdos fs
*/
-static int afs_super_parse_options(struct afs_mount_params *params,
- char *options,
- const char **devname)
+static int afs_parse_options(struct afs_mount_params *params,
+ char *options, const char **devname)
{
+ struct afs_cell *cell;
char *key, *value;
int ret;
@@ -170,51 +156,135 @@ static int afs_super_parse_options(struct afs_mount_params *params,
options[PAGE_SIZE - 1] = 0;
ret = 0;
- while ((key = strsep(&options, ",")) != 0)
- {
+ while ((key = strsep(&options, ","))) {
value = strchr(key, '=');
if (value)
*value++ = 0;
- printk("kAFS: KEY: %s, VAL:%s\n", key, value ?: "-");
+ _debug("kAFS: KEY: %s, VAL:%s", key, value ?: "-");
if (strcmp(key, "rwpath") == 0) {
if (!want_no_value(&value, "rwpath"))
return -EINVAL;
params->rwpath = 1;
- continue;
- }
- else if (strcmp(key, "vol") == 0) {
+ } else if (strcmp(key, "vol") == 0) {
if (!want_arg(&value, "vol"))
return -EINVAL;
*devname = value;
- continue;
- }
- else if (strcmp(key, "cell") == 0) {
+ } else if (strcmp(key, "cell") == 0) {
if (!want_arg(&value, "cell"))
return -EINVAL;
- afs_put_cell(params->default_cell);
- ret = afs_cell_lookup(value,
- strlen(value),
- &params->default_cell);
- if (ret < 0)
- return -EINVAL;
- continue;
+ cell = afs_cell_lookup(value, strlen(value));
+ if (IS_ERR(cell))
+ return PTR_ERR(cell);
+ afs_put_cell(params->cell);
+ params->cell = cell;
+ } else {
+ printk("kAFS: Unknown mount option: '%s'\n", key);
+ ret = -EINVAL;
+ goto error;
}
-
- printk("kAFS: Unknown mount option: '%s'\n", key);
- ret = -EINVAL;
- goto error;
}
ret = 0;
-
- error:
+error:
_leave(" = %d", ret);
return ret;
-} /* end afs_super_parse_options() */
+}
+
+/*
+ * parse a device name to get cell name, volume name, volume type and R/W
+ * selector
+ * - this can be one of the following:
+ * "%[cell:]volume[.]" R/W volume
+ * "#[cell:]volume[.]" R/O or R/W volume (rwpath=0),
+ * or R/W (rwpath=1) volume
+ * "%[cell:]volume.readonly" R/O volume
+ * "#[cell:]volume.readonly" R/O volume
+ * "%[cell:]volume.backup" Backup volume
+ * "#[cell:]volume.backup" Backup volume
+ */
+static int afs_parse_device_name(struct afs_mount_params *params,
+ const char *name)
+{
+ struct afs_cell *cell;
+ const char *cellname, *suffix;
+ int cellnamesz;
+
+ _enter(",%s", name);
+
+ if (!name) {
+ printk(KERN_ERR "kAFS: no volume name specified\n");
+ return -EINVAL;
+ }
+
+ if ((name[0] != '%' && name[0] != '#') || !name[1]) {
+ printk(KERN_ERR "kAFS: unparsable volume name\n");
+ return -EINVAL;
+ }
+
+ /* determine the type of volume we're looking for */
+ params->type = AFSVL_ROVOL;
+ params->force = false;
+ if (params->rwpath || name[0] == '%') {
+ params->type = AFSVL_RWVOL;
+ params->force = true;
+ }
+ name++;
+
+ /* split the cell name out if there is one */
+ params->volname = strchr(name, ':');
+ if (params->volname) {
+ cellname = name;
+ cellnamesz = params->volname - name;
+ params->volname++;
+ } else {
+ params->volname = name;
+ cellname = NULL;
+ cellnamesz = 0;
+ }
+
+ /* the volume type is further affected by a possible suffix */
+ suffix = strrchr(params->volname, '.');
+ if (suffix) {
+ if (strcmp(suffix, ".readonly") == 0) {
+ params->type = AFSVL_ROVOL;
+ params->force = true;
+ } else if (strcmp(suffix, ".backup") == 0) {
+ params->type = AFSVL_BACKVOL;
+ params->force = true;
+ } else if (suffix[1] == 0) {
+ } else {
+ suffix = NULL;
+ }
+ }
+
+ params->volnamesz = suffix ?
+ suffix - params->volname : strlen(params->volname);
+
+ _debug("cell %*.*s [%p]",
+ cellnamesz, cellnamesz, cellname ?: "", params->cell);
+
+ /* lookup the cell record */
+ if (cellname || !params->cell) {
+ cell = afs_cell_lookup(cellname, cellnamesz);
+ if (IS_ERR(cell)) {
+ printk(KERN_ERR "kAFS: unable to lookup cell '%s'\n",
+ cellname ?: "");
+ return PTR_ERR(cell);
+ }
+ afs_put_cell(params->cell);
+ params->cell = cell;
+ }
+
+ _debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
+ params->cell->name, params->cell,
+ params->volnamesz, params->volnamesz, params->volname,
+ suffix ?: "-", params->type, params->force ? " FORCE" : "");
+
+ return 0;
+}
-/*****************************************************************************/
/*
* check a superblock to see if it's the one we're looking for
*/
@@ -224,13 +294,12 @@ static int afs_test_super(struct super_block *sb, void *data)
struct afs_super_info *as = sb->s_fs_info;
return as->volume == params->volume;
-} /* end afs_test_super() */
+}
-/*****************************************************************************/
/*
* fill in the superblock
*/
-static int afs_fill_super(struct super_block *sb, void *data, int silent)
+static int afs_fill_super(struct super_block *sb, void *data)
{
struct afs_mount_params *params = data;
struct afs_super_info *as = NULL;
@@ -239,7 +308,7 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
struct inode *inode = NULL;
int ret;
- kenter("");
+ _enter("");
/* allocate a superblock info record */
as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
@@ -262,9 +331,9 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
fid.vid = as->volume->vid;
fid.vnode = 1;
fid.unique = 1;
- ret = afs_iget(sb, &fid, &inode);
- if (ret < 0)
- goto error;
+ inode = afs_iget(sb, params->key, &fid, NULL, NULL);
+ if (IS_ERR(inode))
+ goto error_inode;
ret = -ENOMEM;
root = d_alloc_root(inode);
@@ -273,21 +342,23 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_root = root;
- kleave(" = 0");
+ _leave(" = 0");
return 0;
- error:
+error_inode:
+ ret = PTR_ERR(inode);
+ inode = NULL;
+error:
iput(inode);
afs_put_volume(as->volume);
kfree(as);
sb->s_fs_info = NULL;
- kleave(" = %d", ret);
+ _leave(" = %d", ret);
return ret;
-} /* end afs_fill_super() */
+}
-/*****************************************************************************/
/*
* get an AFS superblock
* - TODO: don't use get_sb_nodev(), but rather call sget() directly
@@ -300,69 +371,80 @@ static int afs_get_sb(struct file_system_type *fs_type,
{
struct afs_mount_params params;
struct super_block *sb;
+ struct afs_volume *vol;
+ struct key *key;
int ret;
_enter(",,%s,%p", dev_name, options);
memset(&params, 0, sizeof(params));
- /* start the cache manager */
- ret = afscm_start();
- if (ret < 0) {
- _leave(" = %d", ret);
- return ret;
- }
-
- /* parse the options */
+ /* parse the options and device name */
if (options) {
- ret = afs_super_parse_options(&params, options, &dev_name);
+ ret = afs_parse_options(&params, options, &dev_name);
if (ret < 0)
goto error;
- if (!dev_name) {
- printk("kAFS: no volume name specified\n");
- ret = -EINVAL;
- goto error;
- }
}
- /* parse the device name */
- ret = afs_volume_lookup(dev_name,
- params.default_cell,
- params.rwpath,
- &params.volume);
+
+ ret = afs_parse_device_name(&params, dev_name);
if (ret < 0)
goto error;
- /* allocate a deviceless superblock */
- sb = sget(fs_type, afs_test_super, set_anon_super, &params);
- if (IS_ERR(sb))
+ /* try and do the mount securely */
+ key = afs_request_key(params.cell);
+ if (IS_ERR(key)) {
+ _leave(" = %ld [key]", PTR_ERR(key));
+ ret = PTR_ERR(key);
goto error;
+ }
+ params.key = key;
- sb->s_flags = flags;
+ /* parse the device name */
+ vol = afs_volume_lookup(&params);
+ if (IS_ERR(vol)) {
+ ret = PTR_ERR(vol);
+ goto error;
+ }
+ params.volume = vol;
- ret = afs_fill_super(sb, &params, flags & MS_SILENT ? 1 : 0);
- if (ret < 0) {
- up_write(&sb->s_umount);
- deactivate_super(sb);
+ /* allocate a deviceless superblock */
+ sb = sget(fs_type, afs_test_super, set_anon_super, &params);
+ if (IS_ERR(sb)) {
+ ret = PTR_ERR(sb);
goto error;
}
- sb->s_flags |= MS_ACTIVE;
- simple_set_mnt(mnt, sb);
+ if (!sb->s_root) {
+ /* initial superblock/root creation */
+ _debug("create");
+ sb->s_flags = flags;
+ ret = afs_fill_super(sb, &params);
+ if (ret < 0) {
+ up_write(&sb->s_umount);
+ deactivate_super(sb);
+ goto error;
+ }
+ sb->s_flags |= MS_ACTIVE;
+ } else {
+ _debug("reuse");
+ ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
+ }
+
+ simple_set_mnt(mnt, sb);
afs_put_volume(params.volume);
- afs_put_cell(params.default_cell);
- _leave(" = 0 [%p]", 0, sb);
+ afs_put_cell(params.cell);
+ _leave(" = 0 [%p]", sb);
return 0;
- error:
+error:
afs_put_volume(params.volume);
- afs_put_cell(params.default_cell);
- afscm_stop();
+ afs_put_cell(params.cell);
+ key_put(params.key);
_leave(" = %d", ret);
return ret;
-} /* end afs_get_sb() */
+}
-/*****************************************************************************/
/*
* finish the unmounting process on the superblock
*/
@@ -373,35 +455,30 @@ static void afs_put_super(struct super_block *sb)
_enter("");
afs_put_volume(as->volume);
- afscm_stop();
_leave("");
-} /* end afs_put_super() */
+}
-/*****************************************************************************/
/*
* initialise an inode cache slab element prior to any use
*/
static void afs_i_init_once(void *_vnode, struct kmem_cache *cachep,
unsigned long flags)
{
- struct afs_vnode *vnode = (struct afs_vnode *) _vnode;
+ struct afs_vnode *vnode = _vnode;
if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
SLAB_CTOR_CONSTRUCTOR) {
memset(vnode, 0, sizeof(*vnode));
inode_init_once(&vnode->vfs_inode);
init_waitqueue_head(&vnode->update_waitq);
+ mutex_init(&vnode->permits_lock);
+ mutex_init(&vnode->validate_lock);
spin_lock_init(&vnode->lock);
- INIT_LIST_HEAD(&vnode->cb_link);
- INIT_LIST_HEAD(&vnode->cb_hash_link);
- afs_timer_init(&vnode->cb_timeout,
- &afs_vnode_cb_timed_out_ops);
+ INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
}
+}
-} /* end afs_i_init_once() */
-
-/*****************************************************************************/
/*
* allocate an AFS inode struct from our slab cache
*/
@@ -409,8 +486,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
{
struct afs_vnode *vnode;
- vnode = (struct afs_vnode *)
- kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
+ vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
if (!vnode)
return NULL;
@@ -421,21 +497,25 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
vnode->volume = NULL;
vnode->update_cnt = 0;
- vnode->flags = 0;
+ vnode->flags = 1 << AFS_VNODE_UNSET;
+ vnode->cb_promised = false;
return &vnode->vfs_inode;
-} /* end afs_alloc_inode() */
+}
-/*****************************************************************************/
/*
* destroy an AFS inode struct
*/
static void afs_destroy_inode(struct inode *inode)
{
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+
_enter("{%lu}", inode->i_ino);
- kmem_cache_free(afs_inode_cachep, AFS_FS_I(inode));
+ _debug("DESTROY INODE %p", inode);
- atomic_dec(&afs_count_active_inodes);
+ ASSERTCMP(vnode->server, ==, NULL);
-} /* end afs_destroy_inode() */
+ kmem_cache_free(afs_inode_cachep, vnode);
+ atomic_dec(&afs_count_active_inodes);
+}
diff --git a/fs/afs/super.h b/fs/afs/super.h
deleted file mode 100644
index 32de8cc6fae8..000000000000
--- a/fs/afs/super.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* super.h: AFS filesystem internal private data
- *
- * Copyright (c) 2002 Red Hat, Inc. All rights reserved.
- *
- * This software may be freely redistributed under the terms of the
- * GNU General Public License.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Authors: David Woodhouse <dwmw2@cambridge.redhat.com>
- * David Howells <dhowells@redhat.com>
- *
- */
-
-#ifndef _LINUX_AFS_SUPER_H
-#define _LINUX_AFS_SUPER_H
-
-#include <linux/fs.h>
-#include "server.h"
-
-#ifdef __KERNEL__
-
-/*****************************************************************************/
-/*
- * AFS superblock private data
- * - there's one superblock per volume
- */
-struct afs_super_info
-{
- struct afs_volume *volume; /* volume record */
- char rwparent; /* T if parent is R/W AFS volume */
-};
-
-static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
-{
- return sb->s_fs_info;
-}
-
-extern struct file_system_type afs_fs_type;
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_AFS_SUPER_H */
diff --git a/fs/afs/transport.h b/fs/afs/transport.h
deleted file mode 100644
index 7013ae6ccc8c..000000000000
--- a/fs/afs/transport.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* transport.h: AFS transport management
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_TRANSPORT_H
-#define _LINUX_AFS_TRANSPORT_H
-
-#include "types.h"
-#include <rxrpc/transport.h>
-
-/* the cache manager transport endpoint */
-extern struct rxrpc_transport *afs_transport;
-
-#endif /* _LINUX_AFS_TRANSPORT_H */
diff --git a/fs/afs/types.h b/fs/afs/types.h
deleted file mode 100644
index b1a2367c7587..000000000000
--- a/fs/afs/types.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* types.h: AFS types
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_TYPES_H
-#define _LINUX_AFS_TYPES_H
-
-#ifdef __KERNEL__
-#include <rxrpc/types.h>
-#endif /* __KERNEL__ */
-
-typedef unsigned afs_volid_t;
-typedef unsigned afs_vnodeid_t;
-typedef unsigned long long afs_dataversion_t;
-
-typedef enum {
- AFSVL_RWVOL, /* read/write volume */
- AFSVL_ROVOL, /* read-only volume */
- AFSVL_BACKVOL, /* backup volume */
-} __attribute__((packed)) afs_voltype_t;
-
-typedef enum {
- AFS_FTYPE_INVALID = 0,
- AFS_FTYPE_FILE = 1,
- AFS_FTYPE_DIR = 2,
- AFS_FTYPE_SYMLINK = 3,
-} afs_file_type_t;
-
-#ifdef __KERNEL__
-
-struct afs_cell;
-struct afs_vnode;
-
-/*****************************************************************************/
-/*
- * AFS file identifier
- */
-struct afs_fid
-{
- afs_volid_t vid; /* volume ID */
- afs_vnodeid_t vnode; /* file index within volume */
- unsigned unique; /* unique ID number (file index version) */
-};
-
-/*****************************************************************************/
-/*
- * AFS callback notification
- */
-typedef enum {
- AFSCM_CB_UNTYPED = 0, /* no type set on CB break */
- AFSCM_CB_EXCLUSIVE = 1, /* CB exclusive to CM [not implemented] */
- AFSCM_CB_SHARED = 2, /* CB shared by other CM's */
- AFSCM_CB_DROPPED = 3, /* CB promise cancelled by file server */
-} afs_callback_type_t;
-
-struct afs_callback
-{
- struct afs_server *server; /* server that made the promise */
- struct afs_fid fid; /* file identifier */
- unsigned version; /* callback version */
- unsigned expiry; /* time at which expires */
- afs_callback_type_t type; /* type of callback */
-};
-
-#define AFSCBMAX 50
-
-/*****************************************************************************/
-/*
- * AFS volume information
- */
-struct afs_volume_info
-{
- afs_volid_t vid; /* volume ID */
- afs_voltype_t type; /* type of this volume */
- afs_volid_t type_vids[5]; /* volume ID's for possible types for this vol */
-
- /* list of fileservers serving this volume */
- size_t nservers; /* number of entries used in servers[] */
- struct {
- struct in_addr addr; /* fileserver address */
- } servers[8];
-};
-
-/*****************************************************************************/
-/*
- * AFS file status information
- */
-struct afs_file_status
-{
- unsigned if_version; /* interface version */
-#define AFS_FSTATUS_VERSION 1
-
- afs_file_type_t type; /* file type */
- unsigned nlink; /* link count */
- size_t size; /* file size */
- afs_dataversion_t version; /* current data version */
- unsigned author; /* author ID */
- unsigned owner; /* owner ID */
- unsigned caller_access; /* access rights for authenticated caller */
- unsigned anon_access; /* access rights for unauthenticated caller */
- umode_t mode; /* UNIX mode */
- struct afs_fid parent; /* parent file ID */
- time_t mtime_client; /* last time client changed data */
- time_t mtime_server; /* last time server changed data */
-};
-
-/*****************************************************************************/
-/*
- * AFS volume synchronisation information
- */
-struct afs_volsync
-{
- time_t creation; /* volume creation time */
-};
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_AFS_TYPES_H */
diff --git a/fs/afs/use-rtnetlink.c b/fs/afs/use-rtnetlink.c
new file mode 100644
index 000000000000..82f0daa28970
--- /dev/null
+++ b/fs/afs/use-rtnetlink.c
@@ -0,0 +1,473 @@
+/* RTNETLINK client
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_addr.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <net/netlink.h>
+#include "internal.h"
+
+struct afs_rtm_desc {
+ struct socket *nlsock;
+ struct afs_interface *bufs;
+ u8 *mac;
+ size_t nbufs;
+ size_t maxbufs;
+ void *data;
+ ssize_t datalen;
+ size_t datamax;
+ int msg_seq;
+ unsigned mac_index;
+ bool wantloopback;
+ int (*parse)(struct afs_rtm_desc *, struct nlmsghdr *);
+};
+
+/*
+ * parse an RTM_GETADDR response
+ */
+static int afs_rtm_getaddr_parse(struct afs_rtm_desc *desc,
+ struct nlmsghdr *nlhdr)
+{
+ struct afs_interface *this;
+ struct ifaddrmsg *ifa;
+ struct rtattr *rtattr;
+ const char *name;
+ size_t len;
+
+ ifa = (struct ifaddrmsg *) NLMSG_DATA(nlhdr);
+
+ _enter("{ix=%d,af=%d}", ifa->ifa_index, ifa->ifa_family);
+
+ if (ifa->ifa_family != AF_INET) {
+ _leave(" = 0 [family %d]", ifa->ifa_family);
+ return 0;
+ }
+ if (desc->nbufs >= desc->maxbufs) {
+ _leave(" = 0 [max %zu/%zu]", desc->nbufs, desc->maxbufs);
+ return 0;
+ }
+
+ this = &desc->bufs[desc->nbufs];
+
+ this->index = ifa->ifa_index;
+ this->netmask.s_addr = inet_make_mask(ifa->ifa_prefixlen);
+ this->mtu = 0;
+
+ rtattr = NLMSG_DATA(nlhdr) + NLMSG_ALIGN(sizeof(struct ifaddrmsg));
+ len = NLMSG_PAYLOAD(nlhdr, sizeof(struct ifaddrmsg));
+
+ name = "unknown";
+ for (; RTA_OK(rtattr, len); rtattr = RTA_NEXT(rtattr, len)) {
+ switch (rtattr->rta_type) {
+ case IFA_ADDRESS:
+ memcpy(&this->address, RTA_DATA(rtattr), 4);
+ break;
+ case IFA_LABEL:
+ name = RTA_DATA(rtattr);
+ break;
+ }
+ }
+
+ _debug("%s: "NIPQUAD_FMT"/"NIPQUAD_FMT,
+ name, NIPQUAD(this->address), NIPQUAD(this->netmask));
+
+ desc->nbufs++;
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * parse an RTM_GETLINK response for MTUs
+ */
+static int afs_rtm_getlink_if_parse(struct afs_rtm_desc *desc,
+ struct nlmsghdr *nlhdr)
+{
+ struct afs_interface *this;
+ struct ifinfomsg *ifi;
+ struct rtattr *rtattr;
+ const char *name;
+ size_t len, loop;
+
+ ifi = (struct ifinfomsg *) NLMSG_DATA(nlhdr);
+
+ _enter("{ix=%d}", ifi->ifi_index);
+
+ for (loop = 0; loop < desc->nbufs; loop++) {
+ this = &desc->bufs[loop];
+ if (this->index == ifi->ifi_index)
+ goto found;
+ }
+
+ _leave(" = 0 [no match]");
+ return 0;
+
+found:
+ if (ifi->ifi_type == ARPHRD_LOOPBACK && !desc->wantloopback) {
+ _leave(" = 0 [loopback]");
+ return 0;
+ }
+
+ rtattr = NLMSG_DATA(nlhdr) + NLMSG_ALIGN(sizeof(struct ifinfomsg));
+ len = NLMSG_PAYLOAD(nlhdr, sizeof(struct ifinfomsg));
+
+ name = "unknown";
+ for (; RTA_OK(rtattr, len); rtattr = RTA_NEXT(rtattr, len)) {
+ switch (rtattr->rta_type) {
+ case IFLA_MTU:
+ memcpy(&this->mtu, RTA_DATA(rtattr), 4);
+ break;
+ case IFLA_IFNAME:
+ name = RTA_DATA(rtattr);
+ break;
+ }
+ }
+
+ _debug("%s: "NIPQUAD_FMT"/"NIPQUAD_FMT" mtu %u",
+ name, NIPQUAD(this->address), NIPQUAD(this->netmask),
+ this->mtu);
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * parse an RTM_GETLINK response for the MAC address belonging to the lowest
+ * non-internal interface
+ */
+static int afs_rtm_getlink_mac_parse(struct afs_rtm_desc *desc,
+ struct nlmsghdr *nlhdr)
+{
+ struct ifinfomsg *ifi;
+ struct rtattr *rtattr;
+ const char *name;
+ size_t remain, len;
+ bool set;
+
+ ifi = (struct ifinfomsg *) NLMSG_DATA(nlhdr);
+
+ _enter("{ix=%d}", ifi->ifi_index);
+
+ if (ifi->ifi_index >= desc->mac_index) {
+ _leave(" = 0 [high]");
+ return 0;
+ }
+ if (ifi->ifi_type == ARPHRD_LOOPBACK) {
+ _leave(" = 0 [loopback]");
+ return 0;
+ }
+
+ rtattr = NLMSG_DATA(nlhdr) + NLMSG_ALIGN(sizeof(struct ifinfomsg));
+ remain = NLMSG_PAYLOAD(nlhdr, sizeof(struct ifinfomsg));
+
+ name = "unknown";
+ set = false;
+ for (; RTA_OK(rtattr, remain); rtattr = RTA_NEXT(rtattr, remain)) {
+ switch (rtattr->rta_type) {
+ case IFLA_ADDRESS:
+ len = RTA_PAYLOAD(rtattr);
+ memcpy(desc->mac, RTA_DATA(rtattr),
+ min_t(size_t, len, 6));
+ desc->mac_index = ifi->ifi_index;
+ set = true;
+ break;
+ case IFLA_IFNAME:
+ name = RTA_DATA(rtattr);
+ break;
+ }
+ }
+
+ if (set)
+ _debug("%s: %02x:%02x:%02x:%02x:%02x:%02x",
+ name,
+ desc->mac[0], desc->mac[1], desc->mac[2],
+ desc->mac[3], desc->mac[4], desc->mac[5]);
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * read the rtnetlink response and pass to parsing routine
+ */
+static int afs_read_rtm(struct afs_rtm_desc *desc)
+{
+ struct nlmsghdr *nlhdr, tmphdr;
+ struct msghdr msg;
+ struct kvec iov[1];
+ void *data;
+ bool last = false;
+ int len, ret, remain;
+
+ _enter("");
+
+ do {
+ /* first of all peek to see how big the packet is */
+ memset(&msg, 0, sizeof(msg));
+ iov[0].iov_base = &tmphdr;
+ iov[0].iov_len = sizeof(tmphdr);
+ len = kernel_recvmsg(desc->nlsock, &msg, iov, 1,
+ sizeof(tmphdr), MSG_PEEK | MSG_TRUNC);
+ if (len < 0) {
+ _leave(" = %d [peek]", len);
+ return len;
+ }
+ if (len == 0)
+ continue;
+ if (len < sizeof(tmphdr) || len < NLMSG_PAYLOAD(&tmphdr, 0)) {
+ _leave(" = -EMSGSIZE");
+ return -EMSGSIZE;
+ }
+
+ if (desc->datamax < len) {
+ kfree(desc->data);
+ desc->data = NULL;
+ data = kmalloc(len, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ desc->data = data;
+ }
+ desc->datamax = len;
+
+ /* read all the data from this packet */
+ iov[0].iov_base = desc->data;
+ iov[0].iov_len = desc->datamax;
+ desc->datalen = kernel_recvmsg(desc->nlsock, &msg, iov, 1,
+ desc->datamax, 0);
+ if (desc->datalen < 0) {
+ _leave(" = %ld [recv]", desc->datalen);
+ return desc->datalen;
+ }
+
+ nlhdr = desc->data;
+
+ /* check if the header is valid */
+ if (!NLMSG_OK(nlhdr, desc->datalen) ||
+ nlhdr->nlmsg_type == NLMSG_ERROR) {
+ _leave(" = -EIO");
+ return -EIO;
+ }
+
+ /* see if this is the last message */
+ if (nlhdr->nlmsg_type == NLMSG_DONE ||
+ !(nlhdr->nlmsg_flags & NLM_F_MULTI))
+ last = true;
+
+ /* parse the bits we got this time */
+ nlmsg_for_each_msg(nlhdr, desc->data, desc->datalen, remain) {
+ ret = desc->parse(desc, nlhdr);
+ if (ret < 0) {
+ _leave(" = %d [parse]", ret);
+ return ret;
+ }
+ }
+
+ } while (!last);
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * list the interface bound addresses to get the address and netmask
+ */
+static int afs_rtm_getaddr(struct afs_rtm_desc *desc)
+{
+ struct msghdr msg;
+ struct kvec iov[1];
+ int ret;
+
+ struct {
+ struct nlmsghdr nl_msg __attribute__((aligned(NLMSG_ALIGNTO)));
+ struct ifaddrmsg addr_msg __attribute__((aligned(NLMSG_ALIGNTO)));
+ } request;
+
+ _enter("");
+
+ memset(&request, 0, sizeof(request));
+
+ request.nl_msg.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg));
+ request.nl_msg.nlmsg_type = RTM_GETADDR;
+ request.nl_msg.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ request.nl_msg.nlmsg_seq = desc->msg_seq++;
+ request.nl_msg.nlmsg_pid = 0;
+
+ memset(&msg, 0, sizeof(msg));
+ iov[0].iov_base = &request;
+ iov[0].iov_len = sizeof(request);
+
+ ret = kernel_sendmsg(desc->nlsock, &msg, iov, 1, iov[0].iov_len);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * list the interface link statuses to get the MTUs
+ */
+static int afs_rtm_getlink(struct afs_rtm_desc *desc)
+{
+ struct msghdr msg;
+ struct kvec iov[1];
+ int ret;
+
+ struct {
+ struct nlmsghdr nl_msg __attribute__((aligned(NLMSG_ALIGNTO)));
+ struct ifinfomsg link_msg __attribute__((aligned(NLMSG_ALIGNTO)));
+ } request;
+
+ _enter("");
+
+ memset(&request, 0, sizeof(request));
+
+ request.nl_msg.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+ request.nl_msg.nlmsg_type = RTM_GETLINK;
+ request.nl_msg.nlmsg_flags = NLM_F_REQUEST | NLM_F_ROOT;
+ request.nl_msg.nlmsg_seq = desc->msg_seq++;
+ request.nl_msg.nlmsg_pid = 0;
+
+ memset(&msg, 0, sizeof(msg));
+ iov[0].iov_base = &request;
+ iov[0].iov_len = sizeof(request);
+
+ ret = kernel_sendmsg(desc->nlsock, &msg, iov, 1, iov[0].iov_len);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * cull any interface records for which there isn't an MTU value
+ */
+static void afs_cull_interfaces(struct afs_rtm_desc *desc)
+{
+ struct afs_interface *bufs = desc->bufs;
+ size_t nbufs = desc->nbufs;
+ int loop, point = 0;
+
+ _enter("{%zu}", nbufs);
+
+ for (loop = 0; loop < nbufs; loop++) {
+ if (desc->bufs[loop].mtu != 0) {
+ if (loop != point) {
+ ASSERTCMP(loop, >, point);
+ bufs[point] = bufs[loop];
+ }
+ point++;
+ }
+ }
+
+ desc->nbufs = point;
+ _leave(" [%zu/%zu]", desc->nbufs, nbufs);
+}
+
+/*
+ * get a list of this system's interface IPv4 addresses, netmasks and MTUs
+ * - returns the number of interface records in the buffer
+ */
+int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs,
+ bool wantloopback)
+{
+ struct afs_rtm_desc desc;
+ int ret, loop;
+
+ _enter("");
+
+ memset(&desc, 0, sizeof(desc));
+ desc.bufs = bufs;
+ desc.maxbufs = maxbufs;
+ desc.wantloopback = wantloopback;
+
+ ret = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE,
+ &desc.nlsock);
+ if (ret < 0) {
+ _leave(" = %d [sock]", ret);
+ return ret;
+ }
+
+ /* issue RTM_GETADDR */
+ desc.parse = afs_rtm_getaddr_parse;
+ ret = afs_rtm_getaddr(&desc);
+ if (ret < 0)
+ goto error;
+ ret = afs_read_rtm(&desc);
+ if (ret < 0)
+ goto error;
+
+ /* issue RTM_GETLINK */
+ desc.parse = afs_rtm_getlink_if_parse;
+ ret = afs_rtm_getlink(&desc);
+ if (ret < 0)
+ goto error;
+ ret = afs_read_rtm(&desc);
+ if (ret < 0)
+ goto error;
+
+ afs_cull_interfaces(&desc);
+ ret = desc.nbufs;
+
+ for (loop = 0; loop < ret; loop++)
+ _debug("[%d] "NIPQUAD_FMT"/"NIPQUAD_FMT" mtu %u",
+ bufs[loop].index,
+ NIPQUAD(bufs[loop].address),
+ NIPQUAD(bufs[loop].netmask),
+ bufs[loop].mtu);
+
+error:
+ kfree(desc.data);
+ sock_release(desc.nlsock);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * get a MAC address from a random ethernet interface that has a real one
+ * - the buffer should be 6 bytes in size
+ */
+int afs_get_MAC_address(u8 mac[6])
+{
+ struct afs_rtm_desc desc;
+ int ret;
+
+ _enter("");
+
+ memset(&desc, 0, sizeof(desc));
+ desc.mac = mac;
+ desc.mac_index = UINT_MAX;
+
+ ret = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE,
+ &desc.nlsock);
+ if (ret < 0) {
+ _leave(" = %d [sock]", ret);
+ return ret;
+ }
+
+ /* issue RTM_GETLINK */
+ desc.parse = afs_rtm_getlink_mac_parse;
+ ret = afs_rtm_getlink(&desc);
+ if (ret < 0)
+ goto error;
+ ret = afs_read_rtm(&desc);
+ if (ret < 0)
+ goto error;
+
+ if (desc.mac_index < UINT_MAX) {
+ /* got a MAC address */
+ _debug("[%d] %02x:%02x:%02x:%02x:%02x:%02x",
+ desc.mac_index,
+ mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+ } else {
+ ret = -ENONET;
+ }
+
+error:
+ sock_release(desc.nlsock);
+ _leave(" = %d", ret);
+ return ret;
+}
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 7b0e3192ee39..36c1306e09e0 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -1,4 +1,4 @@
-/* vlclient.c: AFS Volume Location Service client
+/* AFS Volume Location Service client
*
* Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -11,247 +11,76 @@
#include <linux/init.h>
#include <linux/sched.h>
-#include <rxrpc/rxrpc.h>
-#include <rxrpc/transport.h>
-#include <rxrpc/connection.h>
-#include <rxrpc/call.h>
-#include "server.h"
-#include "volume.h"
-#include "vlclient.h"
-#include "kafsasyncd.h"
-#include "kafstimod.h"
-#include "errors.h"
#include "internal.h"
-#define VLGETENTRYBYID 503 /* AFS Get Cache Entry By ID operation ID */
-#define VLGETENTRYBYNAME 504 /* AFS Get Cache Entry By Name operation ID */
-#define VLPROBE 514 /* AFS Probe Volume Location Service operation ID */
-
-static void afs_rxvl_get_entry_by_id_attn(struct rxrpc_call *call);
-static void afs_rxvl_get_entry_by_id_error(struct rxrpc_call *call);
-
-/*****************************************************************************/
/*
- * map afs VL abort codes to/from Linux error codes
- * - called with call->lock held
+ * map volume locator abort codes to error codes
*/
-static void afs_rxvl_aemap(struct rxrpc_call *call)
+static int afs_vl_abort_to_error(u32 abort_code)
{
- int err;
-
- _enter("{%u,%u,%d}",
- call->app_err_state, call->app_abort_code, call->app_errno);
-
- switch (call->app_err_state) {
- case RXRPC_ESTATE_LOCAL_ABORT:
- call->app_abort_code = -call->app_errno;
- return;
-
- case RXRPC_ESTATE_PEER_ABORT:
- switch (call->app_abort_code) {
- case AFSVL_IDEXIST: err = -EEXIST; break;
- case AFSVL_IO: err = -EREMOTEIO; break;
- case AFSVL_NAMEEXIST: err = -EEXIST; break;
- case AFSVL_CREATEFAIL: err = -EREMOTEIO; break;
- case AFSVL_NOENT: err = -ENOMEDIUM; break;
- case AFSVL_EMPTY: err = -ENOMEDIUM; break;
- case AFSVL_ENTDELETED: err = -ENOMEDIUM; break;
- case AFSVL_BADNAME: err = -EINVAL; break;
- case AFSVL_BADINDEX: err = -EINVAL; break;
- case AFSVL_BADVOLTYPE: err = -EINVAL; break;
- case AFSVL_BADSERVER: err = -EINVAL; break;
- case AFSVL_BADPARTITION: err = -EINVAL; break;
- case AFSVL_REPSFULL: err = -EFBIG; break;
- case AFSVL_NOREPSERVER: err = -ENOENT; break;
- case AFSVL_DUPREPSERVER: err = -EEXIST; break;
- case AFSVL_RWNOTFOUND: err = -ENOENT; break;
- case AFSVL_BADREFCOUNT: err = -EINVAL; break;
- case AFSVL_SIZEEXCEEDED: err = -EINVAL; break;
- case AFSVL_BADENTRY: err = -EINVAL; break;
- case AFSVL_BADVOLIDBUMP: err = -EINVAL; break;
- case AFSVL_IDALREADYHASHED: err = -EINVAL; break;
- case AFSVL_ENTRYLOCKED: err = -EBUSY; break;
- case AFSVL_BADVOLOPER: err = -EBADRQC; break;
- case AFSVL_BADRELLOCKTYPE: err = -EINVAL; break;
- case AFSVL_RERELEASE: err = -EREMOTEIO; break;
- case AFSVL_BADSERVERFLAG: err = -EINVAL; break;
- case AFSVL_PERM: err = -EACCES; break;
- case AFSVL_NOMEM: err = -EREMOTEIO; break;
- default:
- err = afs_abort_to_error(call->app_abort_code);
- break;
- }
- call->app_errno = err;
- return;
-
+ _enter("%u", abort_code);
+
+ switch (abort_code) {
+ case AFSVL_IDEXIST: return -EEXIST;
+ case AFSVL_IO: return -EREMOTEIO;
+ case AFSVL_NAMEEXIST: return -EEXIST;
+ case AFSVL_CREATEFAIL: return -EREMOTEIO;
+ case AFSVL_NOENT: return -ENOMEDIUM;
+ case AFSVL_EMPTY: return -ENOMEDIUM;
+ case AFSVL_ENTDELETED: return -ENOMEDIUM;
+ case AFSVL_BADNAME: return -EINVAL;
+ case AFSVL_BADINDEX: return -EINVAL;
+ case AFSVL_BADVOLTYPE: return -EINVAL;
+ case AFSVL_BADSERVER: return -EINVAL;
+ case AFSVL_BADPARTITION: return -EINVAL;
+ case AFSVL_REPSFULL: return -EFBIG;
+ case AFSVL_NOREPSERVER: return -ENOENT;
+ case AFSVL_DUPREPSERVER: return -EEXIST;
+ case AFSVL_RWNOTFOUND: return -ENOENT;
+ case AFSVL_BADREFCOUNT: return -EINVAL;
+ case AFSVL_SIZEEXCEEDED: return -EINVAL;
+ case AFSVL_BADENTRY: return -EINVAL;
+ case AFSVL_BADVOLIDBUMP: return -EINVAL;
+ case AFSVL_IDALREADYHASHED: return -EINVAL;
+ case AFSVL_ENTRYLOCKED: return -EBUSY;
+ case AFSVL_BADVOLOPER: return -EBADRQC;
+ case AFSVL_BADRELLOCKTYPE: return -EINVAL;
+ case AFSVL_RERELEASE: return -EREMOTEIO;
+ case AFSVL_BADSERVERFLAG: return -EINVAL;
+ case AFSVL_PERM: return -EACCES;
+ case AFSVL_NOMEM: return -EREMOTEIO;
default:
- return;
+ return afs_abort_to_error(abort_code);
}
-} /* end afs_rxvl_aemap() */
+}
-#if 0
-/*****************************************************************************/
/*
- * probe a volume location server to see if it is still alive -- unused
+ * deliver reply data to a VL.GetEntryByXXX call
*/
-static int afs_rxvl_probe(struct afs_server *server, int alloc_flags)
+static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call,
+ struct sk_buff *skb, bool last)
{
- struct rxrpc_connection *conn;
- struct rxrpc_call *call;
- struct kvec piov[1];
- size_t sent;
- int ret;
- __be32 param[1];
-
- DECLARE_WAITQUEUE(myself, current);
-
- /* get hold of the vlserver connection */
- ret = afs_server_get_vlconn(server, &conn);
- if (ret < 0)
- goto out;
-
- /* create a call through that connection */
- ret = rxrpc_create_call(conn, NULL, NULL, afs_rxvl_aemap, &call);
- if (ret < 0) {
- printk("kAFS: Unable to create call: %d\n", ret);
- goto out_put_conn;
- }
- call->app_opcode = VLPROBE;
-
- /* we want to get event notifications from the call */
- add_wait_queue(&call->waitq, &myself);
-
- /* marshall the parameters */
- param[0] = htonl(VLPROBE);
- piov[0].iov_len = sizeof(param);
- piov[0].iov_base = param;
-
- /* send the parameters to the server */
- ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET,
- alloc_flags, 0, &sent);
- if (ret < 0)
- goto abort;
-
- /* wait for the reply to completely arrive */
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (call->app_call_state != RXRPC_CSTATE_CLNT_RCV_REPLY ||
- signal_pending(current))
- break;
- schedule();
- }
- set_current_state(TASK_RUNNING);
-
- ret = -EINTR;
- if (signal_pending(current))
- goto abort;
-
- switch (call->app_call_state) {
- case RXRPC_CSTATE_ERROR:
- ret = call->app_errno;
- goto out_unwait;
-
- case RXRPC_CSTATE_CLNT_GOT_REPLY:
- ret = 0;
- goto out_unwait;
-
- default:
- BUG();
- }
-
- abort:
- set_current_state(TASK_UNINTERRUPTIBLE);
- rxrpc_call_abort(call, ret);
- schedule();
-
- out_unwait:
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&call->waitq, &myself);
- rxrpc_put_call(call);
- out_put_conn:
- rxrpc_put_connection(conn);
- out:
- return ret;
+ struct afs_cache_vlocation *entry;
+ __be32 *bp;
+ u32 tmp;
+ int loop;
-} /* end afs_rxvl_probe() */
-#endif
+ _enter(",,%u", last);
-/*****************************************************************************/
-/*
- * look up a volume location database entry by name
- */
-int afs_rxvl_get_entry_by_name(struct afs_server *server,
- const char *volname,
- unsigned volnamesz,
- struct afs_cache_vlocation *entry)
-{
- DECLARE_WAITQUEUE(myself, current);
-
- struct rxrpc_connection *conn;
- struct rxrpc_call *call;
- struct kvec piov[3];
- unsigned tmp;
- size_t sent;
- int ret, loop;
- __be32 *bp, param[2], zero;
-
- _enter(",%*.*s,%u,", volnamesz, volnamesz, volname, volnamesz);
-
- memset(entry, 0, sizeof(*entry));
-
- /* get hold of the vlserver connection */
- ret = afs_server_get_vlconn(server, &conn);
- if (ret < 0)
- goto out;
-
- /* create a call through that connection */
- ret = rxrpc_create_call(conn, NULL, NULL, afs_rxvl_aemap, &call);
- if (ret < 0) {
- printk("kAFS: Unable to create call: %d\n", ret);
- goto out_put_conn;
- }
- call->app_opcode = VLGETENTRYBYNAME;
+ afs_transfer_reply(call, skb);
+ if (!last)
+ return 0;
- /* we want to get event notifications from the call */
- add_wait_queue(&call->waitq, &myself);
+ if (call->reply_size != call->reply_max)
+ return -EBADMSG;
- /* marshall the parameters */
- piov[1].iov_len = volnamesz;
- piov[1].iov_base = (char *) volname;
-
- zero = 0;
- piov[2].iov_len = (4 - (piov[1].iov_len & 3)) & 3;
- piov[2].iov_base = &zero;
-
- param[0] = htonl(VLGETENTRYBYNAME);
- param[1] = htonl(piov[1].iov_len);
-
- piov[0].iov_len = sizeof(param);
- piov[0].iov_base = param;
-
- /* send the parameters to the server */
- ret = rxrpc_call_write_data(call, 3, piov, RXRPC_LAST_PACKET, GFP_NOFS,
- 0, &sent);
- if (ret < 0)
- goto abort;
-
- /* wait for the reply to completely arrive */
- bp = rxrpc_call_alloc_scratch(call, 384);
-
- ret = rxrpc_call_read_data(call, bp, 384,
- RXRPC_CALL_READ_BLOCK |
- RXRPC_CALL_READ_ALL);
- if (ret < 0) {
- if (ret == -ECONNABORTED) {
- ret = call->app_errno;
- goto out_unwait;
- }
- goto abort;
- }
+ /* unmarshall the reply once we've received all of it */
+ entry = call->reply;
+ bp = call->buffer;
- /* unmarshall the reply */
for (loop = 0; loop < 64; loop++)
entry->name[loop] = ntohl(*bp++);
+ entry->name[loop] = 0;
bp++; /* final NUL */
bp++; /* type */
@@ -264,6 +93,7 @@ int afs_rxvl_get_entry_by_name(struct afs_server *server,
for (loop = 0; loop < 8; loop++) {
tmp = ntohl(*bp++);
+ entry->srvtmask[loop] = 0;
if (tmp & AFS_VLSF_RWVOL)
entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
if (tmp & AFS_VLSF_ROVOL)
@@ -279,417 +109,110 @@ int afs_rxvl_get_entry_by_name(struct afs_server *server,
bp++; /* clone ID */
tmp = ntohl(*bp++); /* flags */
+ entry->vidmask = 0;
if (tmp & AFS_VLF_RWEXISTS)
entry->vidmask |= AFS_VOL_VTM_RW;
if (tmp & AFS_VLF_ROEXISTS)
entry->vidmask |= AFS_VOL_VTM_RO;
if (tmp & AFS_VLF_BACKEXISTS)
entry->vidmask |= AFS_VOL_VTM_BAK;
-
- ret = -ENOMEDIUM;
if (!entry->vidmask)
- goto abort;
-
- /* success */
- entry->rtime = get_seconds();
- ret = 0;
-
- out_unwait:
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&call->waitq, &myself);
- rxrpc_put_call(call);
- out_put_conn:
- rxrpc_put_connection(conn);
- out:
- _leave(" = %d", ret);
- return ret;
-
- abort:
- set_current_state(TASK_UNINTERRUPTIBLE);
- rxrpc_call_abort(call, ret);
- schedule();
- goto out_unwait;
-} /* end afs_rxvl_get_entry_by_name() */
-
-/*****************************************************************************/
-/*
- * look up a volume location database entry by ID
- */
-int afs_rxvl_get_entry_by_id(struct afs_server *server,
- afs_volid_t volid,
- afs_voltype_t voltype,
- struct afs_cache_vlocation *entry)
-{
- DECLARE_WAITQUEUE(myself, current);
-
- struct rxrpc_connection *conn;
- struct rxrpc_call *call;
- struct kvec piov[1];
- unsigned tmp;
- size_t sent;
- int ret, loop;
- __be32 *bp, param[3];
-
- _enter(",%x,%d,", volid, voltype);
-
- memset(entry, 0, sizeof(*entry));
-
- /* get hold of the vlserver connection */
- ret = afs_server_get_vlconn(server, &conn);
- if (ret < 0)
- goto out;
-
- /* create a call through that connection */
- ret = rxrpc_create_call(conn, NULL, NULL, afs_rxvl_aemap, &call);
- if (ret < 0) {
- printk("kAFS: Unable to create call: %d\n", ret);
- goto out_put_conn;
- }
- call->app_opcode = VLGETENTRYBYID;
-
- /* we want to get event notifications from the call */
- add_wait_queue(&call->waitq, &myself);
-
- /* marshall the parameters */
- param[0] = htonl(VLGETENTRYBYID);
- param[1] = htonl(volid);
- param[2] = htonl(voltype);
-
- piov[0].iov_len = sizeof(param);
- piov[0].iov_base = param;
-
- /* send the parameters to the server */
- ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
- 0, &sent);
- if (ret < 0)
- goto abort;
-
- /* wait for the reply to completely arrive */
- bp = rxrpc_call_alloc_scratch(call, 384);
-
- ret = rxrpc_call_read_data(call, bp, 384,
- RXRPC_CALL_READ_BLOCK |
- RXRPC_CALL_READ_ALL);
- if (ret < 0) {
- if (ret == -ECONNABORTED) {
- ret = call->app_errno;
- goto out_unwait;
- }
- goto abort;
- }
-
- /* unmarshall the reply */
- for (loop = 0; loop < 64; loop++)
- entry->name[loop] = ntohl(*bp++);
- bp++; /* final NUL */
+ return -EBADMSG;
- bp++; /* type */
- entry->nservers = ntohl(*bp++);
-
- for (loop = 0; loop < 8; loop++)
- entry->servers[loop].s_addr = *bp++;
-
- bp += 8; /* partition IDs */
+ _leave(" = 0 [done]");
+ return 0;
+}
- for (loop = 0; loop < 8; loop++) {
- tmp = ntohl(*bp++);
- if (tmp & AFS_VLSF_RWVOL)
- entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
- if (tmp & AFS_VLSF_ROVOL)
- entry->srvtmask[loop] |= AFS_VOL_VTM_RO;
- if (tmp & AFS_VLSF_BACKVOL)
- entry->srvtmask[loop] |= AFS_VOL_VTM_BAK;
- }
-
- entry->vid[0] = ntohl(*bp++);
- entry->vid[1] = ntohl(*bp++);
- entry->vid[2] = ntohl(*bp++);
-
- bp++; /* clone ID */
-
- tmp = ntohl(*bp++); /* flags */
- if (tmp & AFS_VLF_RWEXISTS)
- entry->vidmask |= AFS_VOL_VTM_RW;
- if (tmp & AFS_VLF_ROEXISTS)
- entry->vidmask |= AFS_VOL_VTM_RO;
- if (tmp & AFS_VLF_BACKEXISTS)
- entry->vidmask |= AFS_VOL_VTM_BAK;
-
- ret = -ENOMEDIUM;
- if (!entry->vidmask)
- goto abort;
-
-#if 0 /* TODO: remove */
- entry->nservers = 3;
- entry->servers[0].s_addr = htonl(0xac101249);
- entry->servers[1].s_addr = htonl(0xac101243);
- entry->servers[2].s_addr = htonl(0xac10125b /*0xac10125b*/);
-
- entry->srvtmask[0] = AFS_VOL_VTM_RO;
- entry->srvtmask[1] = AFS_VOL_VTM_RO;
- entry->srvtmask[2] = AFS_VOL_VTM_RO | AFS_VOL_VTM_RW;
-#endif
-
- /* success */
- entry->rtime = get_seconds();
- ret = 0;
-
- out_unwait:
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&call->waitq, &myself);
- rxrpc_put_call(call);
- out_put_conn:
- rxrpc_put_connection(conn);
- out:
- _leave(" = %d", ret);
- return ret;
-
- abort:
- set_current_state(TASK_UNINTERRUPTIBLE);
- rxrpc_call_abort(call, ret);
- schedule();
- goto out_unwait;
-} /* end afs_rxvl_get_entry_by_id() */
-
-/*****************************************************************************/
/*
- * look up a volume location database entry by ID asynchronously
+ * VL.GetEntryByName operation type
*/
-int afs_rxvl_get_entry_by_id_async(struct afs_async_op *op,
- afs_volid_t volid,
- afs_voltype_t voltype)
-{
- struct rxrpc_connection *conn;
- struct rxrpc_call *call;
- struct kvec piov[1];
- size_t sent;
- int ret;
- __be32 param[3];
-
- _enter(",%x,%d,", volid, voltype);
-
- /* get hold of the vlserver connection */
- ret = afs_server_get_vlconn(op->server, &conn);
- if (ret < 0) {
- _leave(" = %d", ret);
- return ret;
- }
-
- /* create a call through that connection */
- ret = rxrpc_create_call(conn,
- afs_rxvl_get_entry_by_id_attn,
- afs_rxvl_get_entry_by_id_error,
- afs_rxvl_aemap,
- &op->call);
- rxrpc_put_connection(conn);
-
- if (ret < 0) {
- printk("kAFS: Unable to create call: %d\n", ret);
- _leave(" = %d", ret);
- return ret;
- }
+static const struct afs_call_type afs_RXVLGetEntryByName = {
+ .name = "VL.GetEntryByName",
+ .deliver = afs_deliver_vl_get_entry_by_xxx,
+ .abort_to_error = afs_vl_abort_to_error,
+ .destructor = afs_flat_call_destructor,
+};
- op->call->app_opcode = VLGETENTRYBYID;
- op->call->app_user = op;
-
- call = op->call;
- rxrpc_get_call(call);
-
- /* send event notifications from the call to kafsasyncd */
- afs_kafsasyncd_begin_op(op);
-
- /* marshall the parameters */
- param[0] = htonl(VLGETENTRYBYID);
- param[1] = htonl(volid);
- param[2] = htonl(voltype);
-
- piov[0].iov_len = sizeof(param);
- piov[0].iov_base = param;
-
- /* allocate result read buffer in scratch space */
- call->app_scr_ptr = rxrpc_call_alloc_scratch(op->call, 384);
-
- /* send the parameters to the server */
- ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
- 0, &sent);
- if (ret < 0) {
- rxrpc_call_abort(call, ret); /* handle from kafsasyncd */
- ret = 0;
- goto out;
- }
-
- /* wait for the reply to completely arrive */
- ret = rxrpc_call_read_data(call, call->app_scr_ptr, 384, 0);
- switch (ret) {
- case 0:
- case -EAGAIN:
- case -ECONNABORTED:
- ret = 0;
- break; /* all handled by kafsasyncd */
-
- default:
- rxrpc_call_abort(call, ret); /* make kafsasyncd handle it */
- ret = 0;
- break;
- }
-
- out:
- rxrpc_put_call(call);
- _leave(" = %d", ret);
- return ret;
-
-} /* end afs_rxvl_get_entry_by_id_async() */
+/*
+ * VL.GetEntryById operation type
+ */
+static const struct afs_call_type afs_RXVLGetEntryById = {
+ .name = "VL.GetEntryById",
+ .deliver = afs_deliver_vl_get_entry_by_xxx,
+ .abort_to_error = afs_vl_abort_to_error,
+ .destructor = afs_flat_call_destructor,
+};
-/*****************************************************************************/
/*
- * attend to the asynchronous get VLDB entry by ID
+ * dispatch a get volume entry by name operation
*/
-int afs_rxvl_get_entry_by_id_async2(struct afs_async_op *op,
- struct afs_cache_vlocation *entry)
+int afs_vl_get_entry_by_name(struct in_addr *addr,
+ struct key *key,
+ const char *volname,
+ struct afs_cache_vlocation *entry,
+ const struct afs_wait_mode *wait_mode)
{
+ struct afs_call *call;
+ size_t volnamesz, reqsz, padsz;
__be32 *bp;
- __u32 tmp;
- int loop, ret;
-
- _enter("{op=%p cst=%u}", op, op->call->app_call_state);
-
- memset(entry, 0, sizeof(*entry));
-
- if (op->call->app_call_state == RXRPC_CSTATE_COMPLETE) {
- /* operation finished */
- afs_kafsasyncd_terminate_op(op);
-
- bp = op->call->app_scr_ptr;
-
- /* unmarshall the reply */
- for (loop = 0; loop < 64; loop++)
- entry->name[loop] = ntohl(*bp++);
- bp++; /* final NUL */
-
- bp++; /* type */
- entry->nservers = ntohl(*bp++);
-
- for (loop = 0; loop < 8; loop++)
- entry->servers[loop].s_addr = *bp++;
-
- bp += 8; /* partition IDs */
-
- for (loop = 0; loop < 8; loop++) {
- tmp = ntohl(*bp++);
- if (tmp & AFS_VLSF_RWVOL)
- entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
- if (tmp & AFS_VLSF_ROVOL)
- entry->srvtmask[loop] |= AFS_VOL_VTM_RO;
- if (tmp & AFS_VLSF_BACKVOL)
- entry->srvtmask[loop] |= AFS_VOL_VTM_BAK;
- }
-
- entry->vid[0] = ntohl(*bp++);
- entry->vid[1] = ntohl(*bp++);
- entry->vid[2] = ntohl(*bp++);
-
- bp++; /* clone ID */
-
- tmp = ntohl(*bp++); /* flags */
- if (tmp & AFS_VLF_RWEXISTS)
- entry->vidmask |= AFS_VOL_VTM_RW;
- if (tmp & AFS_VLF_ROEXISTS)
- entry->vidmask |= AFS_VOL_VTM_RO;
- if (tmp & AFS_VLF_BACKEXISTS)
- entry->vidmask |= AFS_VOL_VTM_BAK;
-
- ret = -ENOMEDIUM;
- if (!entry->vidmask) {
- rxrpc_call_abort(op->call, ret);
- goto done;
- }
-
-#if 0 /* TODO: remove */
- entry->nservers = 3;
- entry->servers[0].s_addr = htonl(0xac101249);
- entry->servers[1].s_addr = htonl(0xac101243);
- entry->servers[2].s_addr = htonl(0xac10125b /*0xac10125b*/);
-
- entry->srvtmask[0] = AFS_VOL_VTM_RO;
- entry->srvtmask[1] = AFS_VOL_VTM_RO;
- entry->srvtmask[2] = AFS_VOL_VTM_RO | AFS_VOL_VTM_RW;
-#endif
-
- /* success */
- entry->rtime = get_seconds();
- ret = 0;
- goto done;
- }
- if (op->call->app_call_state == RXRPC_CSTATE_ERROR) {
- /* operation error */
- ret = op->call->app_errno;
- goto done;
- }
+ _enter("");
- _leave(" = -EAGAIN");
- return -EAGAIN;
+ volnamesz = strlen(volname);
+ padsz = (4 - (volnamesz & 3)) & 3;
+ reqsz = 8 + volnamesz + padsz;
- done:
- rxrpc_put_call(op->call);
- op->call = NULL;
- _leave(" = %d", ret);
- return ret;
-} /* end afs_rxvl_get_entry_by_id_async2() */
+ call = afs_alloc_flat_call(&afs_RXVLGetEntryByName, reqsz, 384);
+ if (!call)
+ return -ENOMEM;
-/*****************************************************************************/
-/*
- * handle attention events on an async get-entry-by-ID op
- * - called from krxiod
- */
-static void afs_rxvl_get_entry_by_id_attn(struct rxrpc_call *call)
-{
- struct afs_async_op *op = call->app_user;
-
- _enter("{op=%p cst=%u}", op, call->app_call_state);
-
- switch (call->app_call_state) {
- case RXRPC_CSTATE_COMPLETE:
- afs_kafsasyncd_attend_op(op);
- break;
- case RXRPC_CSTATE_CLNT_RCV_REPLY:
- if (call->app_async_read)
- break;
- case RXRPC_CSTATE_CLNT_GOT_REPLY:
- if (call->app_read_count == 0)
- break;
- printk("kAFS: Reply bigger than expected"
- " {cst=%u asyn=%d mark=%Zu rdy=%Zu pr=%u%s}",
- call->app_call_state,
- call->app_async_read,
- call->app_mark,
- call->app_ready_qty,
- call->pkt_rcv_count,
- call->app_last_rcv ? " last" : "");
-
- rxrpc_call_abort(call, -EBADMSG);
- break;
- default:
- BUG();
- }
+ call->key = key;
+ call->reply = entry;
+ call->service_id = VL_SERVICE;
+ call->port = htons(AFS_VL_PORT);
- _leave("");
+ /* marshall the parameters */
+ bp = call->request;
+ *bp++ = htonl(VLGETENTRYBYNAME);
+ *bp++ = htonl(volnamesz);
+ memcpy(bp, volname, volnamesz);
+ if (padsz > 0)
+ memset((void *) bp + volnamesz, 0, padsz);
-} /* end afs_rxvl_get_entry_by_id_attn() */
+ /* initiate the call */
+ return afs_make_call(addr, call, GFP_KERNEL, wait_mode);
+}
-/*****************************************************************************/
/*
- * handle error events on an async get-entry-by-ID op
- * - called from krxiod
+ * dispatch a get volume entry by ID operation
*/
-static void afs_rxvl_get_entry_by_id_error(struct rxrpc_call *call)
+int afs_vl_get_entry_by_id(struct in_addr *addr,
+ struct key *key,
+ afs_volid_t volid,
+ afs_voltype_t voltype,
+ struct afs_cache_vlocation *entry,
+ const struct afs_wait_mode *wait_mode)
{
- struct afs_async_op *op = call->app_user;
+ struct afs_call *call;
+ __be32 *bp;
- _enter("{op=%p cst=%u}", op, call->app_call_state);
+ _enter("");
- afs_kafsasyncd_attend_op(op);
+ call = afs_alloc_flat_call(&afs_RXVLGetEntryById, 12, 384);
+ if (!call)
+ return -ENOMEM;
- _leave("");
+ call->key = key;
+ call->reply = entry;
+ call->service_id = VL_SERVICE;
+ call->port = htons(AFS_VL_PORT);
-} /* end afs_rxvl_get_entry_by_id_error() */
+ /* marshall the parameters */
+ bp = call->request;
+ *bp++ = htonl(VLGETENTRYBYID);
+ *bp++ = htonl(volid);
+ *bp = htonl(voltype);
+
+ /* initiate the call */
+ return afs_make_call(addr, call, GFP_KERNEL, wait_mode);
+}
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 782ee7c600ca..74cce174882a 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -1,6 +1,6 @@
-/* vlocation.c: volume location management
+/* AFS volume location management
*
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
@@ -12,131 +12,61 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include "volume.h"
-#include "cell.h"
-#include "cmservice.h"
-#include "fsclient.h"
-#include "vlclient.h"
-#include "kafstimod.h"
-#include <rxrpc/connection.h>
#include "internal.h"
-#define AFS_VLDB_TIMEOUT HZ*1000
+unsigned afs_vlocation_timeout = 10; /* volume location timeout in seconds */
+unsigned afs_vlocation_update_timeout = 10 * 60;
-static void afs_vlocation_update_timer(struct afs_timer *timer);
-static void afs_vlocation_update_attend(struct afs_async_op *op);
-static void afs_vlocation_update_discard(struct afs_async_op *op);
-static void __afs_put_vlocation(struct afs_vlocation *vlocation);
+static void afs_vlocation_reaper(struct work_struct *);
+static void afs_vlocation_updater(struct work_struct *);
-static void __afs_vlocation_timeout(struct afs_timer *timer)
-{
- struct afs_vlocation *vlocation =
- list_entry(timer, struct afs_vlocation, timeout);
-
- _debug("VL TIMEOUT [%s{u=%d}]",
- vlocation->vldb.name, atomic_read(&vlocation->usage));
-
- afs_vlocation_do_timeout(vlocation);
-}
-
-static const struct afs_timer_ops afs_vlocation_timer_ops = {
- .timed_out = __afs_vlocation_timeout,
-};
+static LIST_HEAD(afs_vlocation_updates);
+static LIST_HEAD(afs_vlocation_graveyard);
+static DEFINE_SPINLOCK(afs_vlocation_updates_lock);
+static DEFINE_SPINLOCK(afs_vlocation_graveyard_lock);
+static DECLARE_DELAYED_WORK(afs_vlocation_reap, afs_vlocation_reaper);
+static DECLARE_DELAYED_WORK(afs_vlocation_update, afs_vlocation_updater);
+static struct workqueue_struct *afs_vlocation_update_worker;
-static const struct afs_timer_ops afs_vlocation_update_timer_ops = {
- .timed_out = afs_vlocation_update_timer,
-};
-
-static const struct afs_async_op_ops afs_vlocation_update_op_ops = {
- .attend = afs_vlocation_update_attend,
- .discard = afs_vlocation_update_discard,
-};
-
-static LIST_HEAD(afs_vlocation_update_pendq); /* queue of VLs awaiting update */
-static struct afs_vlocation *afs_vlocation_update; /* VL currently being updated */
-static DEFINE_SPINLOCK(afs_vlocation_update_lock); /* lock guarding update queue */
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
- const void *entry);
-static void afs_vlocation_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vlocation_cache_index_def = {
- .name = "vldb",
- .data_size = sizeof(struct afs_cache_vlocation),
- .keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
- .match = afs_vlocation_cache_match,
- .update = afs_vlocation_cache_update,
-};
-#endif
-
-/*****************************************************************************/
/*
* iterate through the VL servers in a cell until one of them admits knowing
* about the volume in question
- * - caller must have cell->vl_sem write-locked
*/
-static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vlocation,
- const char *name,
- unsigned namesz,
+static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl,
+ struct key *key,
struct afs_cache_vlocation *vldb)
{
- struct afs_server *server = NULL;
- struct afs_cell *cell = vlocation->cell;
+ struct afs_cell *cell = vl->cell;
+ struct in_addr addr;
int count, ret;
- _enter("%s,%*.*s,%u", cell->name, namesz, namesz, name, namesz);
+ _enter("%s,%s", cell->name, vl->vldb.name);
+ down_write(&vl->cell->vl_sem);
ret = -ENOMEDIUM;
for (count = cell->vl_naddrs; count > 0; count--) {
- _debug("CellServ[%hu]: %08x",
- cell->vl_curr_svix,
- cell->vl_addrs[cell->vl_curr_svix].s_addr);
-
- /* try and create a server */
- ret = afs_server_lookup(cell,
- &cell->vl_addrs[cell->vl_curr_svix],
- &server);
- switch (ret) {
- case 0:
- break;
- case -ENOMEM:
- case -ENONET:
- goto out;
- default:
- goto rotate;
- }
+ addr = cell->vl_addrs[cell->vl_curr_svix];
+
+ _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
/* attempt to access the VL server */
- ret = afs_rxvl_get_entry_by_name(server, name, namesz, vldb);
+ ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb,
+ &afs_sync_call);
switch (ret) {
case 0:
- afs_put_server(server);
goto out;
case -ENOMEM:
case -ENONET:
case -ENETUNREACH:
case -EHOSTUNREACH:
case -ECONNREFUSED:
- down_write(&server->sem);
- if (server->vlserver) {
- rxrpc_put_connection(server->vlserver);
- server->vlserver = NULL;
- }
- up_write(&server->sem);
- afs_put_server(server);
if (ret == -ENOMEM || ret == -ENONET)
goto out;
goto rotate;
case -ENOMEDIUM:
- afs_put_server(server);
goto out;
default:
- afs_put_server(server);
- ret = -ENOMEDIUM;
+ ret = -EIO;
goto rotate;
}
@@ -146,76 +76,66 @@ static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vlocation,
cell->vl_curr_svix %= cell->vl_naddrs;
}
- out:
+out:
+ up_write(&vl->cell->vl_sem);
_leave(" = %d", ret);
return ret;
+}
-} /* end afs_vlocation_access_vl_by_name() */
-
-/*****************************************************************************/
/*
* iterate through the VL servers in a cell until one of them admits knowing
* about the volume in question
- * - caller must have cell->vl_sem write-locked
*/
-static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vlocation,
+static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
+ struct key *key,
afs_volid_t volid,
afs_voltype_t voltype,
struct afs_cache_vlocation *vldb)
{
- struct afs_server *server = NULL;
- struct afs_cell *cell = vlocation->cell;
+ struct afs_cell *cell = vl->cell;
+ struct in_addr addr;
int count, ret;
_enter("%s,%x,%d,", cell->name, volid, voltype);
+ down_write(&vl->cell->vl_sem);
ret = -ENOMEDIUM;
for (count = cell->vl_naddrs; count > 0; count--) {
- _debug("CellServ[%hu]: %08x",
- cell->vl_curr_svix,
- cell->vl_addrs[cell->vl_curr_svix].s_addr);
-
- /* try and create a server */
- ret = afs_server_lookup(cell,
- &cell->vl_addrs[cell->vl_curr_svix],
- &server);
- switch (ret) {
- case 0:
- break;
- case -ENOMEM:
- case -ENONET:
- goto out;
- default:
- goto rotate;
- }
+ addr = cell->vl_addrs[cell->vl_curr_svix];
+
+ _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
/* attempt to access the VL server */
- ret = afs_rxvl_get_entry_by_id(server, volid, voltype, vldb);
+ ret = afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb,
+ &afs_sync_call);
switch (ret) {
case 0:
- afs_put_server(server);
goto out;
case -ENOMEM:
case -ENONET:
case -ENETUNREACH:
case -EHOSTUNREACH:
case -ECONNREFUSED:
- down_write(&server->sem);
- if (server->vlserver) {
- rxrpc_put_connection(server->vlserver);
- server->vlserver = NULL;
- }
- up_write(&server->sem);
- afs_put_server(server);
if (ret == -ENOMEM || ret == -ENONET)
goto out;
goto rotate;
+ case -EBUSY:
+ vl->upd_busy_cnt++;
+ if (vl->upd_busy_cnt <= 3) {
+ if (vl->upd_busy_cnt > 1) {
+ /* second+ BUSY - sleep a little bit */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(1);
+ __set_current_state(TASK_RUNNING);
+ }
+ continue;
+ }
+ break;
case -ENOMEDIUM:
- afs_put_server(server);
- goto out;
+ vl->upd_rej_cnt++;
+ goto rotate;
default:
- afs_put_server(server);
- ret = -ENOMEDIUM;
+ ret = -EIO;
goto rotate;
}
@@ -223,729 +143,580 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vlocation,
rotate:
cell->vl_curr_svix++;
cell->vl_curr_svix %= cell->vl_naddrs;
+ vl->upd_busy_cnt = 0;
}
- out:
+out:
+ if (ret < 0 && vl->upd_rej_cnt > 0) {
+ printk(KERN_NOTICE "kAFS:"
+ " Active volume no longer valid '%s'\n",
+ vl->vldb.name);
+ vl->valid = 0;
+ ret = -ENOMEDIUM;
+ }
+
+ up_write(&vl->cell->vl_sem);
_leave(" = %d", ret);
return ret;
+}
-} /* end afs_vlocation_access_vl_by_id() */
-
-/*****************************************************************************/
/*
- * lookup volume location
- * - caller must have cell->vol_sem write-locked
- * - iterate through the VL servers in a cell until one of them admits knowing
- * about the volume in question
- * - lookup in the local cache if not able to find on the VL server
- * - insert/update in the local cache if did get a VL response
+ * allocate a volume location record
*/
-int afs_vlocation_lookup(struct afs_cell *cell,
- const char *name,
- unsigned namesz,
- struct afs_vlocation **_vlocation)
+static struct afs_vlocation *afs_vlocation_alloc(struct afs_cell *cell,
+ const char *name,
+ size_t namesz)
{
- struct afs_cache_vlocation vldb;
- struct afs_vlocation *vlocation;
- afs_voltype_t voltype;
- afs_volid_t vid;
- int active = 0, ret;
-
- _enter("{%s},%*.*s,%u,", cell->name, namesz, namesz, name, namesz);
-
- if (namesz > sizeof(vlocation->vldb.name)) {
- _leave(" = -ENAMETOOLONG");
- return -ENAMETOOLONG;
- }
-
- /* search the cell's active list first */
- list_for_each_entry(vlocation, &cell->vl_list, link) {
- if (namesz < sizeof(vlocation->vldb.name) &&
- vlocation->vldb.name[namesz] != '\0')
- continue;
-
- if (memcmp(vlocation->vldb.name, name, namesz) == 0)
- goto found_in_memory;
- }
-
- /* search the cell's graveyard list second */
- spin_lock(&cell->vl_gylock);
- list_for_each_entry(vlocation, &cell->vl_graveyard, link) {
- if (namesz < sizeof(vlocation->vldb.name) &&
- vlocation->vldb.name[namesz] != '\0')
- continue;
-
- if (memcmp(vlocation->vldb.name, name, namesz) == 0)
- goto found_in_graveyard;
- }
- spin_unlock(&cell->vl_gylock);
-
- /* not in the cell's in-memory lists - create a new record */
- vlocation = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL);
- if (!vlocation)
- return -ENOMEM;
-
- atomic_set(&vlocation->usage, 1);
- INIT_LIST_HEAD(&vlocation->link);
- rwlock_init(&vlocation->lock);
- memcpy(vlocation->vldb.name, name, namesz);
-
- afs_timer_init(&vlocation->timeout, &afs_vlocation_timer_ops);
- afs_timer_init(&vlocation->upd_timer, &afs_vlocation_update_timer_ops);
- afs_async_op_init(&vlocation->upd_op, &afs_vlocation_update_op_ops);
-
- afs_get_cell(cell);
- vlocation->cell = cell;
-
- list_add_tail(&vlocation->link, &cell->vl_list);
-
-#ifdef AFS_CACHING_SUPPORT
- /* we want to store it in the cache, plus it might already be
- * encached */
- cachefs_acquire_cookie(cell->cache,
- &afs_volume_cache_index_def,
- vlocation,
- &vlocation->cache);
-
- if (vlocation->valid)
- goto found_in_cache;
-#endif
-
- /* try to look up an unknown volume in the cell VL databases by name */
- ret = afs_vlocation_access_vl_by_name(vlocation, name, namesz, &vldb);
- if (ret < 0) {
- printk("kAFS: failed to locate '%*.*s' in cell '%s'\n",
- namesz, namesz, name, cell->name);
- goto error;
+ struct afs_vlocation *vl;
+
+ vl = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL);
+ if (vl) {
+ vl->cell = cell;
+ vl->state = AFS_VL_NEW;
+ atomic_set(&vl->usage, 1);
+ INIT_LIST_HEAD(&vl->link);
+ INIT_LIST_HEAD(&vl->grave);
+ INIT_LIST_HEAD(&vl->update);
+ init_waitqueue_head(&vl->waitq);
+ spin_lock_init(&vl->lock);
+ memcpy(vl->vldb.name, name, namesz);
}
- goto found_on_vlserver;
-
- found_in_graveyard:
- /* found in the graveyard - resurrect */
- _debug("found in graveyard");
- atomic_inc(&vlocation->usage);
- list_move_tail(&vlocation->link, &cell->vl_list);
- spin_unlock(&cell->vl_gylock);
-
- afs_kafstimod_del_timer(&vlocation->timeout);
- goto active;
-
- found_in_memory:
- /* found in memory - check to see if it's active */
- _debug("found in memory");
- atomic_inc(&vlocation->usage);
+ _leave(" = %p", vl);
+ return vl;
+}
- active:
- active = 1;
+/*
+ * update record if we found it in the cache
+ */
+static int afs_vlocation_update_record(struct afs_vlocation *vl,
+ struct key *key,
+ struct afs_cache_vlocation *vldb)
+{
+ afs_voltype_t voltype;
+ afs_volid_t vid;
+ int ret;
-#ifdef AFS_CACHING_SUPPORT
- found_in_cache:
-#endif
/* try to look up a cached volume in the cell VL databases by ID */
- _debug("found in cache");
-
_debug("Locally Cached: %s %02x { %08x(%x) %08x(%x) %08x(%x) }",
- vlocation->vldb.name,
- vlocation->vldb.vidmask,
- ntohl(vlocation->vldb.servers[0].s_addr),
- vlocation->vldb.srvtmask[0],
- ntohl(vlocation->vldb.servers[1].s_addr),
- vlocation->vldb.srvtmask[1],
- ntohl(vlocation->vldb.servers[2].s_addr),
- vlocation->vldb.srvtmask[2]
- );
+ vl->vldb.name,
+ vl->vldb.vidmask,
+ ntohl(vl->vldb.servers[0].s_addr),
+ vl->vldb.srvtmask[0],
+ ntohl(vl->vldb.servers[1].s_addr),
+ vl->vldb.srvtmask[1],
+ ntohl(vl->vldb.servers[2].s_addr),
+ vl->vldb.srvtmask[2]);
_debug("Vids: %08x %08x %08x",
- vlocation->vldb.vid[0],
- vlocation->vldb.vid[1],
- vlocation->vldb.vid[2]);
+ vl->vldb.vid[0],
+ vl->vldb.vid[1],
+ vl->vldb.vid[2]);
- if (vlocation->vldb.vidmask & AFS_VOL_VTM_RW) {
- vid = vlocation->vldb.vid[0];
+ if (vl->vldb.vidmask & AFS_VOL_VTM_RW) {
+ vid = vl->vldb.vid[0];
voltype = AFSVL_RWVOL;
- }
- else if (vlocation->vldb.vidmask & AFS_VOL_VTM_RO) {
- vid = vlocation->vldb.vid[1];
+ } else if (vl->vldb.vidmask & AFS_VOL_VTM_RO) {
+ vid = vl->vldb.vid[1];
voltype = AFSVL_ROVOL;
- }
- else if (vlocation->vldb.vidmask & AFS_VOL_VTM_BAK) {
- vid = vlocation->vldb.vid[2];
+ } else if (vl->vldb.vidmask & AFS_VOL_VTM_BAK) {
+ vid = vl->vldb.vid[2];
voltype = AFSVL_BACKVOL;
- }
- else {
+ } else {
BUG();
vid = 0;
voltype = 0;
}
- ret = afs_vlocation_access_vl_by_id(vlocation, vid, voltype, &vldb);
+ /* contact the server to make sure the volume is still available
+ * - TODO: need to handle disconnected operation here
+ */
+ ret = afs_vlocation_access_vl_by_id(vl, key, vid, voltype, vldb);
switch (ret) {
/* net error */
default:
- printk("kAFS: failed to volume '%*.*s' (%x) up in '%s': %d\n",
- namesz, namesz, name, vid, cell->name, ret);
- goto error;
+ printk(KERN_WARNING "kAFS:"
+ " failed to update volume '%s' (%x) up in '%s': %d\n",
+ vl->vldb.name, vid, vl->cell->name, ret);
+ _leave(" = %d", ret);
+ return ret;
/* pulled from local cache into memory */
case 0:
- goto found_on_vlserver;
+ _leave(" = 0");
+ return 0;
/* uh oh... looks like the volume got deleted */
case -ENOMEDIUM:
- printk("kAFS: volume '%*.*s' (%x) does not exist '%s'\n",
- namesz, namesz, name, vid, cell->name);
+ printk(KERN_ERR "kAFS:"
+ " volume '%s' (%x) does not exist '%s'\n",
+ vl->vldb.name, vid, vl->cell->name);
/* TODO: make existing record unavailable */
- goto error;
+ _leave(" = %d", ret);
+ return ret;
}
+}
- found_on_vlserver:
- _debug("Done VL Lookup: %*.*s %02x { %08x(%x) %08x(%x) %08x(%x) }",
- namesz, namesz, name,
- vldb.vidmask,
- ntohl(vldb.servers[0].s_addr), vldb.srvtmask[0],
- ntohl(vldb.servers[1].s_addr), vldb.srvtmask[1],
- ntohl(vldb.servers[2].s_addr), vldb.srvtmask[2]
- );
-
- _debug("Vids: %08x %08x %08x", vldb.vid[0], vldb.vid[1], vldb.vid[2]);
+/*
+ * apply the update to a VL record
+ */
+static void afs_vlocation_apply_update(struct afs_vlocation *vl,
+ struct afs_cache_vlocation *vldb)
+{
+ _debug("Done VL Lookup: %s %02x { %08x(%x) %08x(%x) %08x(%x) }",
+ vldb->name, vldb->vidmask,
+ ntohl(vldb->servers[0].s_addr), vldb->srvtmask[0],
+ ntohl(vldb->servers[1].s_addr), vldb->srvtmask[1],
+ ntohl(vldb->servers[2].s_addr), vldb->srvtmask[2]);
- if ((namesz < sizeof(vlocation->vldb.name) &&
- vlocation->vldb.name[namesz] != '\0') ||
- memcmp(vldb.name, name, namesz) != 0)
- printk("kAFS: name of volume '%*.*s' changed to '%s' on server\n",
- namesz, namesz, name, vldb.name);
+ _debug("Vids: %08x %08x %08x",
+ vldb->vid[0], vldb->vid[1], vldb->vid[2]);
- memcpy(&vlocation->vldb, &vldb, sizeof(vlocation->vldb));
+ if (strcmp(vldb->name, vl->vldb.name) != 0)
+ printk(KERN_NOTICE "kAFS:"
+ " name of volume '%s' changed to '%s' on server\n",
+ vl->vldb.name, vldb->name);
- afs_kafstimod_add_timer(&vlocation->upd_timer, 10 * HZ);
+ vl->vldb = *vldb;
#ifdef AFS_CACHING_SUPPORT
/* update volume entry in local cache */
- cachefs_update_cookie(vlocation->cache);
-#endif
-
- *_vlocation = vlocation;
- _leave(" = 0 (%p)",vlocation);
- return 0;
-
- error:
- if (vlocation) {
- if (active) {
- __afs_put_vlocation(vlocation);
- }
- else {
- list_del(&vlocation->link);
-#ifdef AFS_CACHING_SUPPORT
- cachefs_relinquish_cookie(vlocation->cache, 0);
+ cachefs_update_cookie(vl->cache);
#endif
- afs_put_cell(vlocation->cell);
- kfree(vlocation);
- }
- }
-
- _leave(" = %d", ret);
- return ret;
-} /* end afs_vlocation_lookup() */
+}
-/*****************************************************************************/
/*
- * finish using a volume location record
- * - caller must have cell->vol_sem write-locked
+ * fill in a volume location record, consulting the cache and the VL server
+ * both
*/
-static void __afs_put_vlocation(struct afs_vlocation *vlocation)
+static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
+ struct key *key)
{
- struct afs_cell *cell;
+ struct afs_cache_vlocation vldb;
+ int ret;
- if (!vlocation)
- return;
+ _enter("");
- _enter("%s", vlocation->vldb.name);
+ ASSERTCMP(vl->valid, ==, 0);
- cell = vlocation->cell;
+ memset(&vldb, 0, sizeof(vldb));
- /* sanity check */
- BUG_ON(atomic_read(&vlocation->usage) <= 0);
+ /* see if we have an in-cache copy (will set vl->valid if there is) */
+#ifdef AFS_CACHING_SUPPORT
+ cachefs_acquire_cookie(cell->cache,
+ &afs_volume_cache_index_def,
+ vlocation,
+ &vl->cache);
+#endif
- spin_lock(&cell->vl_gylock);
- if (likely(!atomic_dec_and_test(&vlocation->usage))) {
- spin_unlock(&cell->vl_gylock);
- _leave("");
- return;
+ if (vl->valid) {
+ /* try to update a known volume in the cell VL databases by
+ * ID as the name may have changed */
+ _debug("found in cache");
+ ret = afs_vlocation_update_record(vl, key, &vldb);
+ } else {
+ /* try to look up an unknown volume in the cell VL databases by
+ * name */
+ ret = afs_vlocation_access_vl_by_name(vl, key, &vldb);
+ if (ret < 0) {
+ printk("kAFS: failed to locate '%s' in cell '%s'\n",
+ vl->vldb.name, vl->cell->name);
+ return ret;
+ }
}
- /* move to graveyard queue */
- list_move_tail(&vlocation->link,&cell->vl_graveyard);
-
- /* remove from pending timeout queue (refcounted if actually being
- * updated) */
- list_del_init(&vlocation->upd_op.link);
-
- /* time out in 10 secs */
- afs_kafstimod_del_timer(&vlocation->upd_timer);
- afs_kafstimod_add_timer(&vlocation->timeout, 10 * HZ);
-
- spin_unlock(&cell->vl_gylock);
-
- _leave(" [killed]");
-} /* end __afs_put_vlocation() */
-
-/*****************************************************************************/
-/*
- * finish using a volume location record
- */
-void afs_put_vlocation(struct afs_vlocation *vlocation)
-{
- if (vlocation) {
- struct afs_cell *cell = vlocation->cell;
-
- down_write(&cell->vl_sem);
- __afs_put_vlocation(vlocation);
- up_write(&cell->vl_sem);
- }
-} /* end afs_put_vlocation() */
+ afs_vlocation_apply_update(vl, &vldb);
+ _leave(" = 0");
+ return 0;
+}
-/*****************************************************************************/
/*
- * timeout vlocation record
- * - removes from the cell's graveyard if the usage count is zero
+ * queue a vlocation record for updates
*/
-void afs_vlocation_do_timeout(struct afs_vlocation *vlocation)
+void afs_vlocation_queue_for_updates(struct afs_vlocation *vl)
{
- struct afs_cell *cell;
+ struct afs_vlocation *xvl;
- _enter("%s", vlocation->vldb.name);
+ /* wait at least 10 minutes before updating... */
+ vl->update_at = get_seconds() + afs_vlocation_update_timeout;
- cell = vlocation->cell;
+ spin_lock(&afs_vlocation_updates_lock);
- BUG_ON(atomic_read(&vlocation->usage) < 0);
-
- /* remove from graveyard if still dead */
- spin_lock(&cell->vl_gylock);
- if (atomic_read(&vlocation->usage) == 0)
- list_del_init(&vlocation->link);
- else
- vlocation = NULL;
- spin_unlock(&cell->vl_gylock);
-
- if (!vlocation) {
- _leave("");
- return; /* resurrected */
+ if (!list_empty(&afs_vlocation_updates)) {
+ /* ... but wait at least 1 second more than the newest record
+ * already queued so that we don't spam the VL server suddenly
+ * with lots of requests
+ */
+ xvl = list_entry(afs_vlocation_updates.prev,
+ struct afs_vlocation, update);
+ if (vl->update_at <= xvl->update_at)
+ vl->update_at = xvl->update_at + 1;
+ } else {
+ queue_delayed_work(afs_vlocation_update_worker,
+ &afs_vlocation_update,
+ afs_vlocation_update_timeout * HZ);
}
- /* we can now destroy it properly */
-#ifdef AFS_CACHING_SUPPORT
- cachefs_relinquish_cookie(vlocation->cache, 0);
-#endif
- afs_put_cell(cell);
-
- kfree(vlocation);
-
- _leave(" [destroyed]");
-} /* end afs_vlocation_do_timeout() */
+ list_add_tail(&vl->update, &afs_vlocation_updates);
+ spin_unlock(&afs_vlocation_updates_lock);
+}
-/*****************************************************************************/
/*
- * send an update operation to the currently selected server
+ * lookup volume location
+ * - iterate through the VL servers in a cell until one of them admits knowing
+ * about the volume in question
+ * - lookup in the local cache if not able to find on the VL server
+ * - insert/update in the local cache if did get a VL response
*/
-static int afs_vlocation_update_begin(struct afs_vlocation *vlocation)
+struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell,
+ struct key *key,
+ const char *name,
+ size_t namesz)
{
- afs_voltype_t voltype;
- afs_volid_t vid;
+ struct afs_vlocation *vl;
int ret;
- _enter("%s{ufs=%u ucs=%u}",
- vlocation->vldb.name,
- vlocation->upd_first_svix,
- vlocation->upd_curr_svix);
+ _enter("{%s},{%x},%*.*s,%zu",
+ cell->name, key_serial(key),
+ (int) namesz, (int) namesz, name, namesz);
- /* try to look up a cached volume in the cell VL databases by ID */
- if (vlocation->vldb.vidmask & AFS_VOL_VTM_RW) {
- vid = vlocation->vldb.vid[0];
- voltype = AFSVL_RWVOL;
- }
- else if (vlocation->vldb.vidmask & AFS_VOL_VTM_RO) {
- vid = vlocation->vldb.vid[1];
- voltype = AFSVL_ROVOL;
+ if (namesz > sizeof(vl->vldb.name)) {
+ _leave(" = -ENAMETOOLONG");
+ return ERR_PTR(-ENAMETOOLONG);
}
- else if (vlocation->vldb.vidmask & AFS_VOL_VTM_BAK) {
- vid = vlocation->vldb.vid[2];
- voltype = AFSVL_BACKVOL;
+
+ /* see if we have an in-memory copy first */
+ down_write(&cell->vl_sem);
+ spin_lock(&cell->vl_lock);
+ list_for_each_entry(vl, &cell->vl_list, link) {
+ if (vl->vldb.name[namesz] != '\0')
+ continue;
+ if (memcmp(vl->vldb.name, name, namesz) == 0)
+ goto found_in_memory;
}
- else {
- BUG();
- vid = 0;
- voltype = 0;
+ spin_unlock(&cell->vl_lock);
+
+ /* not in the cell's in-memory lists - create a new record */
+ vl = afs_vlocation_alloc(cell, name, namesz);
+ if (!vl) {
+ up_write(&cell->vl_sem);
+ return ERR_PTR(-ENOMEM);
}
- /* contact the chosen server */
- ret = afs_server_lookup(
- vlocation->cell,
- &vlocation->cell->vl_addrs[vlocation->upd_curr_svix],
- &vlocation->upd_op.server);
+ afs_get_cell(cell);
- switch (ret) {
- case 0:
- break;
- case -ENOMEM:
- case -ENONET:
- default:
- _leave(" = %d", ret);
- return ret;
- }
+ list_add_tail(&vl->link, &cell->vl_list);
+ vl->state = AFS_VL_CREATING;
+ up_write(&cell->vl_sem);
- /* initiate the update operation */
- ret = afs_rxvl_get_entry_by_id_async(&vlocation->upd_op, vid, voltype);
- if (ret < 0) {
- _leave(" = %d", ret);
- return ret;
+fill_in_record:
+ ret = afs_vlocation_fill_in_record(vl, key);
+ if (ret < 0)
+ goto error_abandon;
+ spin_lock(&vl->lock);
+ vl->state = AFS_VL_VALID;
+ wake_up(&vl->waitq);
+ spin_unlock(&vl->lock);
+
+ /* schedule for regular updates */
+ afs_vlocation_queue_for_updates(vl);
+ goto success;
+
+found_in_memory:
+ /* found in memory */
+ _debug("found in memory");
+ atomic_inc(&vl->usage);
+ spin_unlock(&cell->vl_lock);
+ if (!list_empty(&vl->grave)) {
+ spin_lock(&afs_vlocation_graveyard_lock);
+ list_del_init(&vl->grave);
+ spin_unlock(&afs_vlocation_graveyard_lock);
}
+ up_write(&cell->vl_sem);
+
+ /* see if it was an abandoned record that we might try filling in */
+ spin_lock(&vl->lock);
+ while (vl->state != AFS_VL_VALID) {
+ afs_vlocation_state_t state = vl->state;
+
+ _debug("invalid [state %d]", state);
+
+ if ((state == AFS_VL_NEW || state == AFS_VL_NO_VOLUME)) {
+ vl->state = AFS_VL_CREATING;
+ spin_unlock(&vl->lock);
+ goto fill_in_record;
+ }
+
+ /* must now wait for creation or update by someone else to
+ * complete */
+ _debug("wait");
+ spin_unlock(&vl->lock);
+ ret = wait_event_interruptible(
+ vl->waitq,
+ vl->state == AFS_VL_NEW ||
+ vl->state == AFS_VL_VALID ||
+ vl->state == AFS_VL_NO_VOLUME);
+ if (ret < 0)
+ goto error;
+ spin_lock(&vl->lock);
+ }
+ spin_unlock(&vl->lock);
+
+success:
+ _leave(" = %p",vl);
+ return vl;
+
+error_abandon:
+ spin_lock(&vl->lock);
+ vl->state = AFS_VL_NEW;
+ wake_up(&vl->waitq);
+ spin_unlock(&vl->lock);
+error:
+ ASSERT(vl != NULL);
+ afs_put_vlocation(vl);
_leave(" = %d", ret);
- return ret;
-} /* end afs_vlocation_update_begin() */
+ return ERR_PTR(ret);
+}
-/*****************************************************************************/
/*
- * abandon updating a VL record
- * - does not restart the update timer
+ * finish using a volume location record
*/
-static void afs_vlocation_update_abandon(struct afs_vlocation *vlocation,
- afs_vlocation_upd_t state,
- int ret)
+void afs_put_vlocation(struct afs_vlocation *vl)
{
- _enter("%s,%u", vlocation->vldb.name, state);
-
- if (ret < 0)
- printk("kAFS: Abandoning VL update '%s': %d\n",
- vlocation->vldb.name, ret);
-
- /* discard the server record */
- afs_put_server(vlocation->upd_op.server);
- vlocation->upd_op.server = NULL;
+ if (!vl)
+ return;
- spin_lock(&afs_vlocation_update_lock);
- afs_vlocation_update = NULL;
- vlocation->upd_state = state;
+ _enter("%s", vl->vldb.name);
- /* TODO: start updating next VL record on pending list */
+ ASSERTCMP(atomic_read(&vl->usage), >, 0);
- spin_unlock(&afs_vlocation_update_lock);
+ if (likely(!atomic_dec_and_test(&vl->usage))) {
+ _leave("");
+ return;
+ }
- _leave("");
-} /* end afs_vlocation_update_abandon() */
+ spin_lock(&afs_vlocation_graveyard_lock);
+ if (atomic_read(&vl->usage) == 0) {
+ _debug("buried");
+ list_move_tail(&vl->grave, &afs_vlocation_graveyard);
+ vl->time_of_death = get_seconds();
+ schedule_delayed_work(&afs_vlocation_reap,
+ afs_vlocation_timeout * HZ);
+
+ /* suspend updates on this record */
+ if (!list_empty(&vl->update)) {
+ spin_lock(&afs_vlocation_updates_lock);
+ list_del_init(&vl->update);
+ spin_unlock(&afs_vlocation_updates_lock);
+ }
+ }
+ spin_unlock(&afs_vlocation_graveyard_lock);
+ _leave(" [killed?]");
+}
-/*****************************************************************************/
/*
- * handle periodic update timeouts and busy retry timeouts
- * - called from kafstimod
+ * destroy a dead volume location record
*/
-static void afs_vlocation_update_timer(struct afs_timer *timer)
+static void afs_vlocation_destroy(struct afs_vlocation *vl)
{
- struct afs_vlocation *vlocation =
- list_entry(timer, struct afs_vlocation, upd_timer);
- int ret;
+ _enter("%p", vl);
- _enter("%s", vlocation->vldb.name);
+#ifdef AFS_CACHING_SUPPORT
+ cachefs_relinquish_cookie(vl->cache, 0);
+#endif
- /* only update if not in the graveyard (defend against putting too) */
- spin_lock(&vlocation->cell->vl_gylock);
+ afs_put_cell(vl->cell);
+ kfree(vl);
+}
- if (!atomic_read(&vlocation->usage))
- goto out_unlock1;
+/*
+ * reap dead volume location records
+ */
+static void afs_vlocation_reaper(struct work_struct *work)
+{
+ LIST_HEAD(corpses);
+ struct afs_vlocation *vl;
+ unsigned long delay, expiry;
+ time_t now;
- spin_lock(&afs_vlocation_update_lock);
+ _enter("");
- /* if we were woken up due to EBUSY sleep then restart immediately if
- * possible or else jump to front of pending queue */
- if (vlocation->upd_state == AFS_VLUPD_BUSYSLEEP) {
- if (afs_vlocation_update) {
- list_add(&vlocation->upd_op.link,
- &afs_vlocation_update_pendq);
+ now = get_seconds();
+ spin_lock(&afs_vlocation_graveyard_lock);
+
+ while (!list_empty(&afs_vlocation_graveyard)) {
+ vl = list_entry(afs_vlocation_graveyard.next,
+ struct afs_vlocation, grave);
+
+ _debug("check %p", vl);
+
+ /* the queue is ordered most dead first */
+ expiry = vl->time_of_death + afs_vlocation_timeout;
+ if (expiry > now) {
+ delay = (expiry - now) * HZ;
+ _debug("delay %lu", delay);
+ if (!schedule_delayed_work(&afs_vlocation_reap,
+ delay)) {
+ cancel_delayed_work(&afs_vlocation_reap);
+ schedule_delayed_work(&afs_vlocation_reap,
+ delay);
+ }
+ break;
}
- else {
- afs_get_vlocation(vlocation);
- afs_vlocation_update = vlocation;
- vlocation->upd_state = AFS_VLUPD_INPROGRESS;
+
+ spin_lock(&vl->cell->vl_lock);
+ if (atomic_read(&vl->usage) > 0) {
+ _debug("no reap");
+ list_del_init(&vl->grave);
+ } else {
+ _debug("reap");
+ list_move_tail(&vl->grave, &corpses);
+ list_del_init(&vl->link);
}
- goto out_unlock2;
+ spin_unlock(&vl->cell->vl_lock);
}
- /* put on pending queue if there's already another update in progress */
- if (afs_vlocation_update) {
- vlocation->upd_state = AFS_VLUPD_PENDING;
- list_add_tail(&vlocation->upd_op.link,
- &afs_vlocation_update_pendq);
- goto out_unlock2;
- }
+ spin_unlock(&afs_vlocation_graveyard_lock);
- /* hold a ref on it while actually updating */
- afs_get_vlocation(vlocation);
- afs_vlocation_update = vlocation;
- vlocation->upd_state = AFS_VLUPD_INPROGRESS;
-
- spin_unlock(&afs_vlocation_update_lock);
- spin_unlock(&vlocation->cell->vl_gylock);
-
- /* okay... we can start the update */
- _debug("BEGIN VL UPDATE [%s]", vlocation->vldb.name);
- vlocation->upd_first_svix = vlocation->cell->vl_curr_svix;
- vlocation->upd_curr_svix = vlocation->upd_first_svix;
- vlocation->upd_rej_cnt = 0;
- vlocation->upd_busy_cnt = 0;
-
- ret = afs_vlocation_update_begin(vlocation);
- if (ret < 0) {
- afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, ret);
- afs_kafstimod_add_timer(&vlocation->upd_timer,
- AFS_VLDB_TIMEOUT);
- afs_put_vlocation(vlocation);
+ /* now reap the corpses we've extracted */
+ while (!list_empty(&corpses)) {
+ vl = list_entry(corpses.next, struct afs_vlocation, grave);
+ list_del(&vl->grave);
+ afs_vlocation_destroy(vl);
}
_leave("");
- return;
+}
- out_unlock2:
- spin_unlock(&afs_vlocation_update_lock);
- out_unlock1:
- spin_unlock(&vlocation->cell->vl_gylock);
- _leave("");
- return;
+/*
+ * initialise the VL update process
+ */
+int __init afs_vlocation_update_init(void)
+{
+ afs_vlocation_update_worker =
+ create_singlethread_workqueue("kafs_vlupdated");
+ return afs_vlocation_update_worker ? 0 : -ENOMEM;
+}
-} /* end afs_vlocation_update_timer() */
+/*
+ * discard all the volume location records for rmmod
+ */
+void __exit afs_vlocation_purge(void)
+{
+ afs_vlocation_timeout = 0;
+
+ spin_lock(&afs_vlocation_updates_lock);
+ list_del_init(&afs_vlocation_updates);
+ spin_unlock(&afs_vlocation_updates_lock);
+ cancel_delayed_work(&afs_vlocation_update);
+ queue_delayed_work(afs_vlocation_update_worker,
+ &afs_vlocation_update, 0);
+ destroy_workqueue(afs_vlocation_update_worker);
+
+ cancel_delayed_work(&afs_vlocation_reap);
+ schedule_delayed_work(&afs_vlocation_reap, 0);
+}
-/*****************************************************************************/
/*
- * attend to an update operation upon which an event happened
- * - called in kafsasyncd context
+ * update a volume location
*/
-static void afs_vlocation_update_attend(struct afs_async_op *op)
+static void afs_vlocation_updater(struct work_struct *work)
{
struct afs_cache_vlocation vldb;
- struct afs_vlocation *vlocation =
- list_entry(op, struct afs_vlocation, upd_op);
- unsigned tmp;
+ struct afs_vlocation *vl, *xvl;
+ time_t now;
+ long timeout;
int ret;
- _enter("%s", vlocation->vldb.name);
-
- ret = afs_rxvl_get_entry_by_id_async2(op, &vldb);
- switch (ret) {
- case -EAGAIN:
- _leave(" [unfinished]");
- return;
-
- case 0:
- _debug("END VL UPDATE: %d\n", ret);
- vlocation->valid = 1;
-
- _debug("Done VL Lookup: %02x { %08x(%x) %08x(%x) %08x(%x) }",
- vldb.vidmask,
- ntohl(vldb.servers[0].s_addr), vldb.srvtmask[0],
- ntohl(vldb.servers[1].s_addr), vldb.srvtmask[1],
- ntohl(vldb.servers[2].s_addr), vldb.srvtmask[2]
- );
-
- _debug("Vids: %08x %08x %08x",
- vldb.vid[0], vldb.vid[1], vldb.vid[2]);
-
- afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, 0);
-
- down_write(&vlocation->cell->vl_sem);
-
- /* actually update the cache */
- if (strncmp(vldb.name, vlocation->vldb.name,
- sizeof(vlocation->vldb.name)) != 0)
- printk("kAFS: name of volume '%s'"
- " changed to '%s' on server\n",
- vlocation->vldb.name, vldb.name);
-
- memcpy(&vlocation->vldb, &vldb, sizeof(vlocation->vldb));
-
-#if 0
- /* TODO update volume entry in local cache */
-#endif
-
- up_write(&vlocation->cell->vl_sem);
-
- if (ret < 0)
- printk("kAFS: failed to update local cache: %d\n", ret);
-
- afs_kafstimod_add_timer(&vlocation->upd_timer,
- AFS_VLDB_TIMEOUT);
- afs_put_vlocation(vlocation);
- _leave(" [found]");
- return;
-
- case -ENOMEDIUM:
- vlocation->upd_rej_cnt++;
- goto try_next;
-
- /* the server is locked - retry in a very short while */
- case -EBUSY:
- vlocation->upd_busy_cnt++;
- if (vlocation->upd_busy_cnt > 3)
- goto try_next; /* too many retries */
-
- afs_vlocation_update_abandon(vlocation,
- AFS_VLUPD_BUSYSLEEP, 0);
- afs_kafstimod_add_timer(&vlocation->upd_timer, HZ / 2);
- afs_put_vlocation(vlocation);
- _leave(" [busy]");
- return;
-
- case -ENETUNREACH:
- case -EHOSTUNREACH:
- case -ECONNREFUSED:
- case -EREMOTEIO:
- /* record bad vlserver info in the cell too
- * - TODO: use down_write_trylock() if available
- */
- if (vlocation->upd_curr_svix == vlocation->cell->vl_curr_svix)
- vlocation->cell->vl_curr_svix =
- vlocation->cell->vl_curr_svix %
- vlocation->cell->vl_naddrs;
-
- case -EBADRQC:
- case -EINVAL:
- case -EACCES:
- case -EBADMSG:
- goto try_next;
-
- default:
- goto abandon;
- }
-
- /* try contacting the next server */
- try_next:
- vlocation->upd_busy_cnt = 0;
-
- /* discard the server record */
- afs_put_server(vlocation->upd_op.server);
- vlocation->upd_op.server = NULL;
+ _enter("");
- tmp = vlocation->cell->vl_naddrs;
- if (tmp == 0)
- goto abandon;
+ now = get_seconds();
- vlocation->upd_curr_svix++;
- if (vlocation->upd_curr_svix >= tmp)
- vlocation->upd_curr_svix = 0;
- if (vlocation->upd_first_svix >= tmp)
- vlocation->upd_first_svix = tmp - 1;
+ /* find a record to update */
+ spin_lock(&afs_vlocation_updates_lock);
+ for (;;) {
+ if (list_empty(&afs_vlocation_updates)) {
+ spin_unlock(&afs_vlocation_updates_lock);
+ _leave(" [nothing]");
+ return;
+ }
- /* move to the next server */
- if (vlocation->upd_curr_svix != vlocation->upd_first_svix) {
- afs_vlocation_update_begin(vlocation);
- _leave(" [next]");
- return;
+ vl = list_entry(afs_vlocation_updates.next,
+ struct afs_vlocation, update);
+ if (atomic_read(&vl->usage) > 0)
+ break;
+ list_del_init(&vl->update);
}
- /* run out of servers to try - was the volume rejected? */
- if (vlocation->upd_rej_cnt > 0) {
- printk("kAFS: Active volume no longer valid '%s'\n",
- vlocation->vldb.name);
- vlocation->valid = 0;
- afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, 0);
- afs_kafstimod_add_timer(&vlocation->upd_timer,
- AFS_VLDB_TIMEOUT);
- afs_put_vlocation(vlocation);
- _leave(" [invalidated]");
+ timeout = vl->update_at - now;
+ if (timeout > 0) {
+ queue_delayed_work(afs_vlocation_update_worker,
+ &afs_vlocation_update, timeout * HZ);
+ spin_unlock(&afs_vlocation_updates_lock);
+ _leave(" [nothing]");
return;
}
- /* abandon the update */
- abandon:
- afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, ret);
- afs_kafstimod_add_timer(&vlocation->upd_timer, HZ * 10);
- afs_put_vlocation(vlocation);
- _leave(" [abandoned]");
-
-} /* end afs_vlocation_update_attend() */
-
-/*****************************************************************************/
-/*
- * deal with an update operation being discarded
- * - called in kafsasyncd context when it's dying due to rmmod
- * - the call has already been aborted and put()'d
- */
-static void afs_vlocation_update_discard(struct afs_async_op *op)
-{
- struct afs_vlocation *vlocation =
- list_entry(op, struct afs_vlocation, upd_op);
+ list_del_init(&vl->update);
+ atomic_inc(&vl->usage);
+ spin_unlock(&afs_vlocation_updates_lock);
- _enter("%s", vlocation->vldb.name);
+ /* we can now perform the update */
+ _debug("update %s", vl->vldb.name);
+ vl->state = AFS_VL_UPDATING;
+ vl->upd_rej_cnt = 0;
+ vl->upd_busy_cnt = 0;
- afs_put_server(op->server);
- op->server = NULL;
+ ret = afs_vlocation_update_record(vl, NULL, &vldb);
+ spin_lock(&vl->lock);
+ switch (ret) {
+ case 0:
+ afs_vlocation_apply_update(vl, &vldb);
+ vl->state = AFS_VL_VALID;
+ wake_up(&vl->waitq);
+ break;
+ case -ENOMEDIUM:
+ vl->state = AFS_VL_VOLUME_DELETED;
+ break;
+ default:
+ vl->state = AFS_VL_UNCERTAIN;
+ break;
+ }
+ spin_unlock(&vl->lock);
- afs_put_vlocation(vlocation);
+ /* and then reschedule */
+ _debug("reschedule");
+ vl->update_at = get_seconds() + afs_vlocation_update_timeout;
- _leave("");
-} /* end afs_vlocation_update_discard() */
+ spin_lock(&afs_vlocation_updates_lock);
-/*****************************************************************************/
-/*
- * match a VLDB record stored in the cache
- * - may also load target from entry
- */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
- const void *entry)
-{
- const struct afs_cache_vlocation *vldb = entry;
- struct afs_vlocation *vlocation = target;
-
- _enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
-
- if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
- ) {
- if (!vlocation->valid ||
- vlocation->vldb.rtime == vldb->rtime
- ) {
- vlocation->vldb = *vldb;
- vlocation->valid = 1;
- _leave(" = SUCCESS [c->m]");
- return CACHEFS_MATCH_SUCCESS;
- }
- /* need to update cache if cached info differs */
- else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
- /* delete if VIDs for this name differ */
- if (memcmp(&vlocation->vldb.vid,
- &vldb->vid,
- sizeof(vldb->vid)) != 0) {
- _leave(" = DELETE");
- return CACHEFS_MATCH_SUCCESS_DELETE;
- }
-
- _leave(" = UPDATE");
- return CACHEFS_MATCH_SUCCESS_UPDATE;
- }
- else {
- _leave(" = SUCCESS");
- return CACHEFS_MATCH_SUCCESS;
- }
+ if (!list_empty(&afs_vlocation_updates)) {
+ /* next update in 10 minutes, but wait at least 1 second more
+ * than the newest record already queued so that we don't spam
+ * the VL server suddenly with lots of requests
+ */
+ xvl = list_entry(afs_vlocation_updates.prev,
+ struct afs_vlocation, update);
+ if (vl->update_at <= xvl->update_at)
+ vl->update_at = xvl->update_at + 1;
+ xvl = list_entry(afs_vlocation_updates.next,
+ struct afs_vlocation, update);
+ timeout = xvl->update_at - now;
+ if (timeout < 0)
+ timeout = 0;
+ } else {
+ timeout = afs_vlocation_update_timeout;
}
- _leave(" = FAILED");
- return CACHEFS_MATCH_FAILED;
-} /* end afs_vlocation_cache_match() */
-#endif
-
-/*****************************************************************************/
-/*
- * update a VLDB record stored in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vlocation_cache_update(void *source, void *entry)
-{
- struct afs_cache_vlocation *vldb = entry;
- struct afs_vlocation *vlocation = source;
+ ASSERT(list_empty(&vl->update));
- _enter("");
-
- *vldb = vlocation->vldb;
+ list_add_tail(&vl->update, &afs_vlocation_updates);
-} /* end afs_vlocation_cache_update() */
-#endif
+ _debug("timeout %ld", timeout);
+ queue_delayed_work(afs_vlocation_update_worker,
+ &afs_vlocation_update, timeout * HZ);
+ spin_unlock(&afs_vlocation_updates_lock);
+ afs_put_vlocation(vl);
+}
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index cf62da5d7825..a1904ab8426a 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -1,6 +1,6 @@
-/* vnode.c: AFS vnode management
+/* AFS vnode management
*
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
@@ -14,142 +14,237 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include "volume.h"
-#include "cell.h"
-#include "cmservice.h"
-#include "fsclient.h"
-#include "vlclient.h"
-#include "vnode.h"
#include "internal.h"
-static void afs_vnode_cb_timed_out(struct afs_timer *timer);
+#if 0
+static noinline bool dump_tree_aux(struct rb_node *node, struct rb_node *parent,
+ int depth, char lr)
+{
+ struct afs_vnode *vnode;
+ bool bad = false;
+
+ if (!node)
+ return false;
+
+ if (node->rb_left)
+ bad = dump_tree_aux(node->rb_left, node, depth + 2, '/');
+
+ vnode = rb_entry(node, struct afs_vnode, cb_promise);
+ _debug("%c %*.*s%c%p {%d}",
+ rb_is_red(node) ? 'R' : 'B',
+ depth, depth, "", lr,
+ vnode, vnode->cb_expires_at);
+ if (rb_parent(node) != parent) {
+ printk("BAD: %p != %p\n", rb_parent(node), parent);
+ bad = true;
+ }
-struct afs_timer_ops afs_vnode_cb_timed_out_ops = {
- .timed_out = afs_vnode_cb_timed_out,
-};
+ if (node->rb_right)
+ bad |= dump_tree_aux(node->rb_right, node, depth + 2, '\\');
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
- const void *entry);
-static void afs_vnode_cache_update(void *source, void *entry);
+ return bad;
+}
-struct cachefs_index_def afs_vnode_cache_index_def = {
- .name = "vnode",
- .data_size = sizeof(struct afs_cache_vnode),
- .keys[0] = { CACHEFS_INDEX_KEYS_BIN, 4 },
- .match = afs_vnode_cache_match,
- .update = afs_vnode_cache_update,
-};
+static noinline void dump_tree(const char *name, struct afs_server *server)
+{
+ _enter("%s", name);
+ if (dump_tree_aux(server->cb_promises.rb_node, NULL, 0, '-'))
+ BUG();
+}
#endif
-/*****************************************************************************/
/*
- * handle a callback timing out
- * TODO: retain a ref to vnode struct for an outstanding callback timeout
+ * insert a vnode into the backing server's vnode tree
*/
-static void afs_vnode_cb_timed_out(struct afs_timer *timer)
+static void afs_install_vnode(struct afs_vnode *vnode,
+ struct afs_server *server)
{
- struct afs_server *oldserver;
- struct afs_vnode *vnode;
+ struct afs_server *old_server = vnode->server;
+ struct afs_vnode *xvnode;
+ struct rb_node *parent, **p;
- vnode = list_entry(timer, struct afs_vnode, cb_timeout);
+ _enter("%p,%p", vnode, server);
- _enter("%p", vnode);
+ if (old_server) {
+ spin_lock(&old_server->fs_lock);
+ rb_erase(&vnode->server_rb, &old_server->fs_vnodes);
+ spin_unlock(&old_server->fs_lock);
+ }
- /* set the changed flag in the vnode and release the server */
- spin_lock(&vnode->lock);
+ afs_get_server(server);
+ vnode->server = server;
+ afs_put_server(old_server);
+
+ /* insert into the server's vnode tree in FID order */
+ spin_lock(&server->fs_lock);
+
+ parent = NULL;
+ p = &server->fs_vnodes.rb_node;
+ while (*p) {
+ parent = *p;
+ xvnode = rb_entry(parent, struct afs_vnode, server_rb);
+ if (vnode->fid.vid < xvnode->fid.vid)
+ p = &(*p)->rb_left;
+ else if (vnode->fid.vid > xvnode->fid.vid)
+ p = &(*p)->rb_right;
+ else if (vnode->fid.vnode < xvnode->fid.vnode)
+ p = &(*p)->rb_left;
+ else if (vnode->fid.vnode > xvnode->fid.vnode)
+ p = &(*p)->rb_right;
+ else if (vnode->fid.unique < xvnode->fid.unique)
+ p = &(*p)->rb_left;
+ else if (vnode->fid.unique > xvnode->fid.unique)
+ p = &(*p)->rb_right;
+ else
+ BUG(); /* can't happen unless afs_iget() malfunctions */
+ }
+
+ rb_link_node(&vnode->server_rb, parent, p);
+ rb_insert_color(&vnode->server_rb, &server->fs_vnodes);
- oldserver = xchg(&vnode->cb_server, NULL);
- if (oldserver) {
- vnode->flags |= AFS_VNODE_CHANGED;
+ spin_unlock(&server->fs_lock);
+ _leave("");
+}
- spin_lock(&afs_cb_hash_lock);
- list_del_init(&vnode->cb_hash_link);
- spin_unlock(&afs_cb_hash_lock);
+/*
+ * insert a vnode into the promising server's update/expiration tree
+ * - caller must hold vnode->lock
+ */
+static void afs_vnode_note_promise(struct afs_vnode *vnode,
+ struct afs_server *server)
+{
+ struct afs_server *old_server;
+ struct afs_vnode *xvnode;
+ struct rb_node *parent, **p;
- spin_lock(&oldserver->cb_lock);
- list_del_init(&vnode->cb_link);
- spin_unlock(&oldserver->cb_lock);
+ _enter("%p,%p", vnode, server);
+
+ ASSERT(server != NULL);
+
+ old_server = vnode->server;
+ if (vnode->cb_promised) {
+ if (server == old_server &&
+ vnode->cb_expires == vnode->cb_expires_at) {
+ _leave(" [no change]");
+ return;
+ }
+
+ spin_lock(&old_server->cb_lock);
+ if (vnode->cb_promised) {
+ _debug("delete");
+ rb_erase(&vnode->cb_promise, &old_server->cb_promises);
+ vnode->cb_promised = false;
+ }
+ spin_unlock(&old_server->cb_lock);
}
- spin_unlock(&vnode->lock);
+ if (vnode->server != server)
+ afs_install_vnode(vnode, server);
+
+ vnode->cb_expires_at = vnode->cb_expires;
+ _debug("PROMISE on %p {%lu}",
+ vnode, (unsigned long) vnode->cb_expires_at);
+
+ /* abuse an RB-tree to hold the expiration order (we may have multiple
+ * items with the same expiration time) */
+ spin_lock(&server->cb_lock);
+
+ parent = NULL;
+ p = &server->cb_promises.rb_node;
+ while (*p) {
+ parent = *p;
+ xvnode = rb_entry(parent, struct afs_vnode, cb_promise);
+ if (vnode->cb_expires_at < xvnode->cb_expires_at)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
- afs_put_server(oldserver);
+ rb_link_node(&vnode->cb_promise, parent, p);
+ rb_insert_color(&vnode->cb_promise, &server->cb_promises);
+ vnode->cb_promised = true;
+ spin_unlock(&server->cb_lock);
_leave("");
-} /* end afs_vnode_cb_timed_out() */
+}
-/*****************************************************************************/
/*
- * finish off updating the recorded status of a file
+ * handle remote file deletion by discarding the callback promise
+ */
+static void afs_vnode_deleted_remotely(struct afs_vnode *vnode)
+{
+ struct afs_server *server;
+
+ set_bit(AFS_VNODE_DELETED, &vnode->flags);
+
+ server = vnode->server;
+ if (vnode->cb_promised) {
+ spin_lock(&server->cb_lock);
+ if (vnode->cb_promised) {
+ rb_erase(&vnode->cb_promise, &server->cb_promises);
+ vnode->cb_promised = false;
+ }
+ spin_unlock(&server->cb_lock);
+ }
+
+ spin_lock(&vnode->server->fs_lock);
+ rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes);
+ spin_unlock(&vnode->server->fs_lock);
+
+ vnode->server = NULL;
+ afs_put_server(server);
+}
+
+/*
+ * finish off updating the recorded status of a file after a successful
+ * operation completion
* - starts callback expiry timer
* - adds to server's callback list
*/
-static void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
- struct afs_server *server,
- int ret)
+void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
+ struct afs_server *server)
{
struct afs_server *oldserver = NULL;
- _enter("%p,%p,%d", vnode, server, ret);
+ _enter("%p,%p", vnode, server);
spin_lock(&vnode->lock);
+ clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+ afs_vnode_note_promise(vnode, server);
+ vnode->update_cnt--;
+ ASSERTCMP(vnode->update_cnt, >=, 0);
+ spin_unlock(&vnode->lock);
+
+ wake_up_all(&vnode->update_waitq);
+ afs_put_server(oldserver);
+ _leave("");
+}
- vnode->flags &= ~AFS_VNODE_CHANGED;
+/*
+ * finish off updating the recorded status of a file after an operation failed
+ */
+static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret)
+{
+ _enter("%p,%d", vnode, ret);
- if (ret == 0) {
- /* adjust the callback timeout appropriately */
- afs_kafstimod_add_timer(&vnode->cb_timeout,
- vnode->cb_expiry * HZ);
-
- spin_lock(&afs_cb_hash_lock);
- list_move_tail(&vnode->cb_hash_link,
- &afs_cb_hash(server, &vnode->fid));
- spin_unlock(&afs_cb_hash_lock);
-
- /* swap ref to old callback server with that for new callback
- * server */
- oldserver = xchg(&vnode->cb_server, server);
- if (oldserver != server) {
- if (oldserver) {
- spin_lock(&oldserver->cb_lock);
- list_del_init(&vnode->cb_link);
- spin_unlock(&oldserver->cb_lock);
- }
+ spin_lock(&vnode->lock);
- afs_get_server(server);
- spin_lock(&server->cb_lock);
- list_add_tail(&vnode->cb_link, &server->cb_promises);
- spin_unlock(&server->cb_lock);
- }
- else {
- /* same server */
- oldserver = NULL;
- }
- }
- else if (ret == -ENOENT) {
- /* the file was deleted - clear the callback timeout */
- oldserver = xchg(&vnode->cb_server, NULL);
- afs_kafstimod_del_timer(&vnode->cb_timeout);
+ clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+ if (ret == -ENOENT) {
+ /* the file was deleted on the server */
_debug("got NOENT from server - marking file deleted");
- vnode->flags |= AFS_VNODE_DELETED;
+ afs_vnode_deleted_remotely(vnode);
}
vnode->update_cnt--;
-
+ ASSERTCMP(vnode->update_cnt, >=, 0);
spin_unlock(&vnode->lock);
wake_up_all(&vnode->update_waitq);
-
- afs_put_server(oldserver);
-
_leave("");
+}
-} /* end afs_vnode_finalise_status_update() */
-
-/*****************************************************************************/
/*
* fetch file status from the volume
* - don't issue a fetch if:
@@ -157,9 +252,11 @@ static void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
* - there are any outstanding ops that will fetch the status
* - TODO implement local caching
*/
-int afs_vnode_fetch_status(struct afs_vnode *vnode)
+int afs_vnode_fetch_status(struct afs_vnode *vnode,
+ struct afs_vnode *auth_vnode, struct key *key)
{
struct afs_server *server;
+ unsigned long acl_order;
int ret;
DECLARE_WAITQUEUE(myself, current);
@@ -168,38 +265,49 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode)
vnode->volume->vlocation->vldb.name,
vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
- if (!(vnode->flags & AFS_VNODE_CHANGED) && vnode->cb_server) {
+ if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+ vnode->cb_promised) {
_leave(" [unchanged]");
return 0;
}
- if (vnode->flags & AFS_VNODE_DELETED) {
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
_leave(" [deleted]");
return -ENOENT;
}
+ acl_order = 0;
+ if (auth_vnode)
+ acl_order = auth_vnode->acl_order;
+
spin_lock(&vnode->lock);
- if (!(vnode->flags & AFS_VNODE_CHANGED)) {
+ if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+ vnode->cb_promised) {
spin_unlock(&vnode->lock);
_leave(" [unchanged]");
return 0;
}
+ ASSERTCMP(vnode->update_cnt, >=, 0);
+
if (vnode->update_cnt > 0) {
/* someone else started a fetch */
+ _debug("wait on fetch %d", vnode->update_cnt);
+
set_current_state(TASK_UNINTERRUPTIBLE);
+ ASSERT(myself.func != NULL);
add_wait_queue(&vnode->update_waitq, &myself);
/* wait for the status to be updated */
for (;;) {
- if (!(vnode->flags & AFS_VNODE_CHANGED))
+ if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
break;
- if (vnode->flags & AFS_VNODE_DELETED)
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
break;
- /* it got updated and invalidated all before we saw
- * it */
+ /* check to see if it got updated and invalidated all
+ * before we saw it */
if (vnode->update_cnt == 0) {
remove_wait_queue(&vnode->update_waitq,
&myself);
@@ -219,10 +327,11 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode)
spin_unlock(&vnode->lock);
set_current_state(TASK_RUNNING);
- return vnode->flags & AFS_VNODE_DELETED ? -ENOENT : 0;
+ return test_bit(AFS_VNODE_DELETED, &vnode->flags) ?
+ -ENOENT : 0;
}
- get_anyway:
+get_anyway:
/* okay... we're going to have to initiate the op */
vnode->update_cnt++;
@@ -232,39 +341,60 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode)
* vnode */
do {
/* pick a server to query */
- ret = afs_volume_pick_fileserver(vnode->volume, &server);
- if (ret<0)
- return ret;
+ server = afs_volume_pick_fileserver(vnode);
+ if (IS_ERR(server))
+ goto no_server;
- _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+ _debug("USING SERVER: %p{%08x}",
+ server, ntohl(server->addr.s_addr));
- ret = afs_rxfs_fetch_file_status(server, vnode, NULL);
+ ret = afs_fs_fetch_file_status(server, key, vnode, NULL,
+ &afs_sync_call);
- } while (!afs_volume_release_fileserver(vnode->volume, server, ret));
+ } while (!afs_volume_release_fileserver(vnode, server, ret));
/* adjust the flags */
- afs_vnode_finalise_status_update(vnode, server, ret);
+ if (ret == 0) {
+ _debug("adjust");
+ if (auth_vnode)
+ afs_cache_permit(vnode, key, acl_order);
+ afs_vnode_finalise_status_update(vnode, server);
+ afs_put_server(server);
+ } else {
+ _debug("failed [%d]", ret);
+ afs_vnode_status_update_failed(vnode, ret);
+ }
- _leave(" = %d", ret);
+ ASSERTCMP(vnode->update_cnt, >=, 0);
+
+ _leave(" = %d [cnt %d]", ret, vnode->update_cnt);
return ret;
-} /* end afs_vnode_fetch_status() */
-/*****************************************************************************/
+no_server:
+ spin_lock(&vnode->lock);
+ vnode->update_cnt--;
+ ASSERTCMP(vnode->update_cnt, >=, 0);
+ spin_unlock(&vnode->lock);
+ _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+ return PTR_ERR(server);
+}
+
/*
* fetch file data from the volume
- * - TODO implement caching and server failover
+ * - TODO implement caching
*/
-int afs_vnode_fetch_data(struct afs_vnode *vnode,
- struct afs_rxfs_fetch_descriptor *desc)
+int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key,
+ off_t offset, size_t length, struct page *page)
{
struct afs_server *server;
int ret;
- _enter("%s,{%u,%u,%u}",
+ _enter("%s{%u,%u,%u},%x,,,",
vnode->volume->vlocation->vldb.name,
vnode->fid.vid,
vnode->fid.vnode,
- vnode->fid.unique);
+ vnode->fid.unique,
+ key_serial(key));
/* this op will fetch the status */
spin_lock(&vnode->lock);
@@ -275,120 +405,351 @@ int afs_vnode_fetch_data(struct afs_vnode *vnode,
* vnode */
do {
/* pick a server to query */
- ret = afs_volume_pick_fileserver(vnode->volume, &server);
- if (ret < 0)
- return ret;
+ server = afs_volume_pick_fileserver(vnode);
+ if (IS_ERR(server))
+ goto no_server;
_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
- ret = afs_rxfs_fetch_file_data(server, vnode, desc, NULL);
+ ret = afs_fs_fetch_data(server, key, vnode, offset, length,
+ page, &afs_sync_call);
- } while (!afs_volume_release_fileserver(vnode->volume, server, ret));
+ } while (!afs_volume_release_fileserver(vnode, server, ret));
/* adjust the flags */
- afs_vnode_finalise_status_update(vnode, server, ret);
+ if (ret == 0) {
+ afs_vnode_finalise_status_update(vnode, server);
+ afs_put_server(server);
+ } else {
+ afs_vnode_status_update_failed(vnode, ret);
+ }
_leave(" = %d", ret);
return ret;
-} /* end afs_vnode_fetch_data() */
+no_server:
+ spin_lock(&vnode->lock);
+ vnode->update_cnt--;
+ ASSERTCMP(vnode->update_cnt, >=, 0);
+ spin_unlock(&vnode->lock);
+ return PTR_ERR(server);
+}
-/*****************************************************************************/
/*
- * break any outstanding callback on a vnode
- * - only relevent to server that issued it
+ * make a file or a directory
*/
-int afs_vnode_give_up_callback(struct afs_vnode *vnode)
+int afs_vnode_create(struct afs_vnode *vnode, struct key *key,
+ const char *name, umode_t mode, struct afs_fid *newfid,
+ struct afs_file_status *newstatus,
+ struct afs_callback *newcb, struct afs_server **_server)
{
struct afs_server *server;
int ret;
- _enter("%s,{%u,%u,%u}",
+ _enter("%s{%u,%u,%u},%x,%s,,",
vnode->volume->vlocation->vldb.name,
vnode->fid.vid,
vnode->fid.vnode,
- vnode->fid.unique);
-
- spin_lock(&afs_cb_hash_lock);
- list_del_init(&vnode->cb_hash_link);
- spin_unlock(&afs_cb_hash_lock);
+ vnode->fid.unique,
+ key_serial(key),
+ name);
- /* set the changed flag in the vnode and release the server */
+ /* this op will fetch the status on the directory we're creating in */
spin_lock(&vnode->lock);
+ vnode->update_cnt++;
+ spin_unlock(&vnode->lock);
- afs_kafstimod_del_timer(&vnode->cb_timeout);
+ do {
+ /* pick a server to query */
+ server = afs_volume_pick_fileserver(vnode);
+ if (IS_ERR(server))
+ goto no_server;
+
+ _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
- server = xchg(&vnode->cb_server, NULL);
- if (server) {
- vnode->flags |= AFS_VNODE_CHANGED;
+ ret = afs_fs_create(server, key, vnode, name, mode, newfid,
+ newstatus, newcb, &afs_sync_call);
- spin_lock(&server->cb_lock);
- list_del_init(&vnode->cb_link);
- spin_unlock(&server->cb_lock);
+ } while (!afs_volume_release_fileserver(vnode, server, ret));
+
+ /* adjust the flags */
+ if (ret == 0) {
+ afs_vnode_finalise_status_update(vnode, server);
+ *_server = server;
+ } else {
+ afs_vnode_status_update_failed(vnode, ret);
+ *_server = NULL;
}
+ _leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+ return ret;
+
+no_server:
+ spin_lock(&vnode->lock);
+ vnode->update_cnt--;
+ ASSERTCMP(vnode->update_cnt, >=, 0);
spin_unlock(&vnode->lock);
+ _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+ return PTR_ERR(server);
+}
- ret = 0;
- if (server) {
- ret = afs_rxfs_give_up_callback(server, vnode);
+/*
+ * remove a file or directory
+ */
+int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name,
+ bool isdir)
+{
+ struct afs_server *server;
+ int ret;
+
+ _enter("%s{%u,%u,%u},%x,%s",
+ vnode->volume->vlocation->vldb.name,
+ vnode->fid.vid,
+ vnode->fid.vnode,
+ vnode->fid.unique,
+ key_serial(key),
+ name);
+
+ /* this op will fetch the status on the directory we're removing from */
+ spin_lock(&vnode->lock);
+ vnode->update_cnt++;
+ spin_unlock(&vnode->lock);
+
+ do {
+ /* pick a server to query */
+ server = afs_volume_pick_fileserver(vnode);
+ if (IS_ERR(server))
+ goto no_server;
+
+ _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+ ret = afs_fs_remove(server, key, vnode, name, isdir,
+ &afs_sync_call);
+
+ } while (!afs_volume_release_fileserver(vnode, server, ret));
+
+ /* adjust the flags */
+ if (ret == 0) {
+ afs_vnode_finalise_status_update(vnode, server);
afs_put_server(server);
+ } else {
+ afs_vnode_status_update_failed(vnode, ret);
}
- _leave(" = %d", ret);
+ _leave(" = %d [cnt %d]", ret, vnode->update_cnt);
return ret;
-} /* end afs_vnode_give_up_callback() */
-/*****************************************************************************/
+no_server:
+ spin_lock(&vnode->lock);
+ vnode->update_cnt--;
+ ASSERTCMP(vnode->update_cnt, >=, 0);
+ spin_unlock(&vnode->lock);
+ _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+ return PTR_ERR(server);
+}
+
/*
- * match a vnode record stored in the cache
+ * create a hard link
*/
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
- const void *entry)
+extern int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode,
+ struct key *key, const char *name)
{
- const struct afs_cache_vnode *cvnode = entry;
- struct afs_vnode *vnode = target;
+ struct afs_server *server;
+ int ret;
- _enter("{%x,%x,%Lx},{%x,%x,%Lx}",
+ _enter("%s{%u,%u,%u},%s{%u,%u,%u},%x,%s",
+ dvnode->volume->vlocation->vldb.name,
+ dvnode->fid.vid,
+ dvnode->fid.vnode,
+ dvnode->fid.unique,
+ vnode->volume->vlocation->vldb.name,
+ vnode->fid.vid,
vnode->fid.vnode,
vnode->fid.unique,
- vnode->status.version,
- cvnode->vnode_id,
- cvnode->vnode_unique,
- cvnode->data_version);
-
- if (vnode->fid.vnode != cvnode->vnode_id) {
- _leave(" = FAILED");
- return CACHEFS_MATCH_FAILED;
+ key_serial(key),
+ name);
+
+ /* this op will fetch the status on the directory we're removing from */
+ spin_lock(&vnode->lock);
+ vnode->update_cnt++;
+ spin_unlock(&vnode->lock);
+ spin_lock(&dvnode->lock);
+ dvnode->update_cnt++;
+ spin_unlock(&dvnode->lock);
+
+ do {
+ /* pick a server to query */
+ server = afs_volume_pick_fileserver(dvnode);
+ if (IS_ERR(server))
+ goto no_server;
+
+ _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+ ret = afs_fs_link(server, key, dvnode, vnode, name,
+ &afs_sync_call);
+
+ } while (!afs_volume_release_fileserver(dvnode, server, ret));
+
+ /* adjust the flags */
+ if (ret == 0) {
+ afs_vnode_finalise_status_update(vnode, server);
+ afs_vnode_finalise_status_update(dvnode, server);
+ afs_put_server(server);
+ } else {
+ afs_vnode_status_update_failed(vnode, ret);
+ afs_vnode_status_update_failed(dvnode, ret);
}
- if (vnode->fid.unique != cvnode->vnode_unique ||
- vnode->status.version != cvnode->data_version) {
- _leave(" = DELETE");
- return CACHEFS_MATCH_SUCCESS_DELETE;
+ _leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+ return ret;
+
+no_server:
+ spin_lock(&vnode->lock);
+ vnode->update_cnt--;
+ ASSERTCMP(vnode->update_cnt, >=, 0);
+ spin_unlock(&vnode->lock);
+ spin_lock(&dvnode->lock);
+ dvnode->update_cnt--;
+ ASSERTCMP(dvnode->update_cnt, >=, 0);
+ spin_unlock(&dvnode->lock);
+ _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+ return PTR_ERR(server);
+}
+
+/*
+ * create a symbolic link
+ */
+int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key,
+ const char *name, const char *content,
+ struct afs_fid *newfid,
+ struct afs_file_status *newstatus,
+ struct afs_server **_server)
+{
+ struct afs_server *server;
+ int ret;
+
+ _enter("%s{%u,%u,%u},%x,%s,%s,,,",
+ vnode->volume->vlocation->vldb.name,
+ vnode->fid.vid,
+ vnode->fid.vnode,
+ vnode->fid.unique,
+ key_serial(key),
+ name, content);
+
+ /* this op will fetch the status on the directory we're creating in */
+ spin_lock(&vnode->lock);
+ vnode->update_cnt++;
+ spin_unlock(&vnode->lock);
+
+ do {
+ /* pick a server to query */
+ server = afs_volume_pick_fileserver(vnode);
+ if (IS_ERR(server))
+ goto no_server;
+
+ _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+ ret = afs_fs_symlink(server, key, vnode, name, content,
+ newfid, newstatus, &afs_sync_call);
+
+ } while (!afs_volume_release_fileserver(vnode, server, ret));
+
+ /* adjust the flags */
+ if (ret == 0) {
+ afs_vnode_finalise_status_update(vnode, server);
+ *_server = server;
+ } else {
+ afs_vnode_status_update_failed(vnode, ret);
+ *_server = NULL;
}
- _leave(" = SUCCESS");
- return CACHEFS_MATCH_SUCCESS;
-} /* end afs_vnode_cache_match() */
-#endif
+ _leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+ return ret;
+
+no_server:
+ spin_lock(&vnode->lock);
+ vnode->update_cnt--;
+ ASSERTCMP(vnode->update_cnt, >=, 0);
+ spin_unlock(&vnode->lock);
+ _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+ return PTR_ERR(server);
+}
-/*****************************************************************************/
/*
- * update a vnode record stored in the cache
+ * rename a file
*/
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vnode_cache_update(void *source, void *entry)
+int afs_vnode_rename(struct afs_vnode *orig_dvnode,
+ struct afs_vnode *new_dvnode,
+ struct key *key,
+ const char *orig_name,
+ const char *new_name)
{
- struct afs_cache_vnode *cvnode = entry;
- struct afs_vnode *vnode = source;
+ struct afs_server *server;
+ int ret;
- _enter("");
+ _enter("%s{%u,%u,%u},%s{%u,%u,%u},%x,%s,%s",
+ orig_dvnode->volume->vlocation->vldb.name,
+ orig_dvnode->fid.vid,
+ orig_dvnode->fid.vnode,
+ orig_dvnode->fid.unique,
+ new_dvnode->volume->vlocation->vldb.name,
+ new_dvnode->fid.vid,
+ new_dvnode->fid.vnode,
+ new_dvnode->fid.unique,
+ key_serial(key),
+ orig_name,
+ new_name);
+
+ /* this op will fetch the status on both the directories we're dealing
+ * with */
+ spin_lock(&orig_dvnode->lock);
+ orig_dvnode->update_cnt++;
+ spin_unlock(&orig_dvnode->lock);
+ if (new_dvnode != orig_dvnode) {
+ spin_lock(&new_dvnode->lock);
+ new_dvnode->update_cnt++;
+ spin_unlock(&new_dvnode->lock);
+ }
- cvnode->vnode_id = vnode->fid.vnode;
- cvnode->vnode_unique = vnode->fid.unique;
- cvnode->data_version = vnode->status.version;
+ do {
+ /* pick a server to query */
+ server = afs_volume_pick_fileserver(orig_dvnode);
+ if (IS_ERR(server))
+ goto no_server;
-} /* end afs_vnode_cache_update() */
-#endif
+ _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+ ret = afs_fs_rename(server, key, orig_dvnode, orig_name,
+ new_dvnode, new_name, &afs_sync_call);
+
+ } while (!afs_volume_release_fileserver(orig_dvnode, server, ret));
+
+ /* adjust the flags */
+ if (ret == 0) {
+ afs_vnode_finalise_status_update(orig_dvnode, server);
+ if (new_dvnode != orig_dvnode)
+ afs_vnode_finalise_status_update(new_dvnode, server);
+ afs_put_server(server);
+ } else {
+ afs_vnode_status_update_failed(orig_dvnode, ret);
+ if (new_dvnode != orig_dvnode)
+ afs_vnode_status_update_failed(new_dvnode, ret);
+ }
+
+ _leave(" = %d [cnt %d]", ret, orig_dvnode->update_cnt);
+ return ret;
+
+no_server:
+ spin_lock(&orig_dvnode->lock);
+ orig_dvnode->update_cnt--;
+ ASSERTCMP(orig_dvnode->update_cnt, >=, 0);
+ spin_unlock(&orig_dvnode->lock);
+ if (new_dvnode != orig_dvnode) {
+ spin_lock(&new_dvnode->lock);
+ new_dvnode->update_cnt--;
+ ASSERTCMP(new_dvnode->update_cnt, >=, 0);
+ spin_unlock(&new_dvnode->lock);
+ }
+ _leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt);
+ return PTR_ERR(server);
+}
diff --git a/fs/afs/vnode.h b/fs/afs/vnode.h
deleted file mode 100644
index b86a97102e8b..000000000000
--- a/fs/afs/vnode.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* vnode.h: AFS vnode record
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_VNODE_H
-#define _LINUX_AFS_VNODE_H
-
-#include <linux/fs.h>
-#include "server.h"
-#include "kafstimod.h"
-#include "cache.h"
-
-#ifdef __KERNEL__
-
-struct afs_rxfs_fetch_descriptor;
-
-/*****************************************************************************/
-/*
- * vnode catalogue entry
- */
-struct afs_cache_vnode
-{
- afs_vnodeid_t vnode_id; /* vnode ID */
- unsigned vnode_unique; /* vnode ID uniquifier */
- afs_dataversion_t data_version; /* data version */
-};
-
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vnode_cache_index_def;
-#endif
-
-/*****************************************************************************/
-/*
- * AFS inode private data
- */
-struct afs_vnode
-{
- struct inode vfs_inode; /* the VFS's inode record */
-
- struct afs_volume *volume; /* volume on which vnode resides */
- struct afs_fid fid; /* the file identifier for this inode */
- struct afs_file_status status; /* AFS status info for this file */
-#ifdef AFS_CACHING_SUPPORT
- struct cachefs_cookie *cache; /* caching cookie */
-#endif
-
- wait_queue_head_t update_waitq; /* status fetch waitqueue */
- unsigned update_cnt; /* number of outstanding ops that will update the
- * status */
- spinlock_t lock; /* waitqueue/flags lock */
- unsigned flags;
-#define AFS_VNODE_CHANGED 0x00000001 /* set if vnode reported changed by callback */
-#define AFS_VNODE_DELETED 0x00000002 /* set if vnode deleted on server */
-#define AFS_VNODE_MOUNTPOINT 0x00000004 /* set if vnode is a mountpoint symlink */
-
- /* outstanding callback notification on this file */
- struct afs_server *cb_server; /* server that made the current promise */
- struct list_head cb_link; /* link in server's promises list */
- struct list_head cb_hash_link; /* link in master callback hash */
- struct afs_timer cb_timeout; /* timeout on promise */
- unsigned cb_version; /* callback version */
- unsigned cb_expiry; /* callback expiry time */
- afs_callback_type_t cb_type; /* type of callback */
-};
-
-static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
-{
- return container_of(inode,struct afs_vnode,vfs_inode);
-}
-
-static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
-{
- return &vnode->vfs_inode;
-}
-
-extern int afs_vnode_fetch_status(struct afs_vnode *vnode);
-
-extern int afs_vnode_fetch_data(struct afs_vnode *vnode,
- struct afs_rxfs_fetch_descriptor *desc);
-
-extern int afs_vnode_give_up_callback(struct afs_vnode *vnode);
-
-extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_AFS_VNODE_H */
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 768c6dbd323a..dd160cada45d 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -1,6 +1,6 @@
-/* volume.c: AFS volume management
+/* AFS volume management
*
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
@@ -15,35 +15,10 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
-#include "volume.h"
-#include "vnode.h"
-#include "cell.h"
-#include "cache.h"
-#include "cmservice.h"
-#include "fsclient.h"
-#include "vlclient.h"
#include "internal.h"
-#ifdef __KDEBUG
static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" };
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
- const void *entry);
-static void afs_volume_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_volume_cache_index_def = {
- .name = "volume",
- .data_size = sizeof(struct afs_cache_vhash),
- .keys[0] = { CACHEFS_INDEX_KEYS_BIN, 1 },
- .keys[1] = { CACHEFS_INDEX_KEYS_BIN, 1 },
- .match = afs_volume_cache_match,
- .update = afs_volume_cache_update,
-};
-#endif
-/*****************************************************************************/
/*
* lookup a volume by name
* - this can be one of the following:
@@ -66,118 +41,52 @@ struct cachefs_index_def afs_volume_cache_index_def = {
* - Rule 3: If parent volume is R/W, then only mount R/W volume unless
* explicitly told otherwise
*/
-int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
- struct afs_volume **_volume)
+struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
{
struct afs_vlocation *vlocation = NULL;
struct afs_volume *volume = NULL;
- afs_voltype_t type;
- const char *cellname, *volname, *suffix;
+ struct afs_server *server = NULL;
char srvtmask;
- int force, ret, loop, cellnamesz, volnamesz;
-
- _enter("%s,,%d,", name, rwpath);
-
- if (!name || (name[0] != '%' && name[0] != '#') || !name[1]) {
- printk("kAFS: unparsable volume name\n");
- return -EINVAL;
- }
-
- /* determine the type of volume we're looking for */
- force = 0;
- type = AFSVL_ROVOL;
-
- if (rwpath || name[0] == '%') {
- type = AFSVL_RWVOL;
- force = 1;
- }
-
- suffix = strrchr(name, '.');
- if (suffix) {
- if (strcmp(suffix, ".readonly") == 0) {
- type = AFSVL_ROVOL;
- force = 1;
- }
- else if (strcmp(suffix, ".backup") == 0) {
- type = AFSVL_BACKVOL;
- force = 1;
- }
- else if (suffix[1] == 0) {
- }
- else {
- suffix = NULL;
- }
- }
+ int ret, loop;
- /* split the cell and volume names */
- name++;
- volname = strchr(name, ':');
- if (volname) {
- cellname = name;
- cellnamesz = volname - name;
- volname++;
- }
- else {
- volname = name;
- cellname = NULL;
- cellnamesz = 0;
- }
-
- volnamesz = suffix ? suffix - volname : strlen(volname);
-
- _debug("CELL:%*.*s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
- cellnamesz, cellnamesz, cellname ?: "", cell,
- volnamesz, volnamesz, volname, suffix ?: "-",
- type,
- force ? " FORCE" : "");
-
- /* lookup the cell record */
- if (cellname || !cell) {
- ret = afs_cell_lookup(cellname, cellnamesz, &cell);
- if (ret<0) {
- printk("kAFS: unable to lookup cell '%s'\n",
- cellname ?: "");
- goto error;
- }
- }
- else {
- afs_get_cell(cell);
- }
+ _enter("{%*.*s,%d}",
+ params->volnamesz, params->volnamesz, params->volname, params->rwpath);
/* lookup the volume location record */
- ret = afs_vlocation_lookup(cell, volname, volnamesz, &vlocation);
- if (ret < 0)
+ vlocation = afs_vlocation_lookup(params->cell, params->key,
+ params->volname, params->volnamesz);
+ if (IS_ERR(vlocation)) {
+ ret = PTR_ERR(vlocation);
+ vlocation = NULL;
goto error;
+ }
/* make the final decision on the type we want */
ret = -ENOMEDIUM;
- if (force && !(vlocation->vldb.vidmask & (1 << type)))
+ if (params->force && !(vlocation->vldb.vidmask & (1 << params->type)))
goto error;
srvtmask = 0;
for (loop = 0; loop < vlocation->vldb.nservers; loop++)
srvtmask |= vlocation->vldb.srvtmask[loop];
- if (force) {
- if (!(srvtmask & (1 << type)))
+ if (params->force) {
+ if (!(srvtmask & (1 << params->type)))
goto error;
- }
- else if (srvtmask & AFS_VOL_VTM_RO) {
- type = AFSVL_ROVOL;
- }
- else if (srvtmask & AFS_VOL_VTM_RW) {
- type = AFSVL_RWVOL;
- }
- else {
+ } else if (srvtmask & AFS_VOL_VTM_RO) {
+ params->type = AFSVL_ROVOL;
+ } else if (srvtmask & AFS_VOL_VTM_RW) {
+ params->type = AFSVL_RWVOL;
+ } else {
goto error;
}
- down_write(&cell->vl_sem);
+ down_write(&params->cell->vl_sem);
/* is the volume already active? */
- if (vlocation->vols[type]) {
+ if (vlocation->vols[params->type]) {
/* yes - re-use it */
- volume = vlocation->vols[type];
+ volume = vlocation->vols[params->type];
afs_get_volume(volume);
goto success;
}
@@ -191,23 +100,24 @@ int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
goto error_up;
atomic_set(&volume->usage, 1);
- volume->type = type;
- volume->type_force = force;
- volume->cell = cell;
- volume->vid = vlocation->vldb.vid[type];
+ volume->type = params->type;
+ volume->type_force = params->force;
+ volume->cell = params->cell;
+ volume->vid = vlocation->vldb.vid[params->type];
init_rwsem(&volume->server_sem);
/* look up all the applicable server records */
for (loop = 0; loop < 8; loop++) {
if (vlocation->vldb.srvtmask[loop] & (1 << volume->type)) {
- ret = afs_server_lookup(
- volume->cell,
- &vlocation->vldb.servers[loop],
- &volume->servers[volume->nservers]);
- if (ret < 0)
+ server = afs_lookup_server(
+ volume->cell, &vlocation->vldb.servers[loop]);
+ if (IS_ERR(server)) {
+ ret = PTR_ERR(server);
goto error_discard;
+ }
+ volume->servers[volume->nservers] = server;
volume->nservers++;
}
}
@@ -223,35 +133,34 @@ int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
afs_get_vlocation(vlocation);
volume->vlocation = vlocation;
- vlocation->vols[type] = volume;
+ vlocation->vols[volume->type] = volume;
- success:
+success:
_debug("kAFS selected %s volume %08x",
afs_voltypes[volume->type], volume->vid);
- *_volume = volume;
- ret = 0;
+ up_write(&params->cell->vl_sem);
+ afs_put_vlocation(vlocation);
+ _leave(" = %p", volume);
+ return volume;
/* clean up */
- error_up:
- up_write(&cell->vl_sem);
- error:
+error_up:
+ up_write(&params->cell->vl_sem);
+error:
afs_put_vlocation(vlocation);
- afs_put_cell(cell);
-
- _leave(" = %d (%p)", ret, volume);
- return ret;
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
- error_discard:
- up_write(&cell->vl_sem);
+error_discard:
+ up_write(&params->cell->vl_sem);
for (loop = volume->nservers - 1; loop >= 0; loop--)
afs_put_server(volume->servers[loop]);
kfree(volume);
goto error;
-} /* end afs_volume_lookup() */
+}
-/*****************************************************************************/
/*
* destroy a volume record
*/
@@ -265,10 +174,9 @@ void afs_put_volume(struct afs_volume *volume)
_enter("%p", volume);
- vlocation = volume->vlocation;
+ ASSERTCMP(atomic_read(&volume->usage), >, 0);
- /* sanity check */
- BUG_ON(atomic_read(&volume->usage) <= 0);
+ vlocation = volume->vlocation;
/* to prevent a race, the decrement and the dequeue must be effectively
* atomic */
@@ -296,21 +204,27 @@ void afs_put_volume(struct afs_volume *volume)
kfree(volume);
_leave(" [destroyed]");
-} /* end afs_put_volume() */
+}
-/*****************************************************************************/
/*
* pick a server to use to try accessing this volume
* - returns with an elevated usage count on the server chosen
*/
-int afs_volume_pick_fileserver(struct afs_volume *volume,
- struct afs_server **_server)
+struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode)
{
+ struct afs_volume *volume = vnode->volume;
struct afs_server *server;
int ret, state, loop;
_enter("%s", volume->vlocation->vldb.name);
+ /* stick with the server we're already using if we can */
+ if (vnode->server && vnode->server->fs_state == 0) {
+ afs_get_server(vnode->server);
+ _leave(" = %p [current]", vnode->server);
+ return vnode->server;
+ }
+
down_read(&volume->server_sem);
/* handle the no-server case */
@@ -318,7 +232,7 @@ int afs_volume_pick_fileserver(struct afs_volume *volume,
ret = volume->rjservers ? -ENOMEDIUM : -ESTALE;
up_read(&volume->server_sem);
_leave(" = %d [no servers]", ret);
- return ret;
+ return ERR_PTR(ret);
}
/* basically, just search the list for the first live server and use
@@ -328,15 +242,16 @@ int afs_volume_pick_fileserver(struct afs_volume *volume,
server = volume->servers[loop];
state = server->fs_state;
+ _debug("consider %d [%d]", loop, state);
+
switch (state) {
/* found an apparently healthy server */
case 0:
afs_get_server(server);
up_read(&volume->server_sem);
- *_server = server;
- _leave(" = 0 (picked %08x)",
- ntohl(server->addr.s_addr));
- return 0;
+ _leave(" = %p (picked %08x)",
+ server, ntohl(server->addr.s_addr));
+ return server;
case -ENETUNREACH:
if (ret == 0)
@@ -372,20 +287,21 @@ int afs_volume_pick_fileserver(struct afs_volume *volume,
*/
up_read(&volume->server_sem);
_leave(" = %d", ret);
- return ret;
-} /* end afs_volume_pick_fileserver() */
+ return ERR_PTR(ret);
+}
-/*****************************************************************************/
/*
* release a server after use
* - releases the ref on the server struct that was acquired by picking
* - records result of using a particular server to access a volume
* - return 0 to try again, 1 if okay or to issue error
+ * - the caller must release the server struct if result was 0
*/
-int afs_volume_release_fileserver(struct afs_volume *volume,
+int afs_volume_release_fileserver(struct afs_vnode *vnode,
struct afs_server *server,
int result)
{
+ struct afs_volume *volume = vnode->volume;
unsigned loop;
_enter("%s,%08x,%d",
@@ -396,14 +312,16 @@ int afs_volume_release_fileserver(struct afs_volume *volume,
/* success */
case 0:
server->fs_act_jif = jiffies;
- break;
+ server->fs_state = 0;
+ _leave("");
+ return 1;
/* the fileserver denied all knowledge of the volume */
case -ENOMEDIUM:
server->fs_act_jif = jiffies;
down_write(&volume->server_sem);
- /* first, find where the server is in the active list (if it
+ /* firstly, find where the server is in the active list (if it
* is) */
for (loop = 0; loop < volume->nservers; loop++)
if (volume->servers[loop] == server)
@@ -441,6 +359,7 @@ int afs_volume_release_fileserver(struct afs_volume *volume,
case -ENETUNREACH:
case -EHOSTUNREACH:
case -ECONNREFUSED:
+ case -ETIME:
case -ETIMEDOUT:
case -EREMOTEIO:
/* mark the server as dead
@@ -460,60 +379,17 @@ int afs_volume_release_fileserver(struct afs_volume *volume,
server->fs_act_jif = jiffies;
case -ENOMEM:
case -ENONET:
- break;
+ /* tell the caller to accept the result */
+ afs_put_server(server);
+ _leave(" [local failure]");
+ return 1;
}
- /* tell the caller to accept the result */
- afs_put_server(server);
- _leave("");
- return 1;
-
/* tell the caller to loop around and try the next server */
- try_next_server_upw:
+try_next_server_upw:
up_write(&volume->server_sem);
- try_next_server:
+try_next_server:
afs_put_server(server);
_leave(" [try next server]");
return 0;
-
-} /* end afs_volume_release_fileserver() */
-
-/*****************************************************************************/
-/*
- * match a volume hash record stored in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
- const void *entry)
-{
- const struct afs_cache_vhash *vhash = entry;
- struct afs_volume *volume = target;
-
- _enter("{%u},{%u}", volume->type, vhash->vtype);
-
- if (volume->type == vhash->vtype) {
- _leave(" = SUCCESS");
- return CACHEFS_MATCH_SUCCESS;
- }
-
- _leave(" = FAILED");
- return CACHEFS_MATCH_FAILED;
-} /* end afs_volume_cache_match() */
-#endif
-
-/*****************************************************************************/
-/*
- * update a volume hash record stored in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_volume_cache_update(void *source, void *entry)
-{
- struct afs_cache_vhash *vhash = entry;
- struct afs_volume *volume = source;
-
- _enter("");
-
- vhash->vtype = volume->type;
-
-} /* end afs_volume_cache_update() */
-#endif
+}
diff --git a/fs/afs/volume.h b/fs/afs/volume.h
deleted file mode 100644
index bfdcf19ba3f3..000000000000
--- a/fs/afs/volume.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/* volume.h: AFS volume management
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_VOLUME_H
-#define _LINUX_AFS_VOLUME_H
-
-#include "types.h"
-#include "fsclient.h"
-#include "kafstimod.h"
-#include "kafsasyncd.h"
-#include "cache.h"
-
-typedef enum {
- AFS_VLUPD_SLEEP, /* sleeping waiting for update timer to fire */
- AFS_VLUPD_PENDING, /* on pending queue */
- AFS_VLUPD_INPROGRESS, /* op in progress */
- AFS_VLUPD_BUSYSLEEP, /* sleeping because server returned EBUSY */
-
-} __attribute__((packed)) afs_vlocation_upd_t;
-
-/*****************************************************************************/
-/*
- * entry in the cached volume location catalogue
- */
-struct afs_cache_vlocation
-{
- uint8_t name[64]; /* volume name (lowercase, padded with NULs) */
- uint8_t nservers; /* number of entries used in servers[] */
- uint8_t vidmask; /* voltype mask for vid[] */
- uint8_t srvtmask[8]; /* voltype masks for servers[] */
-#define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */
-#define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */
-#define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */
-
- afs_volid_t vid[3]; /* volume IDs for R/W, R/O and Bak volumes */
- struct in_addr servers[8]; /* fileserver addresses */
- time_t rtime; /* last retrieval time */
-};
-
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vlocation_cache_index_def;
-#endif
-
-/*****************************************************************************/
-/*
- * volume -> vnode hash table entry
- */
-struct afs_cache_vhash
-{
- afs_voltype_t vtype; /* which volume variation */
- uint8_t hash_bucket; /* which hash bucket this represents */
-} __attribute__((packed));
-
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_volume_cache_index_def;
-#endif
-
-/*****************************************************************************/
-/*
- * AFS volume location record
- */
-struct afs_vlocation
-{
- atomic_t usage;
- struct list_head link; /* link in cell volume location list */
- struct afs_timer timeout; /* decaching timer */
- struct afs_cell *cell; /* cell to which volume belongs */
-#ifdef AFS_CACHING_SUPPORT
- struct cachefs_cookie *cache; /* caching cookie */
-#endif
- struct afs_cache_vlocation vldb; /* volume information DB record */
- struct afs_volume *vols[3]; /* volume access record pointer (index by type) */
- rwlock_t lock; /* access lock */
- unsigned long read_jif; /* time at which last read from vlserver */
- struct afs_timer upd_timer; /* update timer */
- struct afs_async_op upd_op; /* update operation */
- afs_vlocation_upd_t upd_state; /* update state */
- unsigned short upd_first_svix; /* first server index during update */
- unsigned short upd_curr_svix; /* current server index during update */
- unsigned short upd_rej_cnt; /* ENOMEDIUM count during update */
- unsigned short upd_busy_cnt; /* EBUSY count during update */
- unsigned short valid; /* T if valid */
-};
-
-extern int afs_vlocation_lookup(struct afs_cell *cell,
- const char *name,
- unsigned namesz,
- struct afs_vlocation **_vlocation);
-
-#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0)
-
-extern void afs_put_vlocation(struct afs_vlocation *vlocation);
-extern void afs_vlocation_do_timeout(struct afs_vlocation *vlocation);
-
-/*****************************************************************************/
-/*
- * AFS volume access record
- */
-struct afs_volume
-{
- atomic_t usage;
- struct afs_cell *cell; /* cell to which belongs (unrefd ptr) */
- struct afs_vlocation *vlocation; /* volume location */
-#ifdef AFS_CACHING_SUPPORT
- struct cachefs_cookie *cache; /* caching cookie */
-#endif
- afs_volid_t vid; /* volume ID */
- afs_voltype_t type; /* type of volume */
- char type_force; /* force volume type (suppress R/O -> R/W) */
- unsigned short nservers; /* number of server slots filled */
- unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */
- struct afs_server *servers[8]; /* servers on which volume resides (ordered) */
- struct rw_semaphore server_sem; /* lock for accessing current server */
-};
-
-extern int afs_volume_lookup(const char *name,
- struct afs_cell *cell,
- int rwpath,
- struct afs_volume **_volume);
-
-#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
-
-extern void afs_put_volume(struct afs_volume *volume);
-
-extern int afs_volume_pick_fileserver(struct afs_volume *volume,
- struct afs_server **_server);
-
-extern int afs_volume_release_fileserver(struct afs_volume *volume,
- struct afs_server *server,
- int result);
-
-#endif /* _LINUX_AFS_VOLUME_H */
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 8b1c5d8bf4ef..c68b055fa26e 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -266,6 +266,23 @@ static int do_siocgstamp(unsigned int fd, unsigned int cmd, unsigned long arg)
return err;
}
+static int do_siocgstampns(unsigned int fd, unsigned int cmd, unsigned long arg)
+{
+ struct compat_timespec __user *up = compat_ptr(arg);
+ struct timespec kts;
+ mm_segment_t old_fs = get_fs();
+ int err;
+
+ set_fs(KERNEL_DS);
+ err = sys_ioctl(fd, cmd, (unsigned long)&kts);
+ set_fs(old_fs);
+ if (!err) {
+ err = put_user(kts.tv_sec, &up->tv_sec);
+ err |= __put_user(kts.tv_nsec, &up->tv_nsec);
+ }
+ return err;
+}
+
struct ifmap32 {
compat_ulong_t mem_start;
compat_ulong_t mem_end;
@@ -2437,6 +2454,7 @@ HANDLE_IOCTL(SIOCBRDELIF, dev_ifsioc)
/* Note SIOCRTMSG is no longer, so this is safe and * the user would have seen just an -EINVAL anyways. */
HANDLE_IOCTL(SIOCRTMSG, ret_einval)
HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
+HANDLE_IOCTL(SIOCGSTAMPNS, do_siocgstampns)
#endif
#ifdef CONFIG_BLOCK
HANDLE_IOCTL(HDIO_GETGEO, hdio_getgeo)
diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c
index e3aa2253c850..fe9186312d7c 100644
--- a/fs/ecryptfs/netlink.c
+++ b/fs/ecryptfs/netlink.c
@@ -97,7 +97,7 @@ out:
*/
static int ecryptfs_process_nl_response(struct sk_buff *skb)
{
- struct nlmsghdr *nlh = (struct nlmsghdr*)skb->data;
+ struct nlmsghdr *nlh = nlmsg_hdr(skb);
struct ecryptfs_message *msg = NLMSG_DATA(nlh);
int rc;
@@ -181,7 +181,7 @@ receive:
"rc = [%d]\n", rc);
return;
}
- nlh = (struct nlmsghdr *)skb->data;
+ nlh = nlmsg_hdr(skb);
if (!NLMSG_OK(nlh, skb->len)) {
ecryptfs_printk(KERN_ERR, "Received corrupt netlink "
"message\n");
@@ -229,7 +229,7 @@ int ecryptfs_init_netlink(void)
ecryptfs_nl_sock = netlink_kernel_create(NETLINK_ECRYPTFS, 0,
ecryptfs_receive_nl_message,
- THIS_MODULE);
+ NULL, THIS_MODULE);
if (!ecryptfs_nl_sock) {
rc = -EIO;
ecryptfs_printk(KERN_ERR, "Failed to create netlink socket\n");
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 23029f42ae8c..1d3b7a9fc828 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -670,6 +670,13 @@ static int jffs2_flash_setup(struct jffs2_sb_info *c) {
return ret;
}
+ /* and an UBI volume */
+ if (jffs2_ubivol(c)) {
+ ret = jffs2_ubivol_setup(c);
+ if (ret)
+ return ret;
+ }
+
return ret;
}
@@ -688,4 +695,9 @@ void jffs2_flash_cleanup(struct jffs2_sb_info *c) {
if (jffs2_nor_wbuf_flash(c)) {
jffs2_nor_wbuf_flash_cleanup(c);
}
+
+ /* and an UBI volume */
+ if (jffs2_ubivol(c)) {
+ jffs2_ubivol_cleanup(c);
+ }
}
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 2379c7e88735..80daea96bbc2 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -96,6 +96,9 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
#define jffs2_nor_wbuf_flash(c) (0)
#define jffs2_nor_wbuf_flash_setup(c) (0)
#define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0)
+#define jffs2_ubivol(c) (0)
+#define jffs2_ubivol_setup(c) (0)
+#define jffs2_ubivol_cleanup(c) do {} while (0)
#else /* NAND and/or ECC'd NOR support present */
@@ -131,6 +134,9 @@ void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c);
#define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH)
int jffs2_dataflash_setup(struct jffs2_sb_info *c);
void jffs2_dataflash_cleanup(struct jffs2_sb_info *c);
+#define jffs2_ubivol(c) (c->mtd->type == MTD_UBIVOLUME)
+int jffs2_ubivol_setup(struct jffs2_sb_info *c);
+void jffs2_ubivol_cleanup(struct jffs2_sb_info *c);
#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE))
int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c);
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index dafcd4102401..c556e85a565c 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1209,3 +1209,27 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) {
kfree(c->wbuf);
}
+
+int jffs2_ubivol_setup(struct jffs2_sb_info *c) {
+ c->cleanmarker_size = 0;
+
+ if (c->mtd->writesize == 1)
+ /* We do not need write-buffer */
+ return 0;
+
+ init_rwsem(&c->wbuf_sem);
+
+ c->wbuf_pagesize = c->mtd->writesize;
+ c->wbuf_ofs = 0xFFFFFFFF;
+ c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
+ if (!c->wbuf)
+ return -ENOMEM;
+
+ printk(KERN_INFO "JFFS2 write-buffering enabled buffer (%d) erasesize (%d)\n", c->wbuf_pagesize, c->sector_size);
+
+ return 0;
+}
+
+void jffs2_ubivol_cleanup(struct jffs2_sb_info *c) {
+ kfree(c->wbuf);
+}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f27e5378caf2..a0c8667caa72 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -27,6 +27,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
+#include <linux/swap.h>
#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
@@ -34,6 +35,7 @@
#include "ocfs2.h"
#include "alloc.h"
+#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
@@ -47,63 +49,243 @@
#include "buffer_head_io.h"
-static int ocfs2_extent_contig(struct inode *inode,
- struct ocfs2_extent_rec *ext,
- u64 blkno);
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
-static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
- handle_t *handle,
- struct inode *inode,
- int wanted,
- struct ocfs2_alloc_context *meta_ac,
- struct buffer_head *bhs[]);
+/*
+ * Structures which describe a path through a btree, and functions to
+ * manipulate them.
+ *
+ * The idea here is to be as generic as possible with the tree
+ * manipulation code.
+ */
+struct ocfs2_path_item {
+ struct buffer_head *bh;
+ struct ocfs2_extent_list *el;
+};
-static int ocfs2_add_branch(struct ocfs2_super *osb,
- handle_t *handle,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct buffer_head *eb_bh,
- struct buffer_head *last_eb_bh,
- struct ocfs2_alloc_context *meta_ac);
+#define OCFS2_MAX_PATH_DEPTH 5
-static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
- handle_t *handle,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_alloc_context *meta_ac,
- struct buffer_head **ret_new_eb_bh);
+struct ocfs2_path {
+ int p_tree_depth;
+ struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
+};
-static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
- handle_t *handle,
- struct inode *inode,
- struct buffer_head *fe_bh,
- u64 blkno,
- u32 new_clusters);
+#define path_root_bh(_path) ((_path)->p_node[0].bh)
+#define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
+#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
+#define path_num_items(_path) ((_path)->p_tree_depth + 1)
-static int ocfs2_find_branch_target(struct ocfs2_super *osb,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct buffer_head **target_bh);
+/*
+ * Reset the actual path elements so that we can re-use the structure
+ * to build another path. Generally, this involves freeing the buffer
+ * heads.
+ */
+static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
+{
+ int i, start = 0, depth = 0;
+ struct ocfs2_path_item *node;
-static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
- struct inode *inode,
- struct ocfs2_dinode *fe,
- unsigned int new_i_clusters,
- struct buffer_head *old_last_eb,
- struct buffer_head **new_last_eb);
+ if (keep_root)
+ start = 1;
+
+ for(i = start; i < path_num_items(path); i++) {
+ node = &path->p_node[i];
+
+ brelse(node->bh);
+ node->bh = NULL;
+ node->el = NULL;
+ }
+
+ /*
+ * Tree depth may change during truncate, or insert. If we're
+ * keeping the root extent list, then make sure that our path
+ * structure reflects the proper depth.
+ */
+ if (keep_root)
+ depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
+
+ path->p_tree_depth = depth;
+}
+
+static void ocfs2_free_path(struct ocfs2_path *path)
+{
+ if (path) {
+ ocfs2_reinit_path(path, 0);
+ kfree(path);
+ }
+}
+
+/*
+ * Make the *dest path the same as src and re-initialize src path to
+ * have a root only.
+ */
+static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
+{
+ int i;
+
+ BUG_ON(path_root_bh(dest) != path_root_bh(src));
+
+ for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
+ brelse(dest->p_node[i].bh);
+
+ dest->p_node[i].bh = src->p_node[i].bh;
+ dest->p_node[i].el = src->p_node[i].el;
+
+ src->p_node[i].bh = NULL;
+ src->p_node[i].el = NULL;
+ }
+}
+
+/*
+ * Insert an extent block at given index.
+ *
+ * This will not take an additional reference on eb_bh.
+ */
+static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
+ struct buffer_head *eb_bh)
+{
+ struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+
+ /*
+ * Right now, no root bh is an extent block, so this helps
+ * catch code errors with dinode trees. The assertion can be
+ * safely removed if we ever need to insert extent block
+ * structures at the root.
+ */
+ BUG_ON(index == 0);
+
+ path->p_node[index].bh = eb_bh;
+ path->p_node[index].el = &eb->h_list;
+}
+
+static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
+ struct ocfs2_extent_list *root_el)
+{
+ struct ocfs2_path *path;
+
+ BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
+
+ path = kzalloc(sizeof(*path), GFP_NOFS);
+ if (path) {
+ path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
+ get_bh(root_bh);
+ path_root_bh(path) = root_bh;
+ path_root_el(path) = root_el;
+ }
+
+ return path;
+}
+
+/*
+ * Allocate and initialize a new path based on a disk inode tree.
+ */
+static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
+{
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_extent_list *el = &di->id2.i_list;
+
+ return ocfs2_new_path(di_bh, el);
+}
+
+/*
+ * Convenience function to journal all components in a path.
+ */
+static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
+ struct ocfs2_path *path)
+{
+ int i, ret = 0;
+
+ if (!path)
+ goto out;
+
+ for(i = 0; i < path_num_items(path); i++) {
+ ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+out:
+ return ret;
+}
+
+enum ocfs2_contig_type {
+ CONTIG_NONE = 0,
+ CONTIG_LEFT,
+ CONTIG_RIGHT
+};
-static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
-static int ocfs2_extent_contig(struct inode *inode,
- struct ocfs2_extent_rec *ext,
- u64 blkno)
+/*
+ * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
+ * ocfs2_extent_contig only work properly against leaf nodes!
+ */
+static int ocfs2_block_extent_contig(struct super_block *sb,
+ struct ocfs2_extent_rec *ext,
+ u64 blkno)
+{
+ u64 blk_end = le64_to_cpu(ext->e_blkno);
+
+ blk_end += ocfs2_clusters_to_blocks(sb,
+ le16_to_cpu(ext->e_leaf_clusters));
+
+ return blkno == blk_end;
+}
+
+static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
+ struct ocfs2_extent_rec *right)
+{
+ u32 left_range;
+
+ left_range = le32_to_cpu(left->e_cpos) +
+ le16_to_cpu(left->e_leaf_clusters);
+
+ return (left_range == le32_to_cpu(right->e_cpos));
+}
+
+static enum ocfs2_contig_type
+ ocfs2_extent_contig(struct inode *inode,
+ struct ocfs2_extent_rec *ext,
+ struct ocfs2_extent_rec *insert_rec)
{
- return blkno == (le64_to_cpu(ext->e_blkno) +
- ocfs2_clusters_to_blocks(inode->i_sb,
- le32_to_cpu(ext->e_clusters)));
+ u64 blkno = le64_to_cpu(insert_rec->e_blkno);
+
+ if (ocfs2_extents_adjacent(ext, insert_rec) &&
+ ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
+ return CONTIG_RIGHT;
+
+ blkno = le64_to_cpu(ext->e_blkno);
+ if (ocfs2_extents_adjacent(insert_rec, ext) &&
+ ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
+ return CONTIG_LEFT;
+
+ return CONTIG_NONE;
}
/*
+ * NOTE: We can have pretty much any combination of contiguousness and
+ * appending.
+ *
+ * The usefulness of APPEND_TAIL is more in that it lets us know that
+ * we'll have to update the path to that leaf.
+ */
+enum ocfs2_append_type {
+ APPEND_NONE = 0,
+ APPEND_TAIL,
+};
+
+struct ocfs2_insert_type {
+ enum ocfs2_append_type ins_appending;
+ enum ocfs2_contig_type ins_contig;
+ int ins_contig_index;
+ int ins_free_records;
+ int ins_tree_depth;
+};
+
+/*
* How many free extents have we got before we need more meta data?
*/
int ocfs2_num_free_extents(struct ocfs2_super *osb,
@@ -242,6 +424,28 @@ bail:
}
/*
+ * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
+ *
+ * Returns the sum of the rightmost extent rec logical offset and
+ * cluster count.
+ *
+ * ocfs2_add_branch() uses this to determine what logical cluster
+ * value should be populated into the leftmost new branch records.
+ *
+ * ocfs2_shift_tree_depth() uses this to determine the # clusters
+ * value for the new topmost tree record.
+ */
+static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
+{
+ int i;
+
+ i = le16_to_cpu(el->l_next_free_rec) - 1;
+
+ return le32_to_cpu(el->l_recs[i].e_cpos) +
+ ocfs2_rec_clusters(el, &el->l_recs[i]);
+}
+
+/*
* Add an entire tree branch to our inode. eb_bh is the extent block
* to start at, if we don't want to start the branch at the dinode
* structure.
@@ -250,7 +454,7 @@ bail:
* for the new last extent block.
*
* the new branch will be 'empty' in the sense that every block will
- * contain a single record with e_clusters == 0.
+ * contain a single record with cluster count == 0.
*/
static int ocfs2_add_branch(struct ocfs2_super *osb,
handle_t *handle,
@@ -268,6 +472,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *eb_el;
struct ocfs2_extent_list *el;
+ u32 new_cpos;
mlog_entry_void();
@@ -302,6 +507,9 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
goto bail;
}
+ eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
+ new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
+
/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
* linked with the rest of the tree.
* conversly, new_eb_bhs[0] is the new bottommost leaf.
@@ -330,9 +538,18 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
eb->h_next_leaf_blk = 0;
eb_el->l_tree_depth = cpu_to_le16(i);
eb_el->l_next_free_rec = cpu_to_le16(1);
- eb_el->l_recs[0].e_cpos = fe->i_clusters;
+ /*
+ * This actually counts as an empty extent as
+ * c_clusters == 0
+ */
+ eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
- eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+ /*
+ * eb_el isn't always an interior node, but even leaf
+ * nodes want a zero'd flags and reserved field so
+ * this gets the whole 32 bits regardless of use.
+ */
+ eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
if (!eb_el->l_tree_depth)
new_last_eb_blk = le64_to_cpu(eb->h_blkno);
@@ -376,8 +593,8 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
* either be on the fe, or the extent block passed in. */
i = le16_to_cpu(el->l_next_free_rec);
el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
- el->l_recs[i].e_cpos = fe->i_clusters;
- el->l_recs[i].e_clusters = 0;
+ el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
+ el->l_recs[i].e_int_clusters = 0;
le16_add_cpu(&el->l_next_free_rec, 1);
/* fe needs a new last extent block pointer, as does the
@@ -425,6 +642,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
struct buffer_head **ret_new_eb_bh)
{
int status, i;
+ u32 new_clusters;
struct buffer_head *new_eb_bh = NULL;
struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
@@ -461,11 +679,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
/* copy the fe data into the new extent block */
eb_el->l_tree_depth = fe_el->l_tree_depth;
eb_el->l_next_free_rec = fe_el->l_next_free_rec;
- for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
- eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
- eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
- eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
- }
+ for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
+ eb_el->l_recs[i] = fe_el->l_recs[i];
status = ocfs2_journal_dirty(handle, new_eb_bh);
if (status < 0) {
@@ -480,16 +695,15 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
goto bail;
}
+ new_clusters = ocfs2_sum_rightmost_rec(eb_el);
+
/* update fe now */
le16_add_cpu(&fe_el->l_tree_depth, 1);
fe_el->l_recs[0].e_cpos = 0;
fe_el->l_recs[0].e_blkno = eb->h_blkno;
- fe_el->l_recs[0].e_clusters = fe->i_clusters;
- for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
- fe_el->l_recs[i].e_cpos = 0;
- fe_el->l_recs[i].e_clusters = 0;
- fe_el->l_recs[i].e_blkno = 0;
- }
+ fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
+ for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
+ memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
fe_el->l_next_free_rec = cpu_to_le16(1);
/* If this is our 1st tree depth shift, then last_eb_blk
@@ -515,199 +729,6 @@ bail:
}
/*
- * Expects the tree to already have room in the rightmost leaf for the
- * extent. Updates all the extent blocks (and the dinode) on the way
- * down.
- */
-static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
- handle_t *handle,
- struct inode *inode,
- struct buffer_head *fe_bh,
- u64 start_blk,
- u32 new_clusters)
-{
- int status, i, num_bhs = 0;
- u64 next_blkno;
- u16 next_free;
- struct buffer_head **eb_bhs = NULL;
- struct ocfs2_dinode *fe;
- struct ocfs2_extent_block *eb;
- struct ocfs2_extent_list *el;
-
- mlog_entry_void();
-
- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
- el = &fe->id2.i_list;
- if (el->l_tree_depth) {
- /* This is another operation where we want to be
- * careful about our tree updates. An error here means
- * none of the previous changes we made should roll
- * forward. As a result, we have to record the buffers
- * for this part of the tree in an array and reserve a
- * journal write to them before making any changes. */
- num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
- eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
- GFP_KERNEL);
- if (!eb_bhs) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
-
- i = 0;
- while(el->l_tree_depth) {
- next_free = le16_to_cpu(el->l_next_free_rec);
- if (next_free == 0) {
- ocfs2_error(inode->i_sb,
- "Dinode %llu has a bad extent list",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = -EIO;
- goto bail;
- }
- next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
-
- BUG_ON(i >= num_bhs);
- status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
- OCFS2_BH_CACHED, inode);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
- eb);
- status = -EIO;
- goto bail;
- }
-
- status = ocfs2_journal_access(handle, inode, eb_bhs[i],
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- el = &eb->h_list;
- i++;
- /* When we leave this loop, eb_bhs[num_bhs - 1] will
- * hold the bottom-most leaf extent block. */
- }
- BUG_ON(el->l_tree_depth);
-
- el = &fe->id2.i_list;
- /* If we have tree depth, then the fe update is
- * trivial, and we want to switch el out for the
- * bottom-most leaf in order to update it with the
- * actual extent data below. */
- next_free = le16_to_cpu(el->l_next_free_rec);
- if (next_free == 0) {
- ocfs2_error(inode->i_sb,
- "Dinode %llu has a bad extent list",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = -EIO;
- goto bail;
- }
- le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
- new_clusters);
- /* (num_bhs - 1) to avoid the leaf */
- for(i = 0; i < (num_bhs - 1); i++) {
- eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
- el = &eb->h_list;
-
- /* finally, make our actual change to the
- * intermediate extent blocks. */
- next_free = le16_to_cpu(el->l_next_free_rec);
- le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
- new_clusters);
-
- status = ocfs2_journal_dirty(handle, eb_bhs[i]);
- if (status < 0)
- mlog_errno(status);
- }
- BUG_ON(i != (num_bhs - 1));
- /* note that the leaf block wasn't touched in
- * the loop above */
- eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
- el = &eb->h_list;
- BUG_ON(el->l_tree_depth);
- }
-
- /* yay, we can finally add the actual extent now! */
- i = le16_to_cpu(el->l_next_free_rec) - 1;
- if (le16_to_cpu(el->l_next_free_rec) &&
- ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
- le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
- } else if (le16_to_cpu(el->l_next_free_rec) &&
- (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
- /* having an empty extent at eof is legal. */
- if (el->l_recs[i].e_cpos != fe->i_clusters) {
- ocfs2_error(inode->i_sb,
- "Dinode %llu trailing extent is bad: "
- "cpos (%u) != number of clusters (%u)",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- le32_to_cpu(el->l_recs[i].e_cpos),
- le32_to_cpu(fe->i_clusters));
- status = -EIO;
- goto bail;
- }
- el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
- el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
- } else {
- /* No contiguous record, or no empty record at eof, so
- * we add a new one. */
-
- BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
- le16_to_cpu(el->l_count));
- i = le16_to_cpu(el->l_next_free_rec);
-
- el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
- el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
- el->l_recs[i].e_cpos = fe->i_clusters;
- le16_add_cpu(&el->l_next_free_rec, 1);
- }
-
- /*
- * extent_map errors are not fatal, so they are ignored outside
- * of flushing the thing.
- */
- status = ocfs2_extent_map_append(inode, &el->l_recs[i],
- new_clusters);
- if (status) {
- mlog_errno(status);
- ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
- }
-
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0)
- mlog_errno(status);
- if (fe->id2.i_list.l_tree_depth) {
- status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
- if (status < 0)
- mlog_errno(status);
- }
-
- status = 0;
-bail:
- if (eb_bhs) {
- for (i = 0; i < num_bhs; i++)
- if (eb_bhs[i])
- brelse(eb_bhs[i]);
- kfree(eb_bhs);
- }
-
- mlog_exit(status);
- return status;
-}
-
-/*
* Should only be called when there is no space left in any of the
* leaf nodes. What we want to do is find the lowest tree depth
* non-leaf extent block with room for new records. There are three
@@ -807,53 +828,1548 @@ bail:
return status;
}
-/* the caller needs to update fe->i_clusters */
-int ocfs2_insert_extent(struct ocfs2_super *osb,
- handle_t *handle,
- struct inode *inode,
- struct buffer_head *fe_bh,
- u64 start_blk,
- u32 new_clusters,
- struct ocfs2_alloc_context *meta_ac)
+/*
+ * This is only valid for leaf nodes, which are the only ones that can
+ * have empty extents anyway.
+ */
+static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
{
- int status, i, shift;
- struct buffer_head *last_eb_bh = NULL;
+ return !rec->e_leaf_clusters;
+}
+
+/*
+ * This function will discard the rightmost extent record.
+ */
+static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
+{
+ int next_free = le16_to_cpu(el->l_next_free_rec);
+ int count = le16_to_cpu(el->l_count);
+ unsigned int num_bytes;
+
+ BUG_ON(!next_free);
+ /* This will cause us to go off the end of our extent list. */
+ BUG_ON(next_free >= count);
+
+ num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
+
+ memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
+}
+
+static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
+ struct ocfs2_extent_rec *insert_rec)
+{
+ int i, insert_index, next_free, has_empty, num_bytes;
+ u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
+ struct ocfs2_extent_rec *rec;
+
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
+
+ BUG_ON(!next_free);
+
+ /* The tree code before us didn't allow enough room in the leaf. */
+ if (el->l_next_free_rec == el->l_count && !has_empty)
+ BUG();
+
+ /*
+ * The easiest way to approach this is to just remove the
+ * empty extent and temporarily decrement next_free.
+ */
+ if (has_empty) {
+ /*
+ * If next_free was 1 (only an empty extent), this
+ * loop won't execute, which is fine. We still want
+ * the decrement above to happen.
+ */
+ for(i = 0; i < (next_free - 1); i++)
+ el->l_recs[i] = el->l_recs[i+1];
+
+ next_free--;
+ }
+
+ /*
+ * Figure out what the new record index should be.
+ */
+ for(i = 0; i < next_free; i++) {
+ rec = &el->l_recs[i];
+
+ if (insert_cpos < le32_to_cpu(rec->e_cpos))
+ break;
+ }
+ insert_index = i;
+
+ mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
+ insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
+
+ BUG_ON(insert_index < 0);
+ BUG_ON(insert_index >= le16_to_cpu(el->l_count));
+ BUG_ON(insert_index > next_free);
+
+ /*
+ * No need to memmove if we're just adding to the tail.
+ */
+ if (insert_index != next_free) {
+ BUG_ON(next_free >= le16_to_cpu(el->l_count));
+
+ num_bytes = next_free - insert_index;
+ num_bytes *= sizeof(struct ocfs2_extent_rec);
+ memmove(&el->l_recs[insert_index + 1],
+ &el->l_recs[insert_index],
+ num_bytes);
+ }
+
+ /*
+ * Either we had an empty extent, and need to re-increment or
+ * there was no empty extent on a non full rightmost leaf node,
+ * in which case we still need to increment.
+ */
+ next_free++;
+ el->l_next_free_rec = cpu_to_le16(next_free);
+ /*
+ * Make sure none of the math above just messed up our tree.
+ */
+ BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
+
+ el->l_recs[insert_index] = *insert_rec;
+
+}
+
+/*
+ * Create an empty extent record .
+ *
+ * l_next_free_rec may be updated.
+ *
+ * If an empty extent already exists do nothing.
+ */
+static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
+{
+ int next_free = le16_to_cpu(el->l_next_free_rec);
+
+ BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+ if (next_free == 0)
+ goto set_and_inc;
+
+ if (ocfs2_is_empty_extent(&el->l_recs[0]))
+ return;
+
+ mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
+ "Asked to create an empty extent in a full list:\n"
+ "count = %u, tree depth = %u",
+ le16_to_cpu(el->l_count),
+ le16_to_cpu(el->l_tree_depth));
+
+ ocfs2_shift_records_right(el);
+
+set_and_inc:
+ le16_add_cpu(&el->l_next_free_rec, 1);
+ memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+}
+
+/*
+ * For a rotation which involves two leaf nodes, the "root node" is
+ * the lowest level tree node which contains a path to both leafs. This
+ * resulting set of information can be used to form a complete "subtree"
+ *
+ * This function is passed two full paths from the dinode down to a
+ * pair of adjacent leaves. It's task is to figure out which path
+ * index contains the subtree root - this can be the root index itself
+ * in a worst-case rotation.
+ *
+ * The array index of the subtree root is passed back.
+ */
+static int ocfs2_find_subtree_root(struct inode *inode,
+ struct ocfs2_path *left,
+ struct ocfs2_path *right)
+{
+ int i = 0;
+
+ /*
+ * Check that the caller passed in two paths from the same tree.
+ */
+ BUG_ON(path_root_bh(left) != path_root_bh(right));
+
+ do {
+ i++;
+
+ /*
+ * The caller didn't pass two adjacent paths.
+ */
+ mlog_bug_on_msg(i > left->p_tree_depth,
+ "Inode %lu, left depth %u, right depth %u\n"
+ "left leaf blk %llu, right leaf blk %llu\n",
+ inode->i_ino, left->p_tree_depth,
+ right->p_tree_depth,
+ (unsigned long long)path_leaf_bh(left)->b_blocknr,
+ (unsigned long long)path_leaf_bh(right)->b_blocknr);
+ } while (left->p_node[i].bh->b_blocknr ==
+ right->p_node[i].bh->b_blocknr);
+
+ return i - 1;
+}
+
+typedef void (path_insert_t)(void *, struct buffer_head *);
+
+/*
+ * Traverse a btree path in search of cpos, starting at root_el.
+ *
+ * This code can be called with a cpos larger than the tree, in which
+ * case it will return the rightmost path.
+ */
+static int __ocfs2_find_path(struct inode *inode,
+ struct ocfs2_extent_list *root_el, u32 cpos,
+ path_insert_t *func, void *data)
+{
+ int i, ret = 0;
+ u32 range;
+ u64 blkno;
struct buffer_head *bh = NULL;
- struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
- struct ocfs2_extent_list *el;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
- mlog_entry_void();
+ el = root_el;
+ while (el->l_tree_depth) {
+ if (le16_to_cpu(el->l_next_free_rec) == 0) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has empty extent list at "
+ "depth %u\n",
+ (unsigned long long)oi->ip_blkno,
+ le16_to_cpu(el->l_tree_depth));
+ ret = -EROFS;
+ goto out;
- mlog(0, "add %u clusters starting at block %llu to inode %llu\n",
- new_clusters, (unsigned long long)start_blk,
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ }
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
- el = &fe->id2.i_list;
+ for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
+ rec = &el->l_recs[i];
+
+ /*
+ * In the case that cpos is off the allocation
+ * tree, this should just wind up returning the
+ * rightmost record.
+ */
+ range = le32_to_cpu(rec->e_cpos) +
+ ocfs2_rec_clusters(el, rec);
+ if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
+ break;
+ }
- if (el->l_tree_depth) {
- /* jump to end of tree */
- status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
- &last_eb_bh, OCFS2_BH_CACHED, inode);
- if (status < 0) {
- mlog_exit(status);
- goto bail;
+ blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+ if (blkno == 0) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has bad blkno in extent list "
+ "at depth %u (index %d)\n",
+ (unsigned long long)oi->ip_blkno,
+ le16_to_cpu(el->l_tree_depth), i);
+ ret = -EROFS;
+ goto out;
}
- eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+
+ brelse(bh);
+ bh = NULL;
+ ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
+ &bh, OCFS2_BH_CACHED, inode);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) bh->b_data;
el = &eb->h_list;
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+ ret = -EIO;
+ goto out;
+ }
+
+ if (le16_to_cpu(el->l_next_free_rec) >
+ le16_to_cpu(el->l_count)) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has bad count in extent list "
+ "at block %llu (next free=%u, count=%u)\n",
+ (unsigned long long)oi->ip_blkno,
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(el->l_next_free_rec),
+ le16_to_cpu(el->l_count));
+ ret = -EROFS;
+ goto out;
+ }
+
+ if (func)
+ func(data, bh);
+ }
+
+out:
+ /*
+ * Catch any trailing bh that the loop didn't handle.
+ */
+ brelse(bh);
+
+ return ret;
+}
+
+/*
+ * Given an initialized path (that is, it has a valid root extent
+ * list), this function will traverse the btree in search of the path
+ * which would contain cpos.
+ *
+ * The path traveled is recorded in the path structure.
+ *
+ * Note that this will not do any comparisons on leaf node extent
+ * records, so it will work fine in the case that we just added a tree
+ * branch.
+ */
+struct find_path_data {
+ int index;
+ struct ocfs2_path *path;
+};
+static void find_path_ins(void *data, struct buffer_head *bh)
+{
+ struct find_path_data *fp = data;
+
+ get_bh(bh);
+ ocfs2_path_insert_eb(fp->path, fp->index, bh);
+ fp->index++;
+}
+static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
+ u32 cpos)
+{
+ struct find_path_data data;
+
+ data.index = 1;
+ data.path = path;
+ return __ocfs2_find_path(inode, path_root_el(path), cpos,
+ find_path_ins, &data);
+}
+
+static void find_leaf_ins(void *data, struct buffer_head *bh)
+{
+ struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
+ struct ocfs2_extent_list *el = &eb->h_list;
+ struct buffer_head **ret = data;
+
+ /* We want to retain only the leaf block. */
+ if (le16_to_cpu(el->l_tree_depth) == 0) {
+ get_bh(bh);
+ *ret = bh;
+ }
+}
+/*
+ * Find the leaf block in the tree which would contain cpos. No
+ * checking of the actual leaf is done.
+ *
+ * Some paths want to call this instead of allocating a path structure
+ * and calling ocfs2_find_path().
+ *
+ * This function doesn't handle non btree extent lists.
+ */
+int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
+ u32 cpos, struct buffer_head **leaf_bh)
+{
+ int ret;
+ struct buffer_head *bh = NULL;
+
+ ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ *leaf_bh = bh;
+out:
+ return ret;
+}
+
+/*
+ * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
+ *
+ * Basically, we've moved stuff around at the bottom of the tree and
+ * we need to fix up the extent records above the changes to reflect
+ * the new changes.
+ *
+ * left_rec: the record on the left.
+ * left_child_el: is the child list pointed to by left_rec
+ * right_rec: the record to the right of left_rec
+ * right_child_el: is the child list pointed to by right_rec
+ *
+ * By definition, this only works on interior nodes.
+ */
+static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
+ struct ocfs2_extent_list *left_child_el,
+ struct ocfs2_extent_rec *right_rec,
+ struct ocfs2_extent_list *right_child_el)
+{
+ u32 left_clusters, right_end;
+
+ /*
+ * Interior nodes never have holes. Their cpos is the cpos of
+ * the leftmost record in their child list. Their cluster
+ * count covers the full theoretical range of their child list
+ * - the range between their cpos and the cpos of the record
+ * immediately to their right.
+ */
+ left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
+ left_clusters -= le32_to_cpu(left_rec->e_cpos);
+ left_rec->e_int_clusters = cpu_to_le32(left_clusters);
+
+ /*
+ * Calculate the rightmost cluster count boundary before
+ * moving cpos - we will need to adjust clusters after
+ * updating e_cpos to keep the same highest cluster count.
+ */
+ right_end = le32_to_cpu(right_rec->e_cpos);
+ right_end += le32_to_cpu(right_rec->e_int_clusters);
+
+ right_rec->e_cpos = left_rec->e_cpos;
+ le32_add_cpu(&right_rec->e_cpos, left_clusters);
+
+ right_end -= le32_to_cpu(right_rec->e_cpos);
+ right_rec->e_int_clusters = cpu_to_le32(right_end);
+}
+
+/*
+ * Adjust the adjacent root node records involved in a
+ * rotation. left_el_blkno is passed in as a key so that we can easily
+ * find it's index in the root list.
+ */
+static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
+ struct ocfs2_extent_list *left_el,
+ struct ocfs2_extent_list *right_el,
+ u64 left_el_blkno)
+{
+ int i;
+
+ BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
+ le16_to_cpu(left_el->l_tree_depth));
+
+ for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
+ if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
+ break;
+ }
+
+ /*
+ * The path walking code should have never returned a root and
+ * two paths which are not adjacent.
+ */
+ BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
+
+ ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
+ &root_el->l_recs[i + 1], right_el);
+}
+
+/*
+ * We've changed a leaf block (in right_path) and need to reflect that
+ * change back up the subtree.
+ *
+ * This happens in multiple places:
+ * - When we've moved an extent record from the left path leaf to the right
+ * path leaf to make room for an empty extent in the left path leaf.
+ * - When our insert into the right path leaf is at the leftmost edge
+ * and requires an update of the path immediately to it's left. This
+ * can occur at the end of some types of rotation and appending inserts.
+ */
+static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path *right_path,
+ int subtree_index)
+{
+ int ret, i, idx;
+ struct ocfs2_extent_list *el, *left_el, *right_el;
+ struct ocfs2_extent_rec *left_rec, *right_rec;
+ struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
+
+ /*
+ * Update the counts and position values within all the
+ * interior nodes to reflect the leaf rotation we just did.
+ *
+ * The root node is handled below the loop.
+ *
+ * We begin the loop with right_el and left_el pointing to the
+ * leaf lists and work our way up.
+ *
+ * NOTE: within this loop, left_el and right_el always refer
+ * to the *child* lists.
+ */
+ left_el = path_leaf_el(left_path);
+ right_el = path_leaf_el(right_path);
+ for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
+ mlog(0, "Adjust records at index %u\n", i);
+
+ /*
+ * One nice property of knowing that all of these
+ * nodes are below the root is that we only deal with
+ * the leftmost right node record and the rightmost
+ * left node record.
+ */
+ el = left_path->p_node[i].el;
+ idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
+ left_rec = &el->l_recs[idx];
+
+ el = right_path->p_node[i].el;
+ right_rec = &el->l_recs[0];
+
+ ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
+ right_el);
+
+ ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
+ if (ret)
+ mlog_errno(ret);
+
+ ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
+ if (ret)
+ mlog_errno(ret);
+
+ /*
+ * Setup our list pointers now so that the current
+ * parents become children in the next iteration.
+ */
+ left_el = left_path->p_node[i].el;
+ right_el = right_path->p_node[i].el;
+ }
+
+ /*
+ * At the root node, adjust the two adjacent records which
+ * begin our path to the leaves.
+ */
+
+ el = left_path->p_node[subtree_index].el;
+ left_el = left_path->p_node[subtree_index + 1].el;
+ right_el = right_path->p_node[subtree_index + 1].el;
+
+ ocfs2_adjust_root_records(el, left_el, right_el,
+ left_path->p_node[subtree_index + 1].bh->b_blocknr);
+
+ root_bh = left_path->p_node[subtree_index].bh;
+
+ ret = ocfs2_journal_dirty(handle, root_bh);
+ if (ret)
+ mlog_errno(ret);
+}
+
+static int ocfs2_rotate_subtree_right(struct inode *inode,
+ handle_t *handle,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path *right_path,
+ int subtree_index)
+{
+ int ret, i;
+ struct buffer_head *right_leaf_bh;
+ struct buffer_head *left_leaf_bh = NULL;
+ struct buffer_head *root_bh;
+ struct ocfs2_extent_list *right_el, *left_el;
+ struct ocfs2_extent_rec move_rec;
+
+ left_leaf_bh = path_leaf_bh(left_path);
+ left_el = path_leaf_el(left_path);
+
+ if (left_el->l_next_free_rec != left_el->l_count) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has non-full interior leaf node %llu"
+ "(next free = %u)",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)left_leaf_bh->b_blocknr,
+ le16_to_cpu(left_el->l_next_free_rec));
+ return -EROFS;
+ }
+
+ /*
+ * This extent block may already have an empty record, so we
+ * return early if so.
+ */
+ if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
+ return 0;
+
+ root_bh = left_path->p_node[subtree_index].bh;
+ BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+ ret = ocfs2_journal_access(handle, inode, root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
+ ret = ocfs2_journal_access(handle, inode,
+ right_path->p_node[i].bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode,
+ left_path->p_node[i].bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ right_leaf_bh = path_leaf_bh(right_path);
+ right_el = path_leaf_el(right_path);
+
+ /* This is a code error, not a disk corruption. */
+ mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
+ "because rightmost leaf block %llu is empty\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)right_leaf_bh->b_blocknr);
+
+ ocfs2_create_empty_extent(right_el);
+
+ ret = ocfs2_journal_dirty(handle, right_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Do the copy now. */
+ i = le16_to_cpu(left_el->l_next_free_rec) - 1;
+ move_rec = left_el->l_recs[i];
+ right_el->l_recs[0] = move_rec;
+
+ /*
+ * Clear out the record we just copied and shift everything
+ * over, leaving an empty extent in the left leaf.
+ *
+ * We temporarily subtract from next_free_rec so that the
+ * shift will lose the tail record (which is now defunct).
+ */
+ le16_add_cpu(&left_el->l_next_free_rec, -1);
+ ocfs2_shift_records_right(left_el);
+ memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+ le16_add_cpu(&left_el->l_next_free_rec, 1);
+
+ ret = ocfs2_journal_dirty(handle, left_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
+ subtree_index);
+
+out:
+ return ret;
+}
+
+/*
+ * Given a full path, determine what cpos value would return us a path
+ * containing the leaf immediately to the left of the current one.
+ *
+ * Will return zero if the path passed in is already the leftmost path.
+ */
+static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos)
+{
+ int i, j, ret = 0;
+ u64 blkno;
+ struct ocfs2_extent_list *el;
+
+ BUG_ON(path->p_tree_depth == 0);
+
+ *cpos = 0;
+
+ blkno = path_leaf_bh(path)->b_blocknr;
+
+ /* Start at the tree node just above the leaf and work our way up. */
+ i = path->p_tree_depth - 1;
+ while (i >= 0) {
+ el = path->p_node[i].el;
+
+ /*
+ * Find the extent record just before the one in our
+ * path.
+ */
+ for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
+ if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
+ if (j == 0) {
+ if (i == 0) {
+ /*
+ * We've determined that the
+ * path specified is already
+ * the leftmost one - return a
+ * cpos of zero.
+ */
+ goto out;
+ }
+ /*
+ * The leftmost record points to our
+ * leaf - we need to travel up the
+ * tree one level.
+ */
+ goto next_node;
+ }
+
+ *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
+ *cpos = *cpos + ocfs2_rec_clusters(el,
+ &el->l_recs[j - 1]);
+ *cpos = *cpos - 1;
+ goto out;
+ }
+ }
+
+ /*
+ * If we got here, we never found a valid node where
+ * the tree indicated one should be.
+ */
+ ocfs2_error(sb,
+ "Invalid extent tree at extent block %llu\n",
+ (unsigned long long)blkno);
+ ret = -EROFS;
+ goto out;
+
+next_node:
+ blkno = path->p_node[i].bh->b_blocknr;
+ i--;
+ }
+
+out:
+ return ret;
+}
+
+static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
+ struct ocfs2_path *path)
+{
+ int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
+
+ if (handle->h_buffer_credits < credits)
+ return ocfs2_extend_trans(handle, credits);
+
+ return 0;
+}
+
+/*
+ * Trap the case where we're inserting into the theoretical range past
+ * the _actual_ left leaf range. Otherwise, we'll rotate a record
+ * whose cpos is less than ours into the right leaf.
+ *
+ * It's only necessary to look at the rightmost record of the left
+ * leaf because the logic that calls us should ensure that the
+ * theoretical ranges in the path components above the leaves are
+ * correct.
+ */
+static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
+ u32 insert_cpos)
+{
+ struct ocfs2_extent_list *left_el;
+ struct ocfs2_extent_rec *rec;
+ int next_free;
+
+ left_el = path_leaf_el(left_path);
+ next_free = le16_to_cpu(left_el->l_next_free_rec);
+ rec = &left_el->l_recs[next_free - 1];
+
+ if (insert_cpos > le32_to_cpu(rec->e_cpos))
+ return 1;
+ return 0;
+}
+
+/*
+ * Rotate all the records in a btree right one record, starting at insert_cpos.
+ *
+ * The path to the rightmost leaf should be passed in.
+ *
+ * The array is assumed to be large enough to hold an entire path (tree depth).
+ *
+ * Upon succesful return from this function:
+ *
+ * - The 'right_path' array will contain a path to the leaf block
+ * whose range contains e_cpos.
+ * - That leaf block will have a single empty extent in list index 0.
+ * - In the case that the rotation requires a post-insert update,
+ * *ret_left_path will contain a valid path which can be passed to
+ * ocfs2_insert_path().
+ */
+static int ocfs2_rotate_tree_right(struct inode *inode,
+ handle_t *handle,
+ u32 insert_cpos,
+ struct ocfs2_path *right_path,
+ struct ocfs2_path **ret_left_path)
+{
+ int ret, start;
+ u32 cpos;
+ struct ocfs2_path *left_path = NULL;
+
+ *ret_left_path = NULL;
+
+ left_path = ocfs2_new_path(path_root_bh(right_path),
+ path_root_el(right_path));
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
+
+ /*
+ * What we want to do here is:
+ *
+ * 1) Start with the rightmost path.
+ *
+ * 2) Determine a path to the leaf block directly to the left
+ * of that leaf.
+ *
+ * 3) Determine the 'subtree root' - the lowest level tree node
+ * which contains a path to both leaves.
+ *
+ * 4) Rotate the subtree.
+ *
+ * 5) Find the next subtree by considering the left path to be
+ * the new right path.
+ *
+ * The check at the top of this while loop also accepts
+ * insert_cpos == cpos because cpos is only a _theoretical_
+ * value to get us the left path - insert_cpos might very well
+ * be filling that hole.
+ *
+ * Stop at a cpos of '0' because we either started at the
+ * leftmost branch (i.e., a tree with one branch and a
+ * rotation inside of it), or we've gone as far as we can in
+ * rotating subtrees.
+ */
+ while (cpos && insert_cpos <= cpos) {
+ mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
+ insert_cpos, cpos);
+
+ ret = ocfs2_find_path(inode, left_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mlog_bug_on_msg(path_leaf_bh(left_path) ==
+ path_leaf_bh(right_path),
+ "Inode %lu: error during insert of %u "
+ "(left path cpos %u) results in two identical "
+ "paths ending at %llu\n",
+ inode->i_ino, insert_cpos, cpos,
+ (unsigned long long)
+ path_leaf_bh(left_path)->b_blocknr);
+
+ if (ocfs2_rotate_requires_path_adjustment(left_path,
+ insert_cpos)) {
+ mlog(0, "Path adjustment required\n");
+
+ /*
+ * We've rotated the tree as much as we
+ * should. The rest is up to
+ * ocfs2_insert_path() to complete, after the
+ * record insertion. We indicate this
+ * situation by returning the left path.
+ *
+ * The reason we don't adjust the records here
+ * before the record insert is that an error
+ * later might break the rule where a parent
+ * record e_cpos will reflect the actual
+ * e_cpos of the 1st nonempty record of the
+ * child list.
+ */
+ *ret_left_path = left_path;
+ goto out_ret_path;
+ }
+
+ start = ocfs2_find_subtree_root(inode, left_path, right_path);
+
+ mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
+ start,
+ (unsigned long long) right_path->p_node[start].bh->b_blocknr,
+ right_path->p_tree_depth);
+
+ ret = ocfs2_extend_rotate_transaction(handle, start,
+ right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
+ right_path, start);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * There is no need to re-read the next right path
+ * as we know that it'll be our current left
+ * path. Optimize by copying values instead.
+ */
+ ocfs2_mv_path(right_path, left_path);
+
+ ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
+ &cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+out:
+ ocfs2_free_path(left_path);
+
+out_ret_path:
+ return ret;
+}
+
+/*
+ * Do the final bits of extent record insertion at the target leaf
+ * list. If this leaf is part of an allocation tree, it is assumed
+ * that the tree above has been prepared.
+ */
+static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
+ struct ocfs2_extent_list *el,
+ struct ocfs2_insert_type *insert,
+ struct inode *inode)
+{
+ int i = insert->ins_contig_index;
+ unsigned int range;
+ struct ocfs2_extent_rec *rec;
+
+ BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+ /*
+ * Contiguous insert - either left or right.
+ */
+ if (insert->ins_contig != CONTIG_NONE) {
+ rec = &el->l_recs[i];
+ if (insert->ins_contig == CONTIG_LEFT) {
+ rec->e_blkno = insert_rec->e_blkno;
+ rec->e_cpos = insert_rec->e_cpos;
+ }
+ le16_add_cpu(&rec->e_leaf_clusters,
+ le16_to_cpu(insert_rec->e_leaf_clusters));
+ return;
+ }
+
+ /*
+ * Handle insert into an empty leaf.
+ */
+ if (le16_to_cpu(el->l_next_free_rec) == 0 ||
+ ((le16_to_cpu(el->l_next_free_rec) == 1) &&
+ ocfs2_is_empty_extent(&el->l_recs[0]))) {
+ el->l_recs[0] = *insert_rec;
+ el->l_next_free_rec = cpu_to_le16(1);
+ return;
+ }
+
+ /*
+ * Appending insert.
+ */
+ if (insert->ins_appending == APPEND_TAIL) {
+ i = le16_to_cpu(el->l_next_free_rec) - 1;
+ rec = &el->l_recs[i];
+ range = le32_to_cpu(rec->e_cpos)
+ + le16_to_cpu(rec->e_leaf_clusters);
+ BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
+
+ mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
+ le16_to_cpu(el->l_count),
+ "inode %lu, depth %u, count %u, next free %u, "
+ "rec.cpos %u, rec.clusters %u, "
+ "insert.cpos %u, insert.clusters %u\n",
+ inode->i_ino,
+ le16_to_cpu(el->l_tree_depth),
+ le16_to_cpu(el->l_count),
+ le16_to_cpu(el->l_next_free_rec),
+ le32_to_cpu(el->l_recs[i].e_cpos),
+ le16_to_cpu(el->l_recs[i].e_leaf_clusters),
+ le32_to_cpu(insert_rec->e_cpos),
+ le16_to_cpu(insert_rec->e_leaf_clusters));
+ i++;
+ el->l_recs[i] = *insert_rec;
+ le16_add_cpu(&el->l_next_free_rec, 1);
+ return;
+ }
+
+ /*
+ * Ok, we have to rotate.
+ *
+ * At this point, it is safe to assume that inserting into an
+ * empty leaf and appending to a leaf have both been handled
+ * above.
+ *
+ * This leaf needs to have space, either by the empty 1st
+ * extent record, or by virtue of an l_next_rec < l_count.
+ */
+ ocfs2_rotate_leaf(el, insert_rec);
+}
+
+static inline void ocfs2_update_dinode_clusters(struct inode *inode,
+ struct ocfs2_dinode *di,
+ u32 clusters)
+{
+ le32_add_cpu(&di->i_clusters, clusters);
+ spin_lock(&OCFS2_I(inode)->ip_lock);
+ OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
+ spin_unlock(&OCFS2_I(inode)->ip_lock);
+}
+
+static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
+ struct ocfs2_extent_rec *insert_rec,
+ struct ocfs2_path *right_path,
+ struct ocfs2_path **ret_left_path)
+{
+ int ret, i, next_free;
+ struct buffer_head *bh;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_path *left_path = NULL;
+
+ *ret_left_path = NULL;
+
+ /*
+ * This shouldn't happen for non-trees. The extent rec cluster
+ * count manipulation below only works for interior nodes.
+ */
+ BUG_ON(right_path->p_tree_depth == 0);
+
+ /*
+ * If our appending insert is at the leftmost edge of a leaf,
+ * then we might need to update the rightmost records of the
+ * neighboring path.
+ */
+ el = path_leaf_el(right_path);
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ if (next_free == 0 ||
+ (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
+ u32 left_cpos;
+
+ ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
+ &left_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mlog(0, "Append may need a left path update. cpos: %u, "
+ "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
+ left_cpos);
+
+ /*
+ * No need to worry if the append is already in the
+ * leftmost leaf.
+ */
+ if (left_cpos) {
+ left_path = ocfs2_new_path(path_root_bh(right_path),
+ path_root_el(right_path));
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(inode, left_path, left_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * ocfs2_insert_path() will pass the left_path to the
+ * journal for us.
+ */
+ }
+ }
+
+ ret = ocfs2_journal_access_path(inode, handle, right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el = path_root_el(right_path);
+ bh = path_root_bh(right_path);
+ i = 0;
+ while (1) {
+ struct ocfs2_extent_rec *rec;
+
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ if (next_free == 0) {
+ ocfs2_error(inode->i_sb,
+ "Dinode %llu has a bad extent list",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ ret = -EIO;
+ goto out;
+ }
+
+ rec = &el->l_recs[next_free - 1];
+
+ rec->e_int_clusters = insert_rec->e_cpos;
+ le32_add_cpu(&rec->e_int_clusters,
+ le16_to_cpu(insert_rec->e_leaf_clusters));
+ le32_add_cpu(&rec->e_int_clusters,
+ -le32_to_cpu(rec->e_cpos));
+
+ ret = ocfs2_journal_dirty(handle, bh);
+ if (ret)
+ mlog_errno(ret);
+
+ /* Don't touch the leaf node */
+ if (++i >= right_path->p_tree_depth)
+ break;
+
+ bh = right_path->p_node[i].bh;
+ el = right_path->p_node[i].el;
+ }
+
+ *ret_left_path = left_path;
+ ret = 0;
+out:
+ if (ret != 0)
+ ocfs2_free_path(left_path);
+
+ return ret;
+}
+
+/*
+ * This function only does inserts on an allocation b-tree. For dinode
+ * lists, ocfs2_insert_at_leaf() is called directly.
+ *
+ * right_path is the path we want to do the actual insert
+ * in. left_path should only be passed in if we need to update that
+ * portion of the tree after an edge insert.
+ */
+static int ocfs2_insert_path(struct inode *inode,
+ handle_t *handle,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path *right_path,
+ struct ocfs2_extent_rec *insert_rec,
+ struct ocfs2_insert_type *insert)
+{
+ int ret, subtree_index;
+ struct buffer_head *leaf_bh = path_leaf_bh(right_path);
+ struct ocfs2_extent_list *el;
+
+ /*
+ * Pass both paths to the journal. The majority of inserts
+ * will be touching all components anyway.
+ */
+ ret = ocfs2_journal_access_path(inode, handle, right_path);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (left_path) {
+ int credits = handle->h_buffer_credits;
+
+ /*
+ * There's a chance that left_path got passed back to
+ * us without being accounted for in the
+ * journal. Extend our transaction here to be sure we
+ * can change those blocks.
+ */
+ credits += left_path->p_tree_depth;
+
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_path(inode, handle, left_path);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ el = path_leaf_el(right_path);
+
+ ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
+ ret = ocfs2_journal_dirty(handle, leaf_bh);
+ if (ret)
+ mlog_errno(ret);
+
+ if (left_path) {
+ /*
+ * The rotate code has indicated that we need to fix
+ * up portions of the tree after the insert.
+ *
+ * XXX: Should we extend the transaction here?
+ */
+ subtree_index = ocfs2_find_subtree_root(inode, left_path,
+ right_path);
+ ocfs2_complete_edge_insert(inode, handle, left_path,
+ right_path, subtree_index);
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static int ocfs2_do_insert_extent(struct inode *inode,
+ handle_t *handle,
+ struct buffer_head *di_bh,
+ struct ocfs2_extent_rec *insert_rec,
+ struct ocfs2_insert_type *type)
+{
+ int ret, rotate = 0;
+ u32 cpos;
+ struct ocfs2_path *right_path = NULL;
+ struct ocfs2_path *left_path = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_extent_list *el;
+
+ di = (struct ocfs2_dinode *) di_bh->b_data;
+ el = &di->id2.i_list;
+
+ ret = ocfs2_journal_access(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (le16_to_cpu(el->l_tree_depth) == 0) {
+ ocfs2_insert_at_leaf(insert_rec, el, type, inode);
+ goto out_update_clusters;
+ }
+
+ right_path = ocfs2_new_inode_path(di_bh);
+ if (!right_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Determine the path to start with. Rotations need the
+ * rightmost path, everything else can go directly to the
+ * target leaf.
+ */
+ cpos = le32_to_cpu(insert_rec->e_cpos);
+ if (type->ins_appending == APPEND_NONE &&
+ type->ins_contig == CONTIG_NONE) {
+ rotate = 1;
+ cpos = UINT_MAX;
+ }
+
+ ret = ocfs2_find_path(inode, right_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Rotations and appends need special treatment - they modify
+ * parts of the tree's above them.
+ *
+ * Both might pass back a path immediate to the left of the
+ * one being inserted to. This will be cause
+ * ocfs2_insert_path() to modify the rightmost records of
+ * left_path to account for an edge insert.
+ *
+ * XXX: When modifying this code, keep in mind that an insert
+ * can wind up skipping both of these two special cases...
+ */
+ if (rotate) {
+ ret = ocfs2_rotate_tree_right(inode, handle,
+ le32_to_cpu(insert_rec->e_cpos),
+ right_path, &left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else if (type->ins_appending == APPEND_TAIL
+ && type->ins_contig != CONTIG_LEFT) {
+ ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
+ right_path, &left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_insert_path(inode, handle, left_path, right_path,
+ insert_rec, type);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out_update_clusters:
+ ocfs2_update_dinode_clusters(inode, di,
+ le16_to_cpu(insert_rec->e_leaf_clusters));
+
+ ret = ocfs2_journal_dirty(handle, di_bh);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ ocfs2_free_path(left_path);
+ ocfs2_free_path(right_path);
+
+ return ret;
+}
+
+static void ocfs2_figure_contig_type(struct inode *inode,
+ struct ocfs2_insert_type *insert,
+ struct ocfs2_extent_list *el,
+ struct ocfs2_extent_rec *insert_rec)
+{
+ int i;
+ enum ocfs2_contig_type contig_type = CONTIG_NONE;
+
+ BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+ for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
+ insert_rec);
+ if (contig_type != CONTIG_NONE) {
+ insert->ins_contig_index = i;
+ break;
+ }
+ }
+ insert->ins_contig = contig_type;
+}
+
+/*
+ * This should only be called against the righmost leaf extent list.
+ *
+ * ocfs2_figure_appending_type() will figure out whether we'll have to
+ * insert at the tail of the rightmost leaf.
+ *
+ * This should also work against the dinode list for tree's with 0
+ * depth. If we consider the dinode list to be the rightmost leaf node
+ * then the logic here makes sense.
+ */
+static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
+ struct ocfs2_extent_list *el,
+ struct ocfs2_extent_rec *insert_rec)
+{
+ int i;
+ u32 cpos = le32_to_cpu(insert_rec->e_cpos);
+ struct ocfs2_extent_rec *rec;
+
+ insert->ins_appending = APPEND_NONE;
+
+ BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+ if (!el->l_next_free_rec)
+ goto set_tail_append;
+
+ if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+ /* Were all records empty? */
+ if (le16_to_cpu(el->l_next_free_rec) == 1)
+ goto set_tail_append;
}
- /* Can we allocate without adding/shifting tree bits? */
i = le16_to_cpu(el->l_next_free_rec) - 1;
- if (le16_to_cpu(el->l_next_free_rec) == 0
- || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
- || le32_to_cpu(el->l_recs[i].e_clusters) == 0
- || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
- goto out_add;
+ rec = &el->l_recs[i];
+
+ if (cpos >=
+ (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
+ goto set_tail_append;
+
+ return;
+
+set_tail_append:
+ insert->ins_appending = APPEND_TAIL;
+}
+
+/*
+ * Helper function called at the begining of an insert.
+ *
+ * This computes a few things that are commonly used in the process of
+ * inserting into the btree:
+ * - Whether the new extent is contiguous with an existing one.
+ * - The current tree depth.
+ * - Whether the insert is an appending one.
+ * - The total # of free records in the tree.
+ *
+ * All of the information is stored on the ocfs2_insert_type
+ * structure.
+ */
+static int ocfs2_figure_insert_type(struct inode *inode,
+ struct buffer_head *di_bh,
+ struct buffer_head **last_eb_bh,
+ struct ocfs2_extent_rec *insert_rec,
+ struct ocfs2_insert_type *insert)
+{
+ int ret;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_path *path = NULL;
+ struct buffer_head *bh = NULL;
+
+ el = &di->id2.i_list;
+ insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
+
+ if (el->l_tree_depth) {
+ /*
+ * If we have tree depth, we read in the
+ * rightmost extent block ahead of time as
+ * ocfs2_figure_insert_type() and ocfs2_add_branch()
+ * may want it later.
+ */
+ ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+ le64_to_cpu(di->i_last_eb_blk), &bh,
+ OCFS2_BH_CACHED, inode);
+ if (ret) {
+ mlog_exit(ret);
+ goto out;
+ }
+ eb = (struct ocfs2_extent_block *) bh->b_data;
+ el = &eb->h_list;
+ }
+
+ /*
+ * Unless we have a contiguous insert, we'll need to know if
+ * there is room left in our allocation tree for another
+ * extent record.
+ *
+ * XXX: This test is simplistic, we can search for empty
+ * extent records too.
+ */
+ insert->ins_free_records = le16_to_cpu(el->l_count) -
+ le16_to_cpu(el->l_next_free_rec);
+
+ if (!insert->ins_tree_depth) {
+ ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+ ocfs2_figure_appending_type(insert, el, insert_rec);
+ return 0;
+ }
+
+ path = ocfs2_new_inode_path(di_bh);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * In the case that we're inserting past what the tree
+ * currently accounts for, ocfs2_find_path() will return for
+ * us the rightmost tree path. This is accounted for below in
+ * the appending code.
+ */
+ ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+
+ /*
+ * Now that we have the path, there's two things we want to determine:
+ * 1) Contiguousness (also set contig_index if this is so)
+ *
+ * 2) Are we doing an append? We can trivially break this up
+ * into two types of appends: simple record append, or a
+ * rotate inside the tail leaf.
+ */
+ ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+
+ /*
+ * The insert code isn't quite ready to deal with all cases of
+ * left contiguousness. Specifically, if it's an insert into
+ * the 1st record in a leaf, it will require the adjustment of
+ * cluster count on the last record of the path directly to it's
+ * left. For now, just catch that case and fool the layers
+ * above us. This works just fine for tree_depth == 0, which
+ * is why we allow that above.
+ */
+ if (insert->ins_contig == CONTIG_LEFT &&
+ insert->ins_contig_index == 0)
+ insert->ins_contig = CONTIG_NONE;
+
+ /*
+ * Ok, so we can simply compare against last_eb to figure out
+ * whether the path doesn't exist. This will only happen in
+ * the case that we're doing a tail append, so maybe we can
+ * take advantage of that information somehow.
+ */
+ if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
+ /*
+ * Ok, ocfs2_find_path() returned us the rightmost
+ * tree path. This might be an appending insert. There are
+ * two cases:
+ * 1) We're doing a true append at the tail:
+ * -This might even be off the end of the leaf
+ * 2) We're "appending" by rotating in the tail
+ */
+ ocfs2_figure_appending_type(insert, el, insert_rec);
+ }
+
+out:
+ ocfs2_free_path(path);
+
+ if (ret == 0)
+ *last_eb_bh = bh;
+ else
+ brelse(bh);
+ return ret;
+}
+
+/*
+ * Insert an extent into an inode btree.
+ *
+ * The caller needs to update fe->i_clusters
+ */
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ u32 cpos,
+ u64 start_blk,
+ u32 new_clusters,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int status, shift;
+ struct buffer_head *last_eb_bh = NULL;
+ struct buffer_head *bh = NULL;
+ struct ocfs2_insert_type insert = {0, };
+ struct ocfs2_extent_rec rec;
+
+ mlog(0, "add %u clusters at position %u to inode %llu\n",
+ new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+ mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
+ (OCFS2_I(inode)->ip_clusters != cpos),
+ "Device %s, asking for sparse allocation: inode %llu, "
+ "cpos %u, clusters %u\n",
+ osb->dev_str,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
+ OCFS2_I(inode)->ip_clusters);
+
+ memset(&rec, 0, sizeof(rec));
+ rec.e_cpos = cpu_to_le32(cpos);
+ rec.e_blkno = cpu_to_le64(start_blk);
+ rec.e_leaf_clusters = cpu_to_le16(new_clusters);
+
+ status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
+ &insert);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
- mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
- "tree now.\n");
+ mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
+ "Insert.contig_index: %d, Insert.free_records: %d, "
+ "Insert.tree_depth: %d\n",
+ insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
+ insert.ins_free_records, insert.ins_tree_depth);
+
+ /*
+ * Avoid growing the tree unless we're out of records and the
+ * insert type requres one.
+ */
+ if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
+ goto out_add;
shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
if (shift < 0) {
@@ -866,13 +2382,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
* and didn't find room for any more extents - we need to add
* another tree level */
if (shift) {
- /* if we hit a leaf, we'd better be empty :) */
- BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
- le16_to_cpu(el->l_count));
BUG_ON(bh);
- mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
- "(current = %u)\n",
- le16_to_cpu(fe->id2.i_list.l_tree_depth));
+ mlog(0, "need to shift tree depth "
+ "(current = %d)\n", insert.ins_tree_depth);
/* ocfs2_shift_tree_depth will return us a buffer with
* the new extent block (so we can pass that to
@@ -883,15 +2395,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
mlog_errno(status);
goto bail;
}
+ insert.ins_tree_depth++;
/* Special case: we have room now if we shifted from
* tree_depth 0 */
- if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
+ if (insert.ins_tree_depth == 1)
goto out_add;
}
/* call ocfs2_add_branch to add the final part of the tree with
* the new data. */
- mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
+ mlog(0, "add branch. bh = %p\n", bh);
status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
meta_ac);
if (status < 0) {
@@ -900,11 +2413,12 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
}
out_add:
- /* Finally, we can add clusters. */
- status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
- start_blk, new_clusters);
+ /* Finally, we can add clusters. This might rotate the tree for us. */
+ status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
if (status < 0)
mlog_errno(status);
+ else
+ ocfs2_extent_map_insert_rec(inode, &rec);
bail:
if (bh)
@@ -1447,168 +2961,389 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
* block will be deleted, and if it will, what the new last extent
* block will be so we can update his h_next_leaf_blk field, as well
* as the dinodes i_last_eb_blk */
-static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
- struct inode *inode,
- struct ocfs2_dinode *fe,
- u32 new_i_clusters,
- struct buffer_head *old_last_eb,
+static int ocfs2_find_new_last_ext_blk(struct inode *inode,
+ unsigned int clusters_to_del,
+ struct ocfs2_path *path,
struct buffer_head **new_last_eb)
{
- int i, status = 0;
- u64 block = 0;
+ int next_free, ret = 0;
+ u32 cpos;
+ struct ocfs2_extent_rec *rec;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct buffer_head *bh = NULL;
*new_last_eb = NULL;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
- status = -EIO;
- goto bail;
- }
-
/* we have no tree, so of course, no last_eb. */
- if (!fe->id2.i_list.l_tree_depth)
- goto bail;
+ if (!path->p_tree_depth)
+ goto out;
/* trunc to zero special case - this makes tree_depth = 0
* regardless of what it is. */
- if (!new_i_clusters)
- goto bail;
+ if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
+ goto out;
- eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
- el = &(eb->h_list);
+ el = path_leaf_el(path);
BUG_ON(!el->l_next_free_rec);
- /* Make sure that this guy will actually be empty after we
- * clear away the data. */
- if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
- goto bail;
+ /*
+ * Make sure that this extent list will actually be empty
+ * after we clear away the data. We can shortcut out if
+ * there's more than one non-empty extent in the
+ * list. Otherwise, a check of the remaining extent is
+ * necessary.
+ */
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ rec = NULL;
+ if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+ if (next_free > 2)
+ goto out;
- /* Ok, at this point, we know that last_eb will definitely
- * change, so lets traverse the tree and find the second to
- * last extent block. */
- el = &(fe->id2.i_list);
- /* go down the tree, */
- do {
- for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
- if (le32_to_cpu(el->l_recs[i].e_cpos) <
- new_i_clusters) {
- block = le64_to_cpu(el->l_recs[i].e_blkno);
- break;
- }
+ /* We may have a valid extent in index 1, check it. */
+ if (next_free == 2)
+ rec = &el->l_recs[1];
+
+ /*
+ * Fall through - no more nonempty extents, so we want
+ * to delete this leaf.
+ */
+ } else {
+ if (next_free > 1)
+ goto out;
+
+ rec = &el->l_recs[0];
+ }
+
+ if (rec) {
+ /*
+ * Check it we'll only be trimming off the end of this
+ * cluster.
+ */
+ if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
+ goto out;
+ }
+
+ ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) bh->b_data;
+ el = &eb->h_list;
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+ ret = -EROFS;
+ goto out;
+ }
+
+ *new_last_eb = bh;
+ get_bh(*new_last_eb);
+ mlog(0, "returning block %llu, (cpos: %u)\n",
+ (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
+out:
+ brelse(bh);
+
+ return ret;
+}
+
+/*
+ * Trim some clusters off the rightmost edge of a tree. Only called
+ * during truncate.
+ *
+ * The caller needs to:
+ * - start journaling of each path component.
+ * - compute and fully set up any new last ext block
+ */
+static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
+ handle_t *handle, struct ocfs2_truncate_context *tc,
+ u32 clusters_to_del, u64 *delete_start)
+{
+ int ret, i, index = path->p_tree_depth;
+ u32 new_edge = 0;
+ u64 deleted_eb = 0;
+ struct buffer_head *bh;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec;
+
+ *delete_start = 0;
+
+ while (index >= 0) {
+ bh = path->p_node[index].bh;
+ el = path->p_node[index].el;
+
+ mlog(0, "traveling tree (index = %d, block = %llu)\n",
+ index, (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+
+ if (index !=
+ (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has invalid ext. block %llu",
+ inode->i_ino,
+ (unsigned long long)bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
}
- BUG_ON(i < 0);
- if (bh) {
- brelse(bh);
- bh = NULL;
+find_tail_record:
+ i = le16_to_cpu(el->l_next_free_rec) - 1;
+ rec = &el->l_recs[i];
+
+ mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
+ "next = %u\n", i, le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec),
+ (unsigned long long)le64_to_cpu(rec->e_blkno),
+ le16_to_cpu(el->l_next_free_rec));
+
+ BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
+
+ if (le16_to_cpu(el->l_tree_depth) == 0) {
+ /*
+ * If the leaf block contains a single empty
+ * extent and no records, we can just remove
+ * the block.
+ */
+ if (i == 0 && ocfs2_is_empty_extent(rec)) {
+ memset(rec, 0,
+ sizeof(struct ocfs2_extent_rec));
+ el->l_next_free_rec = cpu_to_le16(0);
+
+ goto delete;
+ }
+
+ /*
+ * Remove any empty extents by shifting things
+ * left. That should make life much easier on
+ * the code below. This condition is rare
+ * enough that we shouldn't see a performance
+ * hit.
+ */
+ if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+ le16_add_cpu(&el->l_next_free_rec, -1);
+
+ for(i = 0;
+ i < le16_to_cpu(el->l_next_free_rec); i++)
+ el->l_recs[i] = el->l_recs[i + 1];
+
+ memset(&el->l_recs[i], 0,
+ sizeof(struct ocfs2_extent_rec));
+
+ /*
+ * We've modified our extent list. The
+ * simplest way to handle this change
+ * is to being the search from the
+ * start again.
+ */
+ goto find_tail_record;
+ }
+
+ le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
+
+ /*
+ * We'll use "new_edge" on our way back up the
+ * tree to know what our rightmost cpos is.
+ */
+ new_edge = le16_to_cpu(rec->e_leaf_clusters);
+ new_edge += le32_to_cpu(rec->e_cpos);
+
+ /*
+ * The caller will use this to delete data blocks.
+ */
+ *delete_start = le64_to_cpu(rec->e_blkno)
+ + ocfs2_clusters_to_blocks(inode->i_sb,
+ le16_to_cpu(rec->e_leaf_clusters));
+
+ /*
+ * If it's now empty, remove this record.
+ */
+ if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
+ memset(rec, 0,
+ sizeof(struct ocfs2_extent_rec));
+ le16_add_cpu(&el->l_next_free_rec, -1);
+ }
+ } else {
+ if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
+ memset(rec, 0,
+ sizeof(struct ocfs2_extent_rec));
+ le16_add_cpu(&el->l_next_free_rec, -1);
+
+ goto delete;
+ }
+
+ /* Can this actually happen? */
+ if (le16_to_cpu(el->l_next_free_rec) == 0)
+ goto delete;
+
+ /*
+ * We never actually deleted any clusters
+ * because our leaf was empty. There's no
+ * reason to adjust the rightmost edge then.
+ */
+ if (new_edge == 0)
+ goto delete;
+
+ rec->e_int_clusters = cpu_to_le32(new_edge);
+ le32_add_cpu(&rec->e_int_clusters,
+ -le32_to_cpu(rec->e_cpos));
+
+ /*
+ * A deleted child record should have been
+ * caught above.
+ */
+ BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
}
- status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
- inode);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+delete:
+ ret = ocfs2_journal_dirty(handle, bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
}
- eb = (struct ocfs2_extent_block *) bh->b_data;
- el = &eb->h_list;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
+
+ mlog(0, "extent list container %llu, after: record %d: "
+ "(%u, %u, %llu), next = %u.\n",
+ (unsigned long long)bh->b_blocknr, i,
+ le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
+ (unsigned long long)le64_to_cpu(rec->e_blkno),
+ le16_to_cpu(el->l_next_free_rec));
+
+ /*
+ * We must be careful to only attempt delete of an
+ * extent block (and not the root inode block).
+ */
+ if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
+ struct ocfs2_extent_block *eb =
+ (struct ocfs2_extent_block *)bh->b_data;
+
+ /*
+ * Save this for use when processing the
+ * parent block.
+ */
+ deleted_eb = le64_to_cpu(eb->h_blkno);
+
+ mlog(0, "deleting this extent block.\n");
+
+ ocfs2_remove_from_cache(inode, bh);
+
+ BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
+ BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
+ BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
+
+ if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
+ /*
+ * This code only understands how to
+ * lock the suballocator in slot 0,
+ * which is fine because allocation is
+ * only ever done out of that
+ * suballocator too. A future version
+ * might change that however, so avoid
+ * a free if we don't know how to
+ * handle it. This way an fs incompat
+ * bit will not be necessary.
+ */
+ ret = ocfs2_free_extent_block(handle,
+ tc->tc_ext_alloc_inode,
+ tc->tc_ext_alloc_bh,
+ eb);
+
+ /* An error here is not fatal. */
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+ } else {
+ deleted_eb = 0;
}
- } while (el->l_tree_depth);
- *new_last_eb = bh;
- get_bh(*new_last_eb);
- mlog(0, "returning block %llu\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno));
-bail:
- if (bh)
- brelse(bh);
+ index--;
+ }
- return status;
+ ret = 0;
+out:
+ return ret;
}
static int ocfs2_do_truncate(struct ocfs2_super *osb,
unsigned int clusters_to_del,
struct inode *inode,
struct buffer_head *fe_bh,
- struct buffer_head *old_last_eb_bh,
handle_t *handle,
- struct ocfs2_truncate_context *tc)
+ struct ocfs2_truncate_context *tc,
+ struct ocfs2_path *path)
{
- int status, i, depth;
+ int status;
struct ocfs2_dinode *fe;
- struct ocfs2_extent_block *eb;
struct ocfs2_extent_block *last_eb = NULL;
struct ocfs2_extent_list *el;
- struct buffer_head *eb_bh = NULL;
struct buffer_head *last_eb_bh = NULL;
- u64 next_eb = 0;
u64 delete_blk = 0;
fe = (struct ocfs2_dinode *) fe_bh->b_data;
- status = ocfs2_find_new_last_ext_blk(osb,
- inode,
- fe,
- le32_to_cpu(fe->i_clusters) -
- clusters_to_del,
- old_last_eb_bh,
- &last_eb_bh);
+ status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
+ path, &last_eb_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- if (last_eb_bh)
- last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ /*
+ * Each component will be touched, so we might as well journal
+ * here to avoid having to handle errors later.
+ */
+ status = ocfs2_journal_access_path(inode, handle, path);
if (status < 0) {
mlog_errno(status);
goto bail;
}
+
+ if (last_eb_bh) {
+ status = ocfs2_journal_access(handle, inode, last_eb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+ }
+
el = &(fe->id2.i_list);
+ /*
+ * Lower levels depend on this never happening, but it's best
+ * to check it up here before changing the tree.
+ */
+ if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has an empty extent record, depth %u\n",
+ inode->i_ino, le16_to_cpu(el->l_tree_depth));
+ status = -EROFS;
+ goto bail;
+ }
+
spin_lock(&OCFS2_I(inode)->ip_lock);
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
clusters_to_del;
spin_unlock(&OCFS2_I(inode)->ip_lock);
le32_add_cpu(&fe->i_clusters, -clusters_to_del);
- fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
- fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
-
- i = le16_to_cpu(el->l_next_free_rec) - 1;
-
- BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
- le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
- /* tree depth zero, we can just delete the clusters, otherwise
- * we need to record the offset of the next level extent block
- * as we may overwrite it. */
- if (!el->l_tree_depth)
- delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
- + ocfs2_clusters_to_blocks(osb->sb,
- le32_to_cpu(el->l_recs[i].e_clusters));
- else
- next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
- if (!el->l_recs[i].e_clusters) {
- /* if we deleted the whole extent record, then clear
- * out the other fields and update the extent
- * list. For depth > 0 trees, we've already recorded
- * the extent block in 'next_eb' */
- el->l_recs[i].e_cpos = 0;
- el->l_recs[i].e_blkno = 0;
- BUG_ON(!el->l_next_free_rec);
- le16_add_cpu(&el->l_next_free_rec, -1);
+ status = ocfs2_trim_tree(inode, path, handle, tc,
+ clusters_to_del, &delete_blk);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
}
- depth = le16_to_cpu(el->l_tree_depth);
- if (!fe->i_clusters) {
+ if (le32_to_cpu(fe->i_clusters) == 0) {
/* trunc to zero is a special case. */
el->l_tree_depth = 0;
fe->i_last_eb_blk = 0;
@@ -1625,12 +3360,6 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
/* If there will be a new last extent block, then by
* definition, there cannot be any leaves to the right of
* him. */
- status = ocfs2_journal_access(handle, inode, last_eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
last_eb->h_next_leaf_blk = 0;
status = ocfs2_journal_dirty(handle, last_eb_bh);
if (status < 0) {
@@ -1639,123 +3368,247 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
}
}
- /* if our tree depth > 0, update all the tree blocks below us. */
- while (depth) {
- mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n",
- depth, (unsigned long long)next_eb);
- status = ocfs2_read_block(osb, next_eb, &eb_bh,
- OCFS2_BH_CACHED, inode);
+ if (delete_blk) {
+ status = ocfs2_truncate_log_append(osb, handle, delete_blk,
+ clusters_to_del);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- eb = (struct ocfs2_extent_block *)eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
+ }
+ status = 0;
+bail:
+
+ mlog_exit(status);
+ return status;
+}
+
+static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ return 0;
+}
+
+static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ return ocfs2_journal_dirty_data(handle, bh);
+}
+
+static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
+ struct page **pages, int numpages,
+ u64 phys, handle_t *handle)
+{
+ int i, ret, partial = 0;
+ void *kaddr;
+ struct page *page;
+ unsigned int from, to = PAGE_CACHE_SIZE;
+ struct super_block *sb = inode->i_sb;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+ if (numpages == 0)
+ goto out;
+
+ from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
+ if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
+ /*
+ * Since 'from' has been capped to a value below page
+ * size, this calculation won't be able to overflow
+ * 'to'
+ */
+ to = ocfs2_align_bytes_to_clusters(sb, from);
+
+ /*
+ * The truncate tail in this case should never contain
+ * more than one page at maximum. The loop below also
+ * assumes this.
+ */
+ BUG_ON(numpages != 1);
+ }
+
+ for(i = 0; i < numpages; i++) {
+ page = pages[i];
+
+ BUG_ON(from > PAGE_CACHE_SIZE);
+ BUG_ON(to > PAGE_CACHE_SIZE);
+
+ ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
+ if (ret)
+ mlog_errno(ret);
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + from, 0, to - from);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ /*
+ * Need to set the buffers we zero'd into uptodate
+ * here if they aren't - ocfs2_map_page_blocks()
+ * might've skipped some
+ */
+ if (ocfs2_should_order_data(inode)) {
+ ret = walk_page_buffers(handle,
+ page_buffers(page),
+ from, to, &partial,
+ ocfs2_ordered_zero_func);
+ if (ret < 0)
+ mlog_errno(ret);
+ } else {
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, &partial,
+ ocfs2_writeback_zero_func);
+ if (ret < 0)
+ mlog_errno(ret);
}
- el = &(eb->h_list);
- status = ocfs2_journal_access(handle, inode, eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ if (!partial)
+ SetPageUptodate(page);
+
+ flush_dcache_page(page);
+
+ /*
+ * Every page after the 1st one should be completely zero'd.
+ */
+ from = 0;
+ }
+out:
+ if (pages) {
+ for (i = 0; i < numpages; i++) {
+ page = pages[i];
+ unlock_page(page);
+ mark_page_accessed(page);
+ page_cache_release(page);
}
+ }
+}
- BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
- BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
+ int *num, u64 *phys)
+{
+ int i, numpages = 0, ret = 0;
+ unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
+ unsigned int ext_flags;
+ struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long index;
+ u64 next_cluster_bytes;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+ /* Cluster boundary, so we don't need to grab any pages. */
+ if ((isize & (csize - 1)) == 0)
+ goto out;
- i = le16_to_cpu(el->l_next_free_rec) - 1;
+ ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
+ phys, NULL, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
- mlog(0, "extent block %llu, before: record %d: "
- "(%u, %u, %llu), next = %u\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno), i,
- le32_to_cpu(el->l_recs[i].e_cpos),
- le32_to_cpu(el->l_recs[i].e_clusters),
- (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
- le16_to_cpu(el->l_next_free_rec));
+ /* Tail is a hole. */
+ if (*phys == 0)
+ goto out;
- BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
- le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
-
- next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
- /* bottom-most block requires us to delete data.*/
- if (!el->l_tree_depth)
- delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
- + ocfs2_clusters_to_blocks(osb->sb,
- le32_to_cpu(el->l_recs[i].e_clusters));
- if (!el->l_recs[i].e_clusters) {
- el->l_recs[i].e_cpos = 0;
- el->l_recs[i].e_blkno = 0;
- BUG_ON(!el->l_next_free_rec);
- le16_add_cpu(&el->l_next_free_rec, -1);
- }
- mlog(0, "extent block %llu, after: record %d: "
- "(%u, %u, %llu), next = %u\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno), i,
- le32_to_cpu(el->l_recs[i].e_cpos),
- le32_to_cpu(el->l_recs[i].e_clusters),
- (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
- le16_to_cpu(el->l_next_free_rec));
+ /* Tail is marked as unwritten, we can count on write to zero
+ * in that case. */
+ if (ext_flags & OCFS2_EXT_UNWRITTEN)
+ goto out;
- status = ocfs2_journal_dirty(handle, eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
+ index = isize >> PAGE_CACHE_SHIFT;
+ do {
+ pages[numpages] = grab_cache_page(mapping, index);
+ if (!pages[numpages]) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
}
- if (!el->l_next_free_rec) {
- mlog(0, "deleting this extent block.\n");
-
- ocfs2_remove_from_cache(inode, eb_bh);
+ numpages++;
+ index++;
+ } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
- BUG_ON(el->l_recs[0].e_clusters);
- BUG_ON(el->l_recs[0].e_cpos);
- BUG_ON(el->l_recs[0].e_blkno);
- if (eb->h_suballoc_slot == 0) {
- /*
- * This code only understands how to
- * lock the suballocator in slot 0,
- * which is fine because allocation is
- * only ever done out of that
- * suballocator too. A future version
- * might change that however, so avoid
- * a free if we don't know how to
- * handle it. This way an fs incompat
- * bit will not be necessary.
- */
- status = ocfs2_free_extent_block(handle,
- tc->tc_ext_alloc_inode,
- tc->tc_ext_alloc_bh,
- eb);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+out:
+ if (ret != 0) {
+ if (pages) {
+ for (i = 0; i < numpages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
}
}
}
- brelse(eb_bh);
- eb_bh = NULL;
- depth--;
+ numpages = 0;
}
- BUG_ON(!delete_blk);
- status = ocfs2_truncate_log_append(osb, handle, delete_blk,
- clusters_to_del);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ *num = numpages;
+
+ return ret;
+}
+
+/*
+ * Zero the area past i_size but still within an allocated
+ * cluster. This avoids exposing nonzero data on subsequent file
+ * extends.
+ *
+ * We need to call this before i_size is updated on the inode because
+ * otherwise block_write_full_page() will skip writeout of pages past
+ * i_size. The new_i_size parameter is passed for this reason.
+ */
+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+ u64 new_i_size)
+{
+ int ret, numpages;
+ loff_t endbyte;
+ struct page **pages = NULL;
+ u64 phys;
+
+ /*
+ * File systems which don't support sparse files zero on every
+ * extend.
+ */
+ if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ return 0;
+
+ pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
+ sizeof(struct page *), GFP_NOFS);
+ if (pages == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
}
- status = 0;
-bail:
- if (!status)
- ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
- else
- ocfs2_extent_map_drop(inode, 0);
- mlog_exit(status);
- return status;
+
+ ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (numpages == 0)
+ goto out;
+
+ ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
+ handle);
+
+ /*
+ * Initiate writeout of the pages we zero'd here. We don't
+ * wait on them - the truncate_inode_pages() call later will
+ * do that for us.
+ */
+ endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
+ ret = do_sync_mapping_range(inode->i_mapping, new_i_size,
+ endbyte - 1, SYNC_FILE_RANGE_WRITE);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ if (pages)
+ kfree(pages);
+
+ return ret;
}
/*
@@ -1770,82 +3623,90 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct ocfs2_truncate_context *tc)
{
int status, i, credits, tl_sem = 0;
- u32 clusters_to_del, target_i_clusters;
- u64 last_eb = 0;
- struct ocfs2_dinode *fe;
- struct ocfs2_extent_block *eb;
+ u32 clusters_to_del, new_highest_cpos, range;
struct ocfs2_extent_list *el;
- struct buffer_head *last_eb_bh;
handle_t *handle = NULL;
struct inode *tl_inode = osb->osb_tl_inode;
+ struct ocfs2_path *path = NULL;
mlog_entry_void();
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
+ new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
i_size_read(inode));
- last_eb_bh = tc->tc_last_eb_bh;
- tc->tc_last_eb_bh = NULL;
+ path = ocfs2_new_inode_path(fe_bh);
+ if (!path) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
+ ocfs2_extent_map_trunc(inode, new_highest_cpos);
- if (fe->id2.i_list.l_tree_depth) {
- eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- el = &eb->h_list;
- } else
- el = &fe->id2.i_list;
- last_eb = le64_to_cpu(fe->i_last_eb_blk);
start:
- mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
- "last_eb = %llu, fe->i_last_eb_blk = %llu, "
- "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
- le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb,
- (unsigned long long)le64_to_cpu(fe->i_last_eb_blk),
- le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
-
- if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
- mlog(0, "last_eb changed!\n");
- BUG_ON(!fe->id2.i_list.l_tree_depth);
- last_eb = le64_to_cpu(fe->i_last_eb_blk);
- /* i_last_eb_blk may have changed, read it if
- * necessary. We don't have to worry about the
- * truncate to zero case here (where there becomes no
- * last_eb) because we never loop back after our work
- * is done. */
- if (last_eb_bh) {
- brelse(last_eb_bh);
- last_eb_bh = NULL;
- }
+ /*
+ * Check that we still have allocation to delete.
+ */
+ if (OCFS2_I(inode)->ip_clusters == 0) {
+ status = 0;
+ goto bail;
+ }
- status = ocfs2_read_block(osb, last_eb,
- &last_eb_bh, OCFS2_BH_CACHED,
- inode);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
- }
- el = &(eb->h_list);
+ /*
+ * Truncate always works against the rightmost tree branch.
+ */
+ status = ocfs2_find_path(inode, path, UINT_MAX);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
+ OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
+
+ /*
+ * By now, el will point to the extent list on the bottom most
+ * portion of this tree. Only the tail record is considered in
+ * each pass.
+ *
+ * We handle the following cases, in order:
+ * - empty extent: delete the remaining branch
+ * - remove the entire record
+ * - remove a partial record
+ * - no record needs to be removed (truncate has completed)
+ */
+ el = path_leaf_el(path);
+ if (le16_to_cpu(el->l_next_free_rec) == 0) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has empty extent block at %llu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)path_leaf_bh(path)->b_blocknr);
+ status = -EROFS;
+ goto bail;
}
- /* by now, el will point to the extent list on the bottom most
- * portion of this tree. */
i = le16_to_cpu(el->l_next_free_rec) - 1;
- if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
- clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
- else
- clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
+ range = le32_to_cpu(el->l_recs[i].e_cpos) +
+ ocfs2_rec_clusters(el, &el->l_recs[i]);
+ if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
+ clusters_to_del = 0;
+ } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
+ clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
+ } else if (range > new_highest_cpos) {
+ clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
le32_to_cpu(el->l_recs[i].e_cpos)) -
- target_i_clusters;
+ new_highest_cpos;
+ } else {
+ status = 0;
+ goto bail;
+ }
- mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
+ mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
+ clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
+
+ BUG_ON(clusters_to_del == 0);
mutex_lock(&tl_inode->i_mutex);
tl_sem = 1;
@@ -1861,7 +3722,8 @@ start:
}
credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
- fe, el);
+ (struct ocfs2_dinode *)fe_bh->b_data,
+ el);
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
@@ -1870,13 +3732,8 @@ start:
goto bail;
}
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
- status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
- if (status < 0)
- mlog_errno(status);
-
- status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
- last_eb_bh, handle, tc);
+ status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
+ tc, path);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1888,9 +3745,14 @@ start:
ocfs2_commit_trans(osb, handle);
handle = NULL;
- BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
- if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
- goto start;
+ ocfs2_reinit_path(path, 1);
+
+ /*
+ * The check above will catch the case where we've truncated
+ * away all allocation.
+ */
+ goto start;
+
bail:
up_write(&OCFS2_I(inode)->ip_alloc_sem);
@@ -1902,8 +3764,7 @@ bail:
if (handle)
ocfs2_commit_trans(osb, handle);
- if (last_eb_bh)
- brelse(last_eb_bh);
+ ocfs2_free_path(path);
/* This will drop the ext_alloc cluster lock for us */
ocfs2_free_truncate_context(tc);
@@ -1912,7 +3773,6 @@ bail:
return status;
}
-
/*
* Expects the inode to already be locked. This will figure out which
* inodes need to be locked and will put them on the returned truncate
@@ -1923,7 +3783,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
struct buffer_head *fe_bh,
struct ocfs2_truncate_context **tc)
{
- int status, metadata_delete;
+ int status, metadata_delete, i;
unsigned int new_i_clusters;
struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
@@ -1944,21 +3804,6 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
"%llu\n", fe->i_clusters, new_i_clusters,
(unsigned long long)fe->i_size);
- if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
- ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
- "%u and size %llu whereas struct inode has "
- "cluster count %u and size %llu which caused an "
- "invalid truncate to %u clusters.",
- (unsigned long long)le64_to_cpu(fe->i_blkno),
- le32_to_cpu(fe->i_clusters),
- (unsigned long long)le64_to_cpu(fe->i_size),
- OCFS2_I(inode)->ip_clusters, i_size_read(inode),
- new_i_clusters);
- mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
- status = -EIO;
- goto bail;
- }
-
*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
if (!(*tc)) {
status = -ENOMEM;
@@ -1986,7 +3831,15 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
goto bail;
}
el = &(eb->h_list);
- if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
+
+ i = 0;
+ if (ocfs2_is_empty_extent(&el->l_recs[0]))
+ i = 1;
+ /*
+ * XXX: Should we check that next_free_rec contains
+ * the extent?
+ */
+ if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
metadata_delete = 1;
}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 0b82e8044325..fbcb5934a081 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -31,7 +31,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
struct buffer_head *fe_bh,
- u64 blkno,
+ u32 cpos,
+ u64 start_blk,
u32 new_clusters,
struct ocfs2_alloc_context *meta_ac);
int ocfs2_num_free_extents(struct ocfs2_super *osb,
@@ -70,6 +71,8 @@ struct ocfs2_truncate_context {
struct buffer_head *tc_last_eb_bh;
};
+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+ u64 new_i_size);
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
@@ -79,4 +82,26 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct buffer_head *fe_bh,
struct ocfs2_truncate_context *tc);
+int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
+ u32 cpos, struct buffer_head **leaf_bh);
+
+/*
+ * Helper function to look at the # of clusters in an extent record.
+ */
+static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
+ struct ocfs2_extent_rec *rec)
+{
+ /*
+ * Cluster count in extent records is slightly different
+ * between interior nodes and leaf nodes. This is to support
+ * unwritten extents which need a flags field in leaf node
+ * records, thus shrinking the available space for a clusters
+ * field.
+ */
+ if (el->l_tree_depth)
+ return le32_to_cpu(rec->e_int_clusters);
+ else
+ return le16_to_cpu(rec->e_leaf_clusters);
+}
+
#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 875c11443817..56963e6c46c0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -24,6 +24,8 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <asm/byteorder.h>
+#include <linux/swap.h>
+#include <linux/pipe_fs_i.h>
#define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h>
@@ -37,6 +39,7 @@
#include "file.h"
#include "inode.h"
#include "journal.h"
+#include "suballoc.h"
#include "super.h"
#include "symlink.h"
@@ -134,7 +137,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
int err = 0;
+ unsigned int ext_flags;
u64 p_blkno, past_eof;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
(unsigned long long)iblock, bh_result, create);
@@ -149,17 +154,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
goto bail;
}
- /* this can happen if another node truncs after our extend! */
- spin_lock(&OCFS2_I(inode)->ip_lock);
- if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
- OCFS2_I(inode)->ip_clusters))
- err = -EIO;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- if (err)
- goto bail;
-
- err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
- NULL);
+ err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
+ &ext_flags);
if (err) {
mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
"%llu, NULL)\n", err, inode, (unsigned long long)iblock,
@@ -167,22 +163,39 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
goto bail;
}
- map_bh(bh_result, inode->i_sb, p_blkno);
-
- if (bh_result->b_blocknr == 0) {
- err = -EIO;
- mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
- (unsigned long long)iblock,
- (unsigned long long)p_blkno,
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- }
+ /*
+ * ocfs2 never allocates in this function - the only time we
+ * need to use BH_New is when we're extending i_size on a file
+ * system which doesn't support holes, in which case BH_New
+ * allows block_prepare_write() to zero.
+ */
+ mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
+ "ino %lu, iblock %llu\n", inode->i_ino,
+ (unsigned long long)iblock);
+
+ /* Treat the unwritten extent as a hole for zeroing purposes. */
+ if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
+ map_bh(bh_result, inode->i_sb, p_blkno);
+
+ if (!ocfs2_sparse_alloc(osb)) {
+ if (p_blkno == 0) {
+ err = -EIO;
+ mlog(ML_ERROR,
+ "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
+ (unsigned long long)iblock,
+ (unsigned long long)p_blkno,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
+ dump_stack();
+ }
- past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
- (unsigned long long)past_eof);
+ past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+ mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
+ (unsigned long long)past_eof);
- if (create && (iblock >= past_eof))
- set_buffer_new(bh_result);
+ if (create && (iblock >= past_eof))
+ set_buffer_new(bh_result);
+ }
bail:
if (err < 0)
@@ -276,8 +289,11 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
return ret;
}
-/* This can also be called from ocfs2_write_zero_page() which has done
- * it's own cluster locking. */
+/*
+ * This is called from ocfs2_write_zero_page() which has handled it's
+ * own cluster locking and has ensured allocation exists for those
+ * blocks to be written.
+ */
int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
unsigned from, unsigned to)
{
@@ -292,44 +308,17 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
return ret;
}
-/*
- * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
- * from loopback. It must be able to perform its own locking around
- * ocfs2_get_block().
- */
-static int ocfs2_prepare_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
-{
- struct inode *inode = page->mapping->host;
- int ret;
-
- mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
-
- ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
- if (ret != 0) {
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_prepare_write_nolock(inode, page, from, to);
-
- ocfs2_meta_unlock(inode, 0);
-out:
- mlog_exit(ret);
- return ret;
-}
-
/* Taken from ext3. We don't necessarily need the full blown
* functionality yet, but IMHO it's better to cut and paste the whole
* thing so we can avoid introducing our own bugs (and easily pick up
* their fixes when they happen) --Mark */
-static int walk_page_buffers( handle_t *handle,
- struct buffer_head *head,
- unsigned from,
- unsigned to,
- int *partial,
- int (*fn)( handle_t *handle,
- struct buffer_head *bh))
+int walk_page_buffers( handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)( handle_t *handle,
+ struct buffer_head *bh))
{
struct buffer_head *bh;
unsigned block_start, block_end;
@@ -388,95 +377,6 @@ out:
return handle;
}
-static int ocfs2_commit_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
-{
- int ret;
- struct buffer_head *di_bh = NULL;
- struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
- struct ocfs2_dinode *di;
-
- mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
-
- /* NOTE: ocfs2_file_aio_write has ensured that it's safe for
- * us to continue here without rechecking the I/O against
- * changed inode values.
- *
- * 1) We're currently holding the inode alloc lock, so no
- * nodes can change it underneath us.
- *
- * 2) We've had to take the metadata lock at least once
- * already to check for extending writes, suid removal, etc.
- * The meta data update code then ensures that we don't get a
- * stale inode allocation image (i_size, i_clusters, etc).
- */
-
- ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page);
- if (ret != 0) {
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_data_lock_with_page(inode, 1, page);
- if (ret != 0) {
- mlog_errno(ret);
- goto out_unlock_meta;
- }
-
- handle = ocfs2_start_walk_page_trans(inode, page, from, to);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_unlock_data;
- }
-
- /* Mark our buffer early. We'd rather catch this error up here
- * as opposed to after a successful commit_write which would
- * require us to set back inode->i_size. */
- ret = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- /* might update i_size */
- ret = generic_commit_write(file, page, from, to);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- di = (struct ocfs2_dinode *)di_bh->b_data;
-
- /* ocfs2_mark_inode_dirty() is too heavy to use here. */
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
- di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-
- inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
- di->i_size = cpu_to_le64((u64)i_size_read(inode));
-
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_commit;
- }
-
-out_commit:
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
-out_unlock_data:
- ocfs2_data_unlock(inode, 1);
-out_unlock_meta:
- ocfs2_meta_unlock(inode, 1);
-out:
- if (di_bh)
- brelse(di_bh);
-
- mlog_exit(ret);
- return ret;
-}
-
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{
sector_t status;
@@ -499,8 +399,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
down_read(&OCFS2_I(inode)->ip_alloc_sem);
}
- err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
- NULL);
+ err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL);
if (!INODE_JOURNAL(inode)) {
up_read(&OCFS2_I(inode)->ip_alloc_sem);
@@ -540,8 +439,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
int ret;
- u64 p_blkno, inode_blocks;
- int contig_blocks;
+ u64 p_blkno, inode_blocks, contig_blocks;
+ unsigned int ext_flags;
unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
@@ -549,33 +448,20 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
* nicely aligned and of the right size, so there's no need
* for us to check any of that. */
- spin_lock(&OCFS2_I(inode)->ip_lock);
- inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb,
- OCFS2_I(inode)->ip_clusters);
-
- /*
- * For a read which begins past the end of file, we return a hole.
- */
- if (!create && (iblock >= inode_blocks)) {
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- ret = 0;
- goto bail;
- }
+ inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
/*
* Any write past EOF is not allowed because we'd be extending.
*/
if (create && (iblock + max_blocks) > inode_blocks) {
- spin_unlock(&OCFS2_I(inode)->ip_lock);
ret = -EIO;
goto bail;
}
- spin_unlock(&OCFS2_I(inode)->ip_lock);
/* This figures out the size of the next contiguous block, and
* our logical offset */
- ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
- &contig_blocks);
+ ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
+ &contig_blocks, &ext_flags);
if (ret) {
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
(unsigned long long)iblock);
@@ -583,7 +469,37 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
goto bail;
}
- map_bh(bh_result, inode->i_sb, p_blkno);
+ if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has a hole at block %llu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)iblock);
+ ret = -EROFS;
+ goto bail;
+ }
+
+ /*
+ * get_more_blocks() expects us to describe a hole by clearing
+ * the mapped bit on bh_result().
+ *
+ * Consider an unwritten extent as a hole.
+ */
+ if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
+ map_bh(bh_result, inode->i_sb, p_blkno);
+ else {
+ /*
+ * ocfs2_prepare_inode_for_write() should have caught
+ * the case where we'd be filling a hole and triggered
+ * a buffered write instead.
+ */
+ if (create) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ clear_buffer_mapped(bh_result);
+ }
/* make sure we don't map more than max_blocks blocks here as
that's all the kernel will handle at this point. */
@@ -606,12 +522,17 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
void *private)
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+ int level;
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+
ocfs2_iocb_clear_rw_locked(iocb);
- up_read(&inode->i_alloc_sem);
- ocfs2_rw_unlock(inode, 0);
+
+ level = ocfs2_iocb_rw_locked_level(iocb);
+ if (!level)
+ up_read(&inode->i_alloc_sem);
+ ocfs2_rw_unlock(inode, level);
}
/*
@@ -647,23 +568,27 @@ static ssize_t ocfs2_direct_IO(int rw,
mlog_entry_void();
- /*
- * We get PR data locks even for O_DIRECT. This allows
- * concurrent O_DIRECT I/O but doesn't let O_DIRECT with
- * extending and buffered zeroing writes race. If they did
- * race then the buffered zeroing could be written back after
- * the O_DIRECT I/O. It's one thing to tell people not to mix
- * buffered and O_DIRECT writes, but expecting them to
- * understand that file extension is also an implicit buffered
- * write is too much. By getting the PR we force writeback of
- * the buffered zeroing before proceeding.
- */
- ret = ocfs2_data_lock(inode, 0);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
+ if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+ /*
+ * We get PR data locks even for O_DIRECT. This
+ * allows concurrent O_DIRECT I/O but doesn't let
+ * O_DIRECT with extending and buffered zeroing writes
+ * race. If they did race then the buffered zeroing
+ * could be written back after the O_DIRECT I/O. It's
+ * one thing to tell people not to mix buffered and
+ * O_DIRECT writes, but expecting them to understand
+ * that file extension is also an implicit buffered
+ * write is too much. By getting the PR we force
+ * writeback of the buffered zeroing before
+ * proceeding.
+ */
+ ret = ocfs2_data_lock(inode, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ ocfs2_data_unlock(inode, 0);
}
- ocfs2_data_unlock(inode, 0);
ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
inode->i_sb->s_bdev, iov, offset,
@@ -675,11 +600,715 @@ out:
return ret;
}
+static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
+ u32 cpos,
+ unsigned int *start,
+ unsigned int *end)
+{
+ unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
+
+ if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
+ unsigned int cpp;
+
+ cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
+
+ cluster_start = cpos % cpp;
+ cluster_start = cluster_start << osb->s_clustersize_bits;
+
+ cluster_end = cluster_start + osb->s_clustersize;
+ }
+
+ BUG_ON(cluster_start > PAGE_SIZE);
+ BUG_ON(cluster_end > PAGE_SIZE);
+
+ if (start)
+ *start = cluster_start;
+ if (end)
+ *end = cluster_end;
+}
+
+/*
+ * 'from' and 'to' are the region in the page to avoid zeroing.
+ *
+ * If pagesize > clustersize, this function will avoid zeroing outside
+ * of the cluster boundary.
+ *
+ * from == to == 0 is code for "zero the entire cluster region"
+ */
+static void ocfs2_clear_page_regions(struct page *page,
+ struct ocfs2_super *osb, u32 cpos,
+ unsigned from, unsigned to)
+{
+ void *kaddr;
+ unsigned int cluster_start, cluster_end;
+
+ ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
+
+ kaddr = kmap_atomic(page, KM_USER0);
+
+ if (from || to) {
+ if (from > cluster_start)
+ memset(kaddr + cluster_start, 0, from - cluster_start);
+ if (to < cluster_end)
+ memset(kaddr + to, 0, cluster_end - to);
+ } else {
+ memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
+ }
+
+ kunmap_atomic(kaddr, KM_USER0);
+}
+
+/*
+ * Some of this taken from block_prepare_write(). We already have our
+ * mapping by now though, and the entire write will be allocating or
+ * it won't, so not much need to use BH_New.
+ *
+ * This will also skip zeroing, which is handled externally.
+ */
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+ struct inode *inode, unsigned int from,
+ unsigned int to, int new)
+{
+ int ret = 0;
+ struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
+ unsigned int block_end, block_start;
+ unsigned int bsize = 1 << inode->i_blkbits;
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, bsize, 0);
+
+ head = page_buffers(page);
+ for (bh = head, block_start = 0; bh != head || !block_start;
+ bh = bh->b_this_page, block_start += bsize) {
+ block_end = block_start + bsize;
+
+ /*
+ * Ignore blocks outside of our i/o range -
+ * they may belong to unallocated clusters.
+ */
+ if (block_start >= to || block_end <= from) {
+ if (PageUptodate(page))
+ set_buffer_uptodate(bh);
+ continue;
+ }
+
+ /*
+ * For an allocating write with cluster size >= page
+ * size, we always write the entire page.
+ */
+
+ if (buffer_new(bh))
+ clear_buffer_new(bh);
+
+ if (!buffer_mapped(bh)) {
+ map_bh(bh, inode->i_sb, *p_blkno);
+ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ }
+
+ if (PageUptodate(page)) {
+ if (!buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+ } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
+ (block_start < from || block_end > to)) {
+ ll_rw_block(READ, 1, &bh);
+ *wait_bh++=bh;
+ }
+
+ *p_blkno = *p_blkno + 1;
+ }
+
+ /*
+ * If we issued read requests - let them complete.
+ */
+ while(wait_bh > wait) {
+ wait_on_buffer(*--wait_bh);
+ if (!buffer_uptodate(*wait_bh))
+ ret = -EIO;
+ }
+
+ if (ret == 0 || !new)
+ return ret;
+
+ /*
+ * If we get -EIO above, zero out any newly allocated blocks
+ * to avoid exposing stale data.
+ */
+ bh = head;
+ block_start = 0;
+ do {
+ void *kaddr;
+
+ block_end = block_start + bsize;
+ if (block_end <= from)
+ goto next_bh;
+ if (block_start >= to)
+ break;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr+block_start, 0, bh->b_size);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+
+next_bh:
+ block_start = block_end;
+ bh = bh->b_this_page;
+ } while (bh != head);
+
+ return ret;
+}
+
+/*
+ * This will copy user data from the buffer page in the splice
+ * context.
+ *
+ * For now, we ignore SPLICE_F_MOVE as that would require some extra
+ * communication out all the way to ocfs2_write().
+ */
+int ocfs2_map_and_write_splice_data(struct inode *inode,
+ struct ocfs2_write_ctxt *wc, u64 *p_blkno,
+ unsigned int *ret_from, unsigned int *ret_to)
+{
+ int ret;
+ unsigned int to, from, cluster_start, cluster_end;
+ char *src, *dst;
+ struct ocfs2_splice_write_priv *sp = wc->w_private;
+ struct pipe_buffer *buf = sp->s_buf;
+ unsigned long bytes, src_from;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
+ &cluster_end);
+
+ from = sp->s_offset;
+ src_from = sp->s_buf_offset;
+ bytes = wc->w_count;
+
+ if (wc->w_large_pages) {
+ /*
+ * For cluster size < page size, we have to
+ * calculate pos within the cluster and obey
+ * the rightmost boundary.
+ */
+ bytes = min(bytes, (unsigned long)(osb->s_clustersize
+ - (wc->w_pos & (osb->s_clustersize - 1))));
+ }
+ to = from + bytes;
+
+ if (wc->w_this_page_new)
+ ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+ cluster_start, cluster_end, 1);
+ else
+ ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+ from, to, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(from > PAGE_CACHE_SIZE);
+ BUG_ON(to > PAGE_CACHE_SIZE);
+ BUG_ON(from > osb->s_clustersize);
+ BUG_ON(to > osb->s_clustersize);
+
+ src = buf->ops->map(sp->s_pipe, buf, 1);
+ dst = kmap_atomic(wc->w_this_page, KM_USER1);
+ memcpy(dst + from, src + src_from, bytes);
+ kunmap_atomic(wc->w_this_page, KM_USER1);
+ buf->ops->unmap(sp->s_pipe, buf, src);
+
+ wc->w_finished_copy = 1;
+
+ *ret_from = from;
+ *ret_to = to;
+out:
+
+ return bytes ? (unsigned int)bytes : ret;
+}
+
+/*
+ * This will copy user data from the iovec in the buffered write
+ * context.
+ */
+int ocfs2_map_and_write_user_data(struct inode *inode,
+ struct ocfs2_write_ctxt *wc, u64 *p_blkno,
+ unsigned int *ret_from, unsigned int *ret_to)
+{
+ int ret;
+ unsigned int to, from, cluster_start, cluster_end;
+ unsigned long bytes, src_from;
+ char *dst;
+ struct ocfs2_buffered_write_priv *bp = wc->w_private;
+ const struct iovec *cur_iov = bp->b_cur_iov;
+ char __user *buf;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
+ &cluster_end);
+
+ buf = cur_iov->iov_base + bp->b_cur_off;
+ src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
+
+ from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
+
+ /*
+ * This is a lot of comparisons, but it reads quite
+ * easily, which is important here.
+ */
+ /* Stay within the src page */
+ bytes = PAGE_SIZE - src_from;
+ /* Stay within the vector */
+ bytes = min(bytes,
+ (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
+ /* Stay within count */
+ bytes = min(bytes, (unsigned long)wc->w_count);
+ /*
+ * For clustersize > page size, just stay within
+ * target page, otherwise we have to calculate pos
+ * within the cluster and obey the rightmost
+ * boundary.
+ */
+ if (wc->w_large_pages) {
+ /*
+ * For cluster size < page size, we have to
+ * calculate pos within the cluster and obey
+ * the rightmost boundary.
+ */
+ bytes = min(bytes, (unsigned long)(osb->s_clustersize
+ - (wc->w_pos & (osb->s_clustersize - 1))));
+ } else {
+ /*
+ * cluster size > page size is the most common
+ * case - we just stay within the target page
+ * boundary.
+ */
+ bytes = min(bytes, PAGE_CACHE_SIZE - from);
+ }
+
+ to = from + bytes;
+
+ if (wc->w_this_page_new)
+ ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+ cluster_start, cluster_end, 1);
+ else
+ ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+ from, to, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(from > PAGE_CACHE_SIZE);
+ BUG_ON(to > PAGE_CACHE_SIZE);
+ BUG_ON(from > osb->s_clustersize);
+ BUG_ON(to > osb->s_clustersize);
+
+ dst = kmap(wc->w_this_page);
+ memcpy(dst + from, bp->b_src_buf + src_from, bytes);
+ kunmap(wc->w_this_page);
+
+ /*
+ * XXX: This is slow, but simple. The caller of
+ * ocfs2_buffered_write_cluster() is responsible for
+ * passing through the iovecs, so it's difficult to
+ * predict what our next step is in here after our
+ * initial write. A future version should be pushing
+ * that iovec manipulation further down.
+ *
+ * By setting this, we indicate that a copy from user
+ * data was done, and subsequent calls for this
+ * cluster will skip copying more data.
+ */
+ wc->w_finished_copy = 1;
+
+ *ret_from = from;
+ *ret_to = to;
+out:
+
+ return bytes ? (unsigned int)bytes : ret;
+}
+
+/*
+ * Map, fill and write a page to disk.
+ *
+ * The work of copying data is done via callback. Newly allocated
+ * pages which don't take user data will be zero'd (set 'new' to
+ * indicate an allocating write)
+ *
+ * Returns a negative error code or the number of bytes copied into
+ * the page.
+ */
+int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
+ u64 *p_blkno, struct page *page,
+ struct ocfs2_write_ctxt *wc, int new)
+{
+ int ret, copied = 0;
+ unsigned int from = 0, to = 0;
+ unsigned int cluster_start, cluster_end;
+ unsigned int zero_from = 0, zero_to = 0;
+
+ ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
+ &cluster_start, &cluster_end);
+
+ if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
+ && !wc->w_finished_copy) {
+
+ wc->w_this_page = page;
+ wc->w_this_page_new = new;
+ ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ copied = ret;
+
+ zero_from = from;
+ zero_to = to;
+ if (new) {
+ from = cluster_start;
+ to = cluster_end;
+ }
+ } else {
+ /*
+ * If we haven't allocated the new page yet, we
+ * shouldn't be writing it out without copying user
+ * data. This is likely a math error from the caller.
+ */
+ BUG_ON(!new);
+
+ from = cluster_start;
+ to = cluster_end;
+
+ ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+ cluster_start, cluster_end, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /*
+ * Parts of newly allocated pages need to be zero'd.
+ *
+ * Above, we have also rewritten 'to' and 'from' - as far as
+ * the rest of the function is concerned, the entire cluster
+ * range inside of a page needs to be written.
+ *
+ * We can skip this if the page is up to date - it's already
+ * been zero'd from being read in as a hole.
+ */
+ if (new && !PageUptodate(page))
+ ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
+ wc->w_cpos, zero_from, zero_to);
+
+ flush_dcache_page(page);
+
+ if (ocfs2_should_order_data(inode)) {
+ ret = walk_page_buffers(handle,
+ page_buffers(page),
+ from, to, NULL,
+ ocfs2_journal_dirty_data);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+
+ /*
+ * We don't use generic_commit_write() because we need to
+ * handle our own i_size update.
+ */
+ ret = block_commit_write(page, from, to);
+ if (ret)
+ mlog_errno(ret);
+out:
+
+ return copied ? copied : ret;
+}
+
+/*
+ * Do the actual write of some data into an inode. Optionally allocate
+ * in order to fulfill the write.
+ *
+ * cpos is the logical cluster offset within the file to write at
+ *
+ * 'phys' is the physical mapping of that offset. a 'phys' value of
+ * zero indicates that allocation is required. In this case, data_ac
+ * and meta_ac should be valid (meta_ac can be null if metadata
+ * allocation isn't required).
+ */
+static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
+ struct buffer_head *di_bh,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_write_ctxt *wc)
+{
+ int ret, i, numpages = 1, new;
+ unsigned int copied = 0;
+ u32 tmp_pos;
+ u64 v_blkno, p_blkno;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ unsigned long index, start;
+ struct page **cpages;
+
+ new = phys == 0 ? 1 : 0;
+
+ /*
+ * Figure out how many pages we'll be manipulating here. For
+ * non allocating write, we just change the one
+ * page. Otherwise, we'll need a whole clusters worth.
+ */
+ if (new)
+ numpages = ocfs2_pages_per_cluster(inode->i_sb);
+
+ cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
+ if (!cpages) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /*
+ * Fill our page array first. That way we've grabbed enough so
+ * that we can zero and flush if we error after adding the
+ * extent.
+ */
+ if (new) {
+ start = ocfs2_align_clusters_to_page_index(inode->i_sb,
+ wc->w_cpos);
+ v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
+ } else {
+ start = wc->w_pos >> PAGE_CACHE_SHIFT;
+ v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
+ }
+
+ for(i = 0; i < numpages; i++) {
+ index = start + i;
+
+ cpages[i] = grab_cache_page(mapping, index);
+ if (!cpages[i]) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (new) {
+ /*
+ * This is safe to call with the page locks - it won't take
+ * any additional semaphores or cluster locks.
+ */
+ tmp_pos = wc->w_cpos;
+ ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
+ &tmp_pos, 1, di_bh, handle,
+ data_ac, meta_ac, NULL);
+ /*
+ * This shouldn't happen because we must have already
+ * calculated the correct meta data allocation required. The
+ * internal tree allocation code should know how to increase
+ * transaction credits itself.
+ *
+ * If need be, we could handle -EAGAIN for a
+ * RESTART_TRANS here.
+ */
+ mlog_bug_on_msg(ret == -EAGAIN,
+ "Inode %llu: EAGAIN return during allocation.\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
+ NULL);
+ if (ret < 0) {
+
+ /*
+ * XXX: Should we go readonly here?
+ */
+
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(p_blkno == 0);
+
+ for(i = 0; i < numpages; i++) {
+ ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
+ wc, new);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ copied += ret;
+ }
+
+out:
+ for(i = 0; i < numpages; i++) {
+ unlock_page(cpages[i]);
+ mark_page_accessed(cpages[i]);
+ page_cache_release(cpages[i]);
+ }
+ kfree(cpages);
+
+ return copied ? copied : ret;
+}
+
+static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
+ struct ocfs2_super *osb, loff_t pos,
+ size_t count, ocfs2_page_writer *cb,
+ void *cb_priv)
+{
+ wc->w_count = count;
+ wc->w_pos = pos;
+ wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
+ wc->w_finished_copy = 0;
+
+ if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
+ wc->w_large_pages = 1;
+ else
+ wc->w_large_pages = 0;
+
+ wc->w_write_data_page = cb;
+ wc->w_private = cb_priv;
+}
+
+/*
+ * Write a cluster to an inode. The cluster may not be allocated yet,
+ * in which case it will be. This only exists for buffered writes -
+ * O_DIRECT takes a more "traditional" path through the kernel.
+ *
+ * The caller is responsible for incrementing pos, written counts, etc
+ *
+ * For file systems that don't support sparse files, pre-allocation
+ * and page zeroing up until cpos should be done prior to this
+ * function call.
+ *
+ * Callers should be holding i_sem, and the rw cluster lock.
+ *
+ * Returns the number of user bytes written, or less than zero for
+ * error.
+ */
+ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
+ size_t count, ocfs2_page_writer *actor,
+ void *priv)
+{
+ int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
+ ssize_t written = 0;
+ u32 phys;
+ struct inode *inode = file->f_mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ handle_t *handle;
+ struct ocfs2_write_ctxt wc;
+
+ ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
+
+ ret = ocfs2_meta_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ /*
+ * Take alloc sem here to prevent concurrent lookups. That way
+ * the mapping, zeroing and tree manipulation within
+ * ocfs2_write() will be safe against ->readpage(). This
+ * should also serve to lock out allocation from a shared
+ * writeable region.
+ */
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+ ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_meta;
+ }
+
+ /* phys == 0 means that allocation is required. */
+ if (phys == 0) {
+ ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_meta;
+ }
+
+ credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
+ }
+
+ ret = ocfs2_data_lock(inode, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_meta;
+ }
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_data;
+ }
+
+ written = ocfs2_write(file, phys, handle, di_bh, data_ac,
+ meta_ac, &wc);
+ if (written < 0) {
+ ret = written;
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ pos += written;
+ if (pos > inode->i_size) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ di->i_size = cpu_to_le64((u64)i_size_read(inode));
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+ di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+
+ ret = ocfs2_journal_dirty(handle, di_bh);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out_data:
+ ocfs2_data_unlock(inode, 1);
+
+out_meta:
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ ocfs2_meta_unlock(inode, 1);
+
+out:
+ brelse(di_bh);
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ return written ? written : ret;
+}
+
const struct address_space_operations ocfs2_aops = {
.readpage = ocfs2_readpage,
.writepage = ocfs2_writepage,
- .prepare_write = ocfs2_prepare_write,
- .commit_write = ocfs2_commit_write,
.bmap = ocfs2_bmap,
.sync_page = block_sync_page,
.direct_IO = ocfs2_direct_IO,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index f446a15eab88..45821d479b5a 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -30,12 +30,83 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
unsigned from,
unsigned to);
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+ struct inode *inode, unsigned int from,
+ unsigned int to, int new);
+
+int walk_page_buffers( handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)( handle_t *handle,
+ struct buffer_head *bh));
+
+struct ocfs2_write_ctxt;
+typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
+ u64 *, unsigned int *, unsigned int *);
+
+ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
+ size_t count, ocfs2_page_writer *actor,
+ void *priv);
+
+struct ocfs2_write_ctxt {
+ size_t w_count;
+ loff_t w_pos;
+ u32 w_cpos;
+ unsigned int w_finished_copy;
+
+ /* This is true if page_size > cluster_size */
+ unsigned int w_large_pages;
+
+ /* Filler callback and private data */
+ ocfs2_page_writer *w_write_data_page;
+ void *w_private;
+
+ /* Only valid for the filler callback */
+ struct page *w_this_page;
+ unsigned int w_this_page_new;
+};
+
+struct ocfs2_buffered_write_priv {
+ char *b_src_buf;
+ const struct iovec *b_cur_iov; /* Current iovec */
+ size_t b_cur_off; /* Offset in the
+ * current iovec */
+};
+int ocfs2_map_and_write_user_data(struct inode *inode,
+ struct ocfs2_write_ctxt *wc,
+ u64 *p_blkno,
+ unsigned int *ret_from,
+ unsigned int *ret_to);
+
+struct ocfs2_splice_write_priv {
+ struct splice_desc *s_sd;
+ struct pipe_buffer *s_buf;
+ struct pipe_inode_info *s_pipe;
+ /* Neither offset value is ever larger than one page */
+ unsigned int s_offset;
+ unsigned int s_buf_offset;
+};
+int ocfs2_map_and_write_splice_data(struct inode *inode,
+ struct ocfs2_write_ctxt *wc,
+ u64 *p_blkno,
+ unsigned int *ret_from,
+ unsigned int *ret_to);
+
/* all ocfs2_dio_end_io()'s fault */
#define ocfs2_iocb_is_rw_locked(iocb) \
test_bit(0, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_set_rw_locked(iocb) \
- set_bit(0, (unsigned long *)&iocb->private)
+static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
+{
+ set_bit(0, (unsigned long *)&iocb->private);
+ if (level)
+ set_bit(1, (unsigned long *)&iocb->private);
+ else
+ clear_bit(1, (unsigned long *)&iocb->private);
+}
#define ocfs2_iocb_clear_rw_locked(iocb) \
clear_bit(0, (unsigned long *)&iocb->private)
-
+#define ocfs2_iocb_rw_locked_level(iocb) \
+ test_bit(1, (unsigned long *)&iocb->private)
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 4705d659fe57..bbacf7da48a4 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -46,6 +46,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
+#include <linux/reboot.h>
#include "heartbeat.h"
#include "nodemanager.h"
@@ -72,7 +73,9 @@ static void o2quo_fence_self(void)
/* panic spins with interrupts enabled. with preempt
* threads can still schedule, etc, etc */
o2hb_stop_all_regions();
- panic("ocfs2 is very sorry to be fencing this system by panicing\n");
+
+ printk("ocfs2 is very sorry to be fencing this system by restarting\n");
+ emergency_restart();
}
/* Indicate that a timeout occured on a hearbeat region write. The
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 4dae5df5e467..9606111fe89d 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,9 @@
* locking semantics of the file system using the protocol. It should
* be somewhere else, I'm sure, but right now it isn't.
*
+ * New in version 8:
+ * - Replace delete inode votes with a cluster lock
+ *
* New in version 7:
* - DLM join domain includes the live nodemap
*
@@ -57,7 +60,7 @@
* - full 64 bit i_size in the metadata lock lvbs
* - introduction of "rw" lock and pushing meta/data locking down
*/
-#define O2NET_PROTOCOL_VERSION 7ULL
+#define O2NET_PROTOCOL_VERSION 8ULL
struct o2net_handshake {
__be64 protocol_version;
__be64 connector_id;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 66821e178167..67e6866a2a4f 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -358,15 +358,17 @@ int ocfs2_do_extend_dir(struct super_block *sb,
{
int status;
int extend;
- u64 p_blkno;
+ u64 p_blkno, v_blkno;
spin_lock(&OCFS2_I(dir)->ip_lock);
extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
spin_unlock(&OCFS2_I(dir)->ip_lock);
if (extend) {
- status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
- parent_fe_bh, handle,
+ u32 offset = OCFS2_I(dir)->ip_clusters;
+
+ status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
+ 1, parent_fe_bh, handle,
data_ac, meta_ac, NULL);
BUG_ON(status == -EAGAIN);
if (status < 0) {
@@ -375,9 +377,8 @@ int ocfs2_do_extend_dir(struct super_block *sb,
}
}
- status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
- (sb->s_blocksize_bits - 9)),
- 1, &p_blkno, NULL);
+ v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
+ status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -486,7 +487,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
dir_i_size += dir->i_sb->s_blocksize;
i_size_write(dir, dir_i_size);
- dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
+ dir->i_blocks = ocfs2_inode_sector_count(dir);
status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index c558442a0b44..d836b98dd99a 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -430,11 +430,10 @@ redo_bucket:
dlm_lockres_put(res);
- cond_resched_lock(&dlm->spinlock);
-
if (dropped)
goto redo_bucket;
}
+ cond_resched_lock(&dlm->spinlock);
num += n;
mlog(0, "%s: touched %d lockreses in bucket %d "
"(tot=%d)\n", dlm->name, n, i, num);
@@ -1035,7 +1034,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
{
int status = 0, tmpstat, node;
struct domain_join_ctxt *ctxt;
- enum dlm_query_join_response response;
+ enum dlm_query_join_response response = JOIN_DISALLOW;
mlog_entry("%p", dlm);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 6d4a83d50152..c1807a42c49f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -611,6 +611,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
}
} while (status != 0);
+ spin_lock(&dlm_reco_state_lock);
switch (ndata->state) {
case DLM_RECO_NODE_DATA_INIT:
case DLM_RECO_NODE_DATA_FINALIZE_SENT:
@@ -641,6 +642,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
ndata->node_num, dead_node);
break;
}
+ spin_unlock(&dlm_reco_state_lock);
}
mlog(0, "done requesting all lock info\n");
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e335541727f9..27e43b0c0eae 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.flags = 0,
};
+static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
+ .get_osb = ocfs2_get_inode_osb,
+ .flags = 0,
+};
+
static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{
return lockres->l_type == OCFS2_LOCK_TYPE_META ||
lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
- lockres->l_type == OCFS2_LOCK_TYPE_RW;
+ lockres->l_type == OCFS2_LOCK_TYPE_RW ||
+ lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
}
static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
@@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
case OCFS2_LOCK_TYPE_DATA:
ops = &ocfs2_inode_data_lops;
break;
+ case OCFS2_LOCK_TYPE_OPEN:
+ ops = &ocfs2_inode_open_lops;
+ break;
default:
mlog_bug_on_msg(1, "type: %d\n", type);
ops = NULL; /* thanks, gcc */
@@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
goto bail;
}
+ ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
bail:
mlog_exit(ret);
return ret;
@@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
mlog_exit_void();
}
+/*
+ * ocfs2_open_lock always get PR mode lock.
+ */
+int ocfs2_open_lock(struct inode *inode)
+{
+ int status = 0;
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ BUG_ON(!inode);
+
+ mlog_entry_void();
+
+ mlog(0, "inode %llu take PRMODE open lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+ if (ocfs2_mount_local(osb))
+ goto out;
+
+ lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+ status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+ LKM_PRMODE, 0, 0);
+ if (status < 0)
+ mlog_errno(status);
+
+out:
+ mlog_exit(status);
+ return status;
+}
+
+int ocfs2_try_open_lock(struct inode *inode, int write)
+{
+ int status = 0, level;
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ BUG_ON(!inode);
+
+ mlog_entry_void();
+
+ mlog(0, "inode %llu try to take %s open lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ write ? "EXMODE" : "PRMODE");
+
+ if (ocfs2_mount_local(osb))
+ goto out;
+
+ lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+ level = write ? LKM_EXMODE : LKM_PRMODE;
+
+ /*
+ * The file system may already holding a PRMODE/EXMODE open lock.
+ * Since we pass LKM_NOQUEUE, the request won't block waiting on
+ * other nodes and the -EAGAIN will indicate to the caller that
+ * this inode is still in use.
+ */
+ status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+ level, LKM_NOQUEUE, 0);
+
+out:
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * ocfs2_open_unlock unlock PR and EX mode open locks.
+ */
+void ocfs2_open_unlock(struct inode *inode)
+{
+ struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ mlog_entry_void();
+
+ mlog(0, "inode %llu drop open lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+ if (ocfs2_mount_local(osb))
+ goto out;
+
+ if(lockres->l_ro_holders)
+ ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+ LKM_PRMODE);
+ if(lockres->l_ex_holders)
+ ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+ LKM_EXMODE);
+
+out:
+ mlog_exit_void();
+}
+
int ocfs2_data_lock_full(struct inode *inode,
int write,
int arg_flags)
@@ -1387,8 +1495,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
inode->i_blocks = 0;
else
- inode->i_blocks =
- ocfs2_align_bytes_to_sectors(i_size_read(inode));
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
inode->i_gid = be32_to_cpu(lvb->lvb_igid);
@@ -1479,12 +1586,15 @@ static int ocfs2_meta_lock_update(struct inode *inode,
{
int status = 0;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_lock_res *lockres = NULL;
+ struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
struct ocfs2_dinode *fe;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
mlog_entry_void();
+ if (ocfs2_mount_local(osb))
+ goto bail;
+
spin_lock(&oi->ip_lock);
if (oi->ip_flags & OCFS2_INODE_DELETED) {
mlog(0, "Orphaned inode %llu was deleted while we "
@@ -1496,22 +1606,16 @@ static int ocfs2_meta_lock_update(struct inode *inode,
}
spin_unlock(&oi->ip_lock);
- if (!ocfs2_mount_local(osb)) {
- lockres = &oi->ip_meta_lockres;
-
- if (!ocfs2_should_refresh_lock_res(lockres))
- goto bail;
- }
+ if (!ocfs2_should_refresh_lock_res(lockres))
+ goto bail;
/* This will discard any caching information we might have had
* for the inode metadata. */
ocfs2_metadata_cache_purge(inode);
- /* will do nothing for inode types that don't use the extent
- * map (directories, bitmap files, etc) */
ocfs2_extent_map_trunc(inode, 0);
- if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) {
+ if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
mlog(0, "Trusting LVB on inode %llu\n",
(unsigned long long)oi->ip_blkno);
ocfs2_refresh_inode_from_lvb(inode);
@@ -1558,8 +1662,7 @@ static int ocfs2_meta_lock_update(struct inode *inode,
status = 0;
bail_refresh:
- if (lockres)
- ocfs2_complete_lock_res_refresh(lockres, status);
+ ocfs2_complete_lock_res_refresh(lockres, status);
bail:
mlog_exit(status);
return status;
@@ -1630,7 +1733,6 @@ int ocfs2_meta_lock_full(struct inode *inode,
wait_event(osb->recovery_event,
ocfs2_node_map_is_empty(osb, &osb->recovery_map));
- acquired = 0;
lockres = &OCFS2_I(inode)->ip_meta_lockres;
level = ex ? LKM_EXMODE : LKM_PRMODE;
dlm_flags = 0;
@@ -2458,13 +2560,20 @@ int ocfs2_drop_inode_locks(struct inode *inode)
* ocfs2_clear_inode has done it for us. */
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
- &OCFS2_I(inode)->ip_data_lockres);
+ &OCFS2_I(inode)->ip_open_lockres);
if (err < 0)
mlog_errno(err);
status = err;
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+ &OCFS2_I(inode)->ip_data_lockres);
+ if (err < 0)
+ mlog_errno(err);
+ if (err < 0 && !status)
+ status = err;
+
+ err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
&OCFS2_I(inode)->ip_meta_lockres);
if (err < 0)
mlog_errno(err);
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index c343fca68cf1..59cb566e7983 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode,
int write);
int ocfs2_rw_lock(struct inode *inode, int write);
void ocfs2_rw_unlock(struct inode *inode, int write);
+int ocfs2_open_lock(struct inode *inode);
+int ocfs2_try_open_lock(struct inode *inode, int write);
+void ocfs2_open_unlock(struct inode *inode);
int ocfs2_meta_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
int *level);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 80ac69f11d9f..ba2b2ab1c6e4 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -3,8 +3,7 @@
*
* extent_map.c
*
- * In-memory extent map for OCFS2. Man, this code was prettier in
- * the library.
+ * Block/Cluster mapping functions
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
@@ -26,1016 +25,528 @@
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/rbtree.h>
#define MLOG_MASK_PREFIX ML_EXTENT_MAP
#include <cluster/masklog.h>
#include "ocfs2.h"
+#include "alloc.h"
#include "extent_map.h"
#include "inode.h"
#include "super.h"
#include "buffer_head_io.h"
-
/*
- * SUCK SUCK SUCK
- * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
- */
-
-struct ocfs2_extent_map_entry {
- struct rb_node e_node;
- int e_tree_depth;
- struct ocfs2_extent_rec e_rec;
-};
-
-struct ocfs2_em_insert_context {
- int need_left;
- int need_right;
- struct ocfs2_extent_map_entry *new_ent;
- struct ocfs2_extent_map_entry *old_ent;
- struct ocfs2_extent_map_entry *left_ent;
- struct ocfs2_extent_map_entry *right_ent;
-};
-
-static struct kmem_cache *ocfs2_em_ent_cachep = NULL;
-
-
-static struct ocfs2_extent_map_entry *
-ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
- u32 cpos, u32 clusters,
- struct rb_node ***ret_p,
- struct rb_node **ret_parent);
-static int ocfs2_extent_map_insert(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- int tree_depth);
-static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
- struct ocfs2_extent_map_entry *ent);
-static int ocfs2_extent_map_find_leaf(struct inode *inode,
- u32 cpos, u32 clusters,
- struct ocfs2_extent_list *el);
-static int ocfs2_extent_map_lookup_read(struct inode *inode,
- u32 cpos, u32 clusters,
- struct ocfs2_extent_map_entry **ret_ent);
-static int ocfs2_extent_map_try_insert(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- int tree_depth,
- struct ocfs2_em_insert_context *ctxt);
-
-/* returns 1 only if the rec contains all the given clusters -- that is that
- * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
- * clusters) is >= the argument's endpoint */
-static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
- u32 cpos, u32 clusters)
-{
- if (le32_to_cpu(rec->e_cpos) > cpos)
- return 0;
- if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
- le32_to_cpu(rec->e_clusters))
- return 0;
- return 1;
-}
-
-
-/*
- * Find an entry in the tree that intersects the region passed in.
- * Note that this will find straddled intervals, it is up to the
- * callers to enforce any boundary conditions.
- *
- * Callers must hold ip_lock. This lookup is not guaranteed to return
- * a tree_depth 0 match, and as such can race inserts if the lock
- * were not held.
+ * The extent caching implementation is intentionally trivial.
*
- * The rb_node garbage lets insertion share the search. Trivial
- * callers pass NULL.
+ * We only cache a small number of extents stored directly on the
+ * inode, so linear order operations are acceptable. If we ever want
+ * to increase the size of the extent map, then these algorithms must
+ * get smarter.
*/
-static struct ocfs2_extent_map_entry *
-ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
- u32 cpos, u32 clusters,
- struct rb_node ***ret_p,
- struct rb_node **ret_parent)
+
+void ocfs2_extent_map_init(struct inode *inode)
{
- struct rb_node **p = &em->em_extents.rb_node;
- struct rb_node *parent = NULL;
- struct ocfs2_extent_map_entry *ent = NULL;
-
- while (*p)
- {
- parent = *p;
- ent = rb_entry(parent, struct ocfs2_extent_map_entry,
- e_node);
- if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
- p = &(*p)->rb_left;
- ent = NULL;
- } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
- le32_to_cpu(ent->e_rec.e_clusters))) {
- p = &(*p)->rb_right;
- ent = NULL;
- } else
- break;
- }
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
- if (ret_p != NULL)
- *ret_p = p;
- if (ret_parent != NULL)
- *ret_parent = parent;
- return ent;
+ oi->ip_extent_map.em_num_items = 0;
+ INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
}
-/*
- * Find the leaf containing the interval we want. While we're on our
- * way down the tree, fill in every record we see at any depth, because
- * we might want it later.
- *
- * Note that this code is run without ip_lock. That's because it
- * sleeps while reading. If someone is also filling the extent list at
- * the same time we are, we might have to restart.
- */
-static int ocfs2_extent_map_find_leaf(struct inode *inode,
- u32 cpos, u32 clusters,
- struct ocfs2_extent_list *el)
+static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+ unsigned int cpos,
+ struct ocfs2_extent_map_item **ret_emi)
{
- int i, ret;
- struct buffer_head *eb_bh = NULL;
- u64 blkno;
- u32 rec_end;
- struct ocfs2_extent_block *eb;
- struct ocfs2_extent_rec *rec;
-
- /*
- * The bh data containing the el cannot change here, because
- * we hold alloc_sem. So we can do this without other
- * locks.
- */
- while (el->l_tree_depth)
- {
- blkno = 0;
- for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
- rec = &el->l_recs[i];
- rec_end = (le32_to_cpu(rec->e_cpos) +
- le32_to_cpu(rec->e_clusters));
-
- ret = -EBADR;
- if (rec_end > OCFS2_I(inode)->ip_clusters) {
- mlog_errno(ret);
- ocfs2_error(inode->i_sb,
- "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
- i,
- (unsigned long long)le64_to_cpu(rec->e_blkno),
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- OCFS2_I(inode)->ip_clusters);
- goto out_free;
- }
-
- if (rec_end <= cpos) {
- ret = ocfs2_extent_map_insert(inode, rec,
- le16_to_cpu(el->l_tree_depth));
- if (ret && (ret != -EEXIST)) {
- mlog_errno(ret);
- goto out_free;
- }
- continue;
- }
- if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
- ret = ocfs2_extent_map_insert(inode, rec,
- le16_to_cpu(el->l_tree_depth));
- if (ret && (ret != -EEXIST)) {
- mlog_errno(ret);
- goto out_free;
- }
- continue;
- }
+ unsigned int range;
+ struct ocfs2_extent_map_item *emi;
- /*
- * We've found a record that matches our
- * interval. We don't insert it because we're
- * about to traverse it.
- */
-
- /* Check to see if we're stradling */
- ret = -ESRCH;
- if (!ocfs2_extent_rec_contains_clusters(rec,
- cpos,
- clusters)) {
- mlog_errno(ret);
- goto out_free;
- }
+ *ret_emi = NULL;
- /*
- * If we've already found a record, the el has
- * two records covering the same interval.
- * EEEK!
- */
- ret = -EBADR;
- if (blkno) {
- mlog_errno(ret);
- ocfs2_error(inode->i_sb,
- "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
- cpos, clusters,
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)blkno, i,
- (unsigned long long)le64_to_cpu(rec->e_blkno));
- goto out_free;
- }
+ list_for_each_entry(emi, &em->em_list, ei_list) {
+ range = emi->ei_cpos + emi->ei_clusters;
- blkno = le64_to_cpu(rec->e_blkno);
- }
+ if (cpos >= emi->ei_cpos && cpos < range) {
+ list_move(&emi->ei_list, &em->em_list);
- /*
- * We don't support holes, and we're still up
- * in the branches, so we'd better have found someone
- */
- ret = -EBADR;
- if (!blkno) {
- ocfs2_error(inode->i_sb,
- "No record found for (cpos = %u, clusters = %u) on inode %llu\n",
- cpos, clusters,
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- mlog_errno(ret);
- goto out_free;
- }
-
- if (eb_bh) {
- brelse(eb_bh);
- eb_bh = NULL;
- }
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- blkno, &eb_bh, OCFS2_BH_CACHED,
- inode);
- if (ret) {
- mlog_errno(ret);
- goto out_free;
- }
- eb = (struct ocfs2_extent_block *)eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- ret = -EIO;
- goto out_free;
+ *ret_emi = emi;
+ break;
}
- el = &eb->h_list;
}
+}
- BUG_ON(el->l_tree_depth);
-
- for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
- rec = &el->l_recs[i];
-
- if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
- OCFS2_I(inode)->ip_clusters) {
- ret = -EBADR;
- mlog_errno(ret);
- ocfs2_error(inode->i_sb,
- "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
- i,
- (unsigned long long)le64_to_cpu(rec->e_blkno),
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- OCFS2_I(inode)->ip_clusters);
- return ret;
- }
-
- ret = ocfs2_extent_map_insert(inode, rec,
- le16_to_cpu(el->l_tree_depth));
- if (ret && (ret != -EEXIST)) {
- mlog_errno(ret);
- goto out_free;
- }
+static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
+ unsigned int *phys, unsigned int *len,
+ unsigned int *flags)
+{
+ unsigned int coff;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_extent_map_item *emi;
+
+ spin_lock(&oi->ip_lock);
+
+ __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
+ if (emi) {
+ coff = cpos - emi->ei_cpos;
+ *phys = emi->ei_phys + coff;
+ if (len)
+ *len = emi->ei_clusters - coff;
+ if (flags)
+ *flags = emi->ei_flags;
}
- ret = 0;
+ spin_unlock(&oi->ip_lock);
-out_free:
- if (eb_bh)
- brelse(eb_bh);
+ if (emi == NULL)
+ return -ENOENT;
- return ret;
+ return 0;
}
/*
- * This lookup actually will read from disk. It has one invariant:
- * It will never re-traverse blocks. This means that all inserts should
- * be new regions or more granular regions (both allowed by insert).
+ * Forget about all clusters equal to or greater than cpos.
*/
-static int ocfs2_extent_map_lookup_read(struct inode *inode,
- u32 cpos,
- u32 clusters,
- struct ocfs2_extent_map_entry **ret_ent)
+void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
{
- int ret;
- u64 blkno;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *ent;
- struct buffer_head *bh = NULL;
- struct ocfs2_extent_block *eb;
- struct ocfs2_dinode *di;
- struct ocfs2_extent_list *el;
-
- spin_lock(&OCFS2_I(inode)->ip_lock);
- ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
- if (ent) {
- if (!ent->e_tree_depth) {
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- *ret_ent = ent;
- return 0;
- }
- blkno = le64_to_cpu(ent->e_rec.e_blkno);
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
- OCFS2_BH_CACHED, inode);
- if (ret) {
- mlog_errno(ret);
- if (bh)
- brelse(bh);
- return ret;
+ struct list_head *p, *n;
+ struct ocfs2_extent_map_item *emi;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_extent_map *em = &oi->ip_extent_map;
+ LIST_HEAD(tmp_list);
+ unsigned int range;
+
+ spin_lock(&oi->ip_lock);
+ list_for_each_safe(p, n, &em->em_list) {
+ emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
+
+ if (emi->ei_cpos >= cpos) {
+ /* Full truncate of this record. */
+ list_move(&emi->ei_list, &tmp_list);
+ BUG_ON(em->em_num_items == 0);
+ em->em_num_items--;
+ continue;
}
- eb = (struct ocfs2_extent_block *)bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- brelse(bh);
- return -EIO;
- }
- el = &eb->h_list;
- } else {
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- OCFS2_I(inode)->ip_blkno, &bh,
- OCFS2_BH_CACHED, inode);
- if (ret) {
- mlog_errno(ret);
- if (bh)
- brelse(bh);
- return ret;
+ range = emi->ei_cpos + emi->ei_clusters;
+ if (range > cpos) {
+ /* Partial truncate */
+ emi->ei_clusters = cpos - emi->ei_cpos;
}
- di = (struct ocfs2_dinode *)bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(di)) {
- brelse(bh);
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
- return -EIO;
- }
- el = &di->id2.i_list;
- }
-
- ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
- brelse(bh);
- if (ret) {
- mlog_errno(ret);
- return ret;
}
+ spin_unlock(&oi->ip_lock);
- ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
- if (!ent) {
- ret = -ESRCH;
- mlog_errno(ret);
- return ret;
+ list_for_each_safe(p, n, &tmp_list) {
+ emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
+ list_del(&emi->ei_list);
+ kfree(emi);
}
-
- /* FIXME: Make sure this isn't a corruption */
- BUG_ON(ent->e_tree_depth);
-
- *ret_ent = ent;
-
- return 0;
}
/*
- * Callers must hold ip_lock. This can insert pieces of the tree,
- * thus racing lookup if the lock weren't held.
+ * Is any part of emi2 contained within emi1
*/
-static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
- struct ocfs2_extent_map_entry *ent)
+static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
+ struct ocfs2_extent_map_item *emi2)
{
- struct rb_node **p, *parent;
- struct ocfs2_extent_map_entry *old_ent;
+ unsigned int range1, range2;
- old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
- le32_to_cpu(ent->e_rec.e_clusters),
- &p, &parent);
- if (old_ent)
- return -EEXIST;
+ /*
+ * Check if logical start of emi2 is inside emi1
+ */
+ range1 = emi1->ei_cpos + emi1->ei_clusters;
+ if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
+ return 1;
- rb_link_node(&ent->e_node, parent, p);
- rb_insert_color(&ent->e_node, &em->em_extents);
+ /*
+ * Check if logical end of emi2 is inside emi1
+ */
+ range2 = emi2->ei_cpos + emi2->ei_clusters;
+ if (range2 > emi1->ei_cpos && range2 <= range1)
+ return 1;
return 0;
}
+static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
+ struct ocfs2_extent_map_item *src)
+{
+ dest->ei_cpos = src->ei_cpos;
+ dest->ei_phys = src->ei_phys;
+ dest->ei_clusters = src->ei_clusters;
+ dest->ei_flags = src->ei_flags;
+}
/*
- * Simple rule: on any return code other than -EAGAIN, anything left
- * in the insert_context will be freed.
- *
- * Simple rule #2: A return code of -EEXIST from this function or
- * its calls to ocfs2_extent_map_insert_entry() signifies that another
- * thread beat us to the insert. It is not an actual error, but it
- * tells the caller we have no more work to do.
+ * Try to merge emi with ins. Returns 1 if merge succeeds, zero
+ * otherwise.
*/
-static int ocfs2_extent_map_try_insert(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- int tree_depth,
- struct ocfs2_em_insert_context *ctxt)
+static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
+ struct ocfs2_extent_map_item *ins)
{
- int ret;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *old_ent;
-
- ctxt->need_left = 0;
- ctxt->need_right = 0;
- ctxt->old_ent = NULL;
-
- spin_lock(&OCFS2_I(inode)->ip_lock);
- ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
- if (!ret) {
- ctxt->new_ent = NULL;
- goto out_unlock;
- }
-
- /* Since insert_entry failed, the map MUST have old_ent */
- old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
- le32_to_cpu(rec->e_clusters),
- NULL, NULL);
-
- BUG_ON(!old_ent);
-
- if (old_ent->e_tree_depth < tree_depth) {
- /* Another thread beat us to the lower tree_depth */
- ret = -EEXIST;
- goto out_unlock;
- }
-
- if (old_ent->e_tree_depth == tree_depth) {
- /*
- * Another thread beat us to this tree_depth.
- * Let's make sure we agree with that thread (the
- * extent_rec should be identical).
- */
- if (!memcmp(rec, &old_ent->e_rec,
- sizeof(struct ocfs2_extent_rec)))
- ret = 0;
- else
- /* FIXME: Should this be ESRCH/EBADR??? */
- ret = -EEXIST;
-
- goto out_unlock;
- }
-
/*
- * We do it in this order specifically so that no actual tree
- * changes occur until we have all the pieces we need. We
- * don't want malloc failures to leave an inconsistent tree.
- * Whenever we drop the lock, another process could be
- * inserting. Also note that, if another process just beat us
- * to an insert, we might not need the same pieces we needed
- * the first go round. In the end, the pieces we need will
- * be used, and the pieces we don't will be freed.
+ * Handle contiguousness
*/
- ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
- le32_to_cpu(old_ent->e_rec.e_cpos));
- ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
- le32_to_cpu(old_ent->e_rec.e_clusters)) >
- (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
- ret = -EAGAIN;
- if (ctxt->need_left) {
- if (!ctxt->left_ent)
- goto out_unlock;
- *(ctxt->left_ent) = *old_ent;
- ctxt->left_ent->e_rec.e_clusters =
- cpu_to_le32(le32_to_cpu(rec->e_cpos) -
- le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
- }
- if (ctxt->need_right) {
- if (!ctxt->right_ent)
- goto out_unlock;
- *(ctxt->right_ent) = *old_ent;
- ctxt->right_ent->e_rec.e_cpos =
- cpu_to_le32(le32_to_cpu(rec->e_cpos) +
- le32_to_cpu(rec->e_clusters));
- ctxt->right_ent->e_rec.e_clusters =
- cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
- le32_to_cpu(old_ent->e_rec.e_clusters)) -
- le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
- }
-
- rb_erase(&old_ent->e_node, &em->em_extents);
- /* Now that he's erased, set him up for deletion */
- ctxt->old_ent = old_ent;
-
- if (ctxt->need_left) {
- ret = ocfs2_extent_map_insert_entry(em,
- ctxt->left_ent);
- if (ret)
- goto out_unlock;
- ctxt->left_ent = NULL;
+ if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
+ ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
+ ins->ei_flags == emi->ei_flags) {
+ emi->ei_clusters += ins->ei_clusters;
+ return 1;
+ } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
+ (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys &&
+ ins->ei_flags == emi->ei_flags) {
+ emi->ei_phys = ins->ei_phys;
+ emi->ei_cpos = ins->ei_cpos;
+ emi->ei_clusters += ins->ei_clusters;
+ return 1;
}
- if (ctxt->need_right) {
- ret = ocfs2_extent_map_insert_entry(em,
- ctxt->right_ent);
- if (ret)
- goto out_unlock;
- ctxt->right_ent = NULL;
+ /*
+ * Overlapping extents - this shouldn't happen unless we've
+ * split an extent to change it's flags. That is exceedingly
+ * rare, so there's no sense in trying to optimize it yet.
+ */
+ if (ocfs2_ei_is_contained(emi, ins) ||
+ ocfs2_ei_is_contained(ins, emi)) {
+ ocfs2_copy_emi_fields(emi, ins);
+ return 1;
}
- ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
-
- if (!ret)
- ctxt->new_ent = NULL;
-
-out_unlock:
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
- return ret;
+ /* No merge was possible. */
+ return 0;
}
-
-static int ocfs2_extent_map_insert(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- int tree_depth)
+/*
+ * In order to reduce complexity on the caller, this insert function
+ * is intentionally liberal in what it will accept.
+ *
+ * The only rule is that the truncate call *must* be used whenever
+ * records have been deleted. This avoids inserting overlapping
+ * records with different physical mappings.
+ */
+void ocfs2_extent_map_insert_rec(struct inode *inode,
+ struct ocfs2_extent_rec *rec)
{
- int ret;
- struct ocfs2_em_insert_context ctxt = {0, };
-
- if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
- OCFS2_I(inode)->ip_map.em_clusters) {
- ret = -EBADR;
- mlog_errno(ret);
- return ret;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_extent_map *em = &oi->ip_extent_map;
+ struct ocfs2_extent_map_item *emi, *new_emi = NULL;
+ struct ocfs2_extent_map_item ins;
+
+ ins.ei_cpos = le32_to_cpu(rec->e_cpos);
+ ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
+ le64_to_cpu(rec->e_blkno));
+ ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
+ ins.ei_flags = rec->e_flags;
+
+search:
+ spin_lock(&oi->ip_lock);
+
+ list_for_each_entry(emi, &em->em_list, ei_list) {
+ if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
+ list_move(&emi->ei_list, &em->em_list);
+ spin_unlock(&oi->ip_lock);
+ goto out;
+ }
}
- /* Zero e_clusters means a truncated tail record. It better be EOF */
- if (!rec->e_clusters) {
- if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
- OCFS2_I(inode)->ip_map.em_clusters) {
- ret = -EBADR;
- mlog_errno(ret);
- ocfs2_error(inode->i_sb,
- "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
- (unsigned long long)le64_to_cpu(rec->e_blkno),
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- return ret;
- }
+ /*
+ * No item could be merged.
+ *
+ * Either allocate and add a new item, or overwrite the last recently
+ * inserted.
+ */
- /* Ignore the truncated tail */
- return 0;
- }
+ if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
+ if (new_emi == NULL) {
+ spin_unlock(&oi->ip_lock);
- ret = -ENOMEM;
- ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
- GFP_NOFS);
- if (!ctxt.new_ent) {
- mlog_errno(ret);
- return ret;
- }
+ new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
+ if (new_emi == NULL)
+ goto out;
- ctxt.new_ent->e_rec = *rec;
- ctxt.new_ent->e_tree_depth = tree_depth;
-
- do {
- ret = -ENOMEM;
- if (ctxt.need_left && !ctxt.left_ent) {
- ctxt.left_ent =
- kmem_cache_alloc(ocfs2_em_ent_cachep,
- GFP_NOFS);
- if (!ctxt.left_ent)
- break;
- }
- if (ctxt.need_right && !ctxt.right_ent) {
- ctxt.right_ent =
- kmem_cache_alloc(ocfs2_em_ent_cachep,
- GFP_NOFS);
- if (!ctxt.right_ent)
- break;
+ goto search;
}
- ret = ocfs2_extent_map_try_insert(inode, rec,
- tree_depth, &ctxt);
- } while (ret == -EAGAIN);
-
- if ((ret < 0) && (ret != -EEXIST))
- mlog_errno(ret);
+ ocfs2_copy_emi_fields(new_emi, &ins);
+ list_add(&new_emi->ei_list, &em->em_list);
+ em->em_num_items++;
+ new_emi = NULL;
+ } else {
+ BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
+ emi = list_entry(em->em_list.prev,
+ struct ocfs2_extent_map_item, ei_list);
+ list_move(&emi->ei_list, &em->em_list);
+ ocfs2_copy_emi_fields(emi, &ins);
+ }
- if (ctxt.left_ent)
- kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
- if (ctxt.right_ent)
- kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
- if (ctxt.old_ent)
- kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
- if (ctxt.new_ent)
- kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
+ spin_unlock(&oi->ip_lock);
- return ret;
+out:
+ if (new_emi)
+ kfree(new_emi);
}
/*
- * Append this record to the tail of the extent map. It must be
- * tree_depth 0. The record might be an extension of an existing
- * record, and as such that needs to be handled. eg:
- *
- * Existing record in the extent map:
- *
- * cpos = 10, len = 10
- * |---------|
- *
- * New Record:
- *
- * cpos = 10, len = 20
- * |------------------|
- *
- * The passed record is the new on-disk record. The new_clusters value
- * is how many clusters were added to the file. If the append is a
- * contiguous append, the new_clusters has been added to
- * rec->e_clusters. If the append is an entirely new extent, then
- * rec->e_clusters is == new_clusters.
+ * Return the 1st index within el which contains an extent start
+ * larger than v_cluster.
*/
-int ocfs2_extent_map_append(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- u32 new_clusters)
+static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
+ u32 v_cluster)
{
- int ret;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *ent;
- struct ocfs2_extent_rec *old;
-
- BUG_ON(!new_clusters);
- BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
+ int i;
+ struct ocfs2_extent_rec *rec;
- if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
- /*
- * Size changed underneath us on disk. Drop any
- * straddling records and update our idea of
- * i_clusters
- */
- ocfs2_extent_map_drop(inode, em->em_clusters - 1);
- em->em_clusters = OCFS2_I(inode)->ip_clusters;
- }
+ for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ rec = &el->l_recs[i];
- mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
- le32_to_cpu(rec->e_clusters)) !=
- (em->em_clusters + new_clusters),
- "Inode %llu:\n"
- "rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
- "em->em_clusters = %u + new_clusters = %u = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
- le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
- em->em_clusters, new_clusters,
- em->em_clusters + new_clusters);
-
- em->em_clusters += new_clusters;
-
- ret = -ENOENT;
- if (le32_to_cpu(rec->e_clusters) > new_clusters) {
- /* This is a contiguous append */
- ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
- NULL, NULL);
- if (ent) {
- old = &ent->e_rec;
- BUG_ON((le32_to_cpu(rec->e_cpos) +
- le32_to_cpu(rec->e_clusters)) !=
- (le32_to_cpu(old->e_cpos) +
- le32_to_cpu(old->e_clusters) +
- new_clusters));
- if (ent->e_tree_depth == 0) {
- BUG_ON(le32_to_cpu(old->e_cpos) !=
- le32_to_cpu(rec->e_cpos));
- BUG_ON(le64_to_cpu(old->e_blkno) !=
- le64_to_cpu(rec->e_blkno));
- ret = 0;
- }
- /*
- * Let non-leafs fall through as -ENOENT to
- * force insertion of the new leaf.
- */
- le32_add_cpu(&old->e_clusters, new_clusters);
- }
+ if (v_cluster < le32_to_cpu(rec->e_cpos))
+ break;
}
- if (ret == -ENOENT)
- ret = ocfs2_extent_map_insert(inode, rec, 0);
- if (ret < 0)
- mlog_errno(ret);
- return ret;
+ return i;
}
-#if 0
-/* Code here is included but defined out as it completes the extent
- * map api and may be used in the future. */
-
/*
- * Look up the record containing this cluster offset. This record is
- * part of the extent map. Do not free it. Any changes you make to
- * it will reflect in the extent map. So, if your last extent
- * is (cpos = 10, clusters = 10) and you truncate the file by 5
- * clusters, you can do:
+ * Figure out the size of a hole which starts at v_cluster within the given
+ * extent list.
*
- * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
- * rec->e_clusters -= 5;
+ * If there is no more allocation past v_cluster, we return the maximum
+ * cluster size minus v_cluster.
*
- * The lookup does not read from disk. If the map isn't filled in for
- * an entry, you won't find it.
- *
- * Also note that the returned record is valid until alloc_sem is
- * dropped. After that, truncate and extend can happen. Caveat Emptor.
+ * If we have in-inode extents, then el points to the dinode list and
+ * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
+ * containing el.
*/
-int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
- struct ocfs2_extent_rec **rec,
- int *tree_depth)
+static int ocfs2_figure_hole_clusters(struct inode *inode,
+ struct ocfs2_extent_list *el,
+ struct buffer_head *eb_bh,
+ u32 v_cluster,
+ u32 *num_clusters)
{
- int ret = -ENOENT;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *ent;
+ int ret, i;
+ struct buffer_head *next_eb_bh = NULL;
+ struct ocfs2_extent_block *eb, *next_eb;
- *rec = NULL;
+ i = ocfs2_search_for_hole_index(el, v_cluster);
- if (cpos >= OCFS2_I(inode)->ip_clusters)
- return -EINVAL;
+ if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
+ eb = (struct ocfs2_extent_block *)eb_bh->b_data;
- if (cpos >= em->em_clusters) {
/*
- * Size changed underneath us on disk. Drop any
- * straddling records and update our idea of
- * i_clusters
+ * Check the next leaf for any extents.
*/
- ocfs2_extent_map_drop(inode, em->em_clusters - 1);
- em->em_clusters = OCFS2_I(inode)->ip_clusters ;
- }
-
- ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
- NULL, NULL);
- if (ent) {
- *rec = &ent->e_rec;
- if (tree_depth)
- *tree_depth = ent->e_tree_depth;
- ret = 0;
- }
+ if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
+ goto no_more_extents;
- return ret;
-}
+ ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+ le64_to_cpu(eb->h_next_leaf_blk),
+ &next_eb_bh, OCFS2_BH_CACHED, inode);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
-int ocfs2_extent_map_get_clusters(struct inode *inode,
- u32 v_cpos, int count,
- u32 *p_cpos, int *ret_count)
-{
- int ret;
- u32 coff, ccount;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *ent = NULL;
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
+ ret = -EROFS;
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
+ goto out;
+ }
- *p_cpos = ccount = 0;
+ el = &next_eb->h_list;
- if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
- return -EINVAL;
+ i = ocfs2_search_for_hole_index(el, v_cluster);
+ }
- if ((v_cpos + count) > em->em_clusters) {
+no_more_extents:
+ if (i == le16_to_cpu(el->l_next_free_rec)) {
/*
- * Size changed underneath us on disk. Drop any
- * straddling records and update our idea of
- * i_clusters
+ * We're at the end of our existing allocation. Just
+ * return the maximum number of clusters we could
+ * possibly allocate.
*/
- ocfs2_extent_map_drop(inode, em->em_clusters - 1);
- em->em_clusters = OCFS2_I(inode)->ip_clusters;
+ *num_clusters = UINT_MAX - v_cluster;
+ } else {
+ *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
}
+ ret = 0;
+out:
+ brelse(next_eb_bh);
+ return ret;
+}
- ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
- if (ret)
- return ret;
+/*
+ * Return the index of the extent record which contains cluster #v_cluster.
+ * -1 is returned if it was not found.
+ *
+ * Should work fine on interior and exterior nodes.
+ */
+static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
+ u32 v_cluster)
+{
+ int ret = -1;
+ int i;
+ struct ocfs2_extent_rec *rec;
+ u32 rec_end, rec_start, clusters;
- if (ent) {
- /* We should never find ourselves straddling an interval */
- if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
- v_cpos,
- count))
- return -ESRCH;
+ for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ rec = &el->l_recs[i];
- coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
- *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
- le64_to_cpu(ent->e_rec.e_blkno)) +
- coff;
+ rec_start = le32_to_cpu(rec->e_cpos);
+ clusters = ocfs2_rec_clusters(el, rec);
- if (ret_count)
- *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
+ rec_end = rec_start + clusters;
- return 0;
+ if (v_cluster >= rec_start && v_cluster < rec_end) {
+ ret = i;
+ break;
+ }
}
-
- return -ENOENT;
+ return ret;
}
-#endif /* 0 */
-
-int ocfs2_extent_map_get_blocks(struct inode *inode,
- u64 v_blkno, int count,
- u64 *p_blkno, int *ret_count)
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
+ u32 *p_cluster, u32 *num_clusters,
+ unsigned int *extent_flags)
{
- int ret;
- u64 boff;
- u32 cpos, clusters;
- int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
- struct ocfs2_extent_map_entry *ent = NULL;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+ int ret, i;
+ unsigned int flags = 0;
+ struct buffer_head *di_bh = NULL;
+ struct buffer_head *eb_bh = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
+ u32 coff;
- *p_blkno = 0;
-
- cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
- clusters = ocfs2_blocks_to_clusters(inode->i_sb,
- (u64)count + bpc - 1);
- if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
- ret = -EINVAL;
- mlog_errno(ret);
- return ret;
- }
-
- if ((cpos + clusters) > em->em_clusters) {
- /*
- * Size changed underneath us on disk. Drop any
- * straddling records and update our idea of
- * i_clusters
- */
- ocfs2_extent_map_drop(inode, em->em_clusters - 1);
- em->em_clusters = OCFS2_I(inode)->ip_clusters;
- }
+ ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
+ num_clusters, extent_flags);
+ if (ret == 0)
+ goto out;
- ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
+ ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
+ &di_bh, OCFS2_BH_CACHED, inode);
if (ret) {
mlog_errno(ret);
- return ret;
+ goto out;
}
- if (ent)
- {
- rec = &ent->e_rec;
+ di = (struct ocfs2_dinode *) di_bh->b_data;
+ el = &di->id2.i_list;
- /* We should never find ourselves straddling an interval */
- if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
- ret = -ESRCH;
+ if (el->l_tree_depth) {
+ ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
+ if (ret) {
mlog_errno(ret);
- return ret;
+ goto out;
}
- boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
- le32_to_cpu(rec->e_cpos));
- boff += (v_blkno & (u64)(bpc - 1));
- *p_blkno = le64_to_cpu(rec->e_blkno) + boff;
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
- if (ret_count) {
- *ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
- le32_to_cpu(rec->e_clusters)) - boff;
+ if (el->l_tree_depth) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in "
+ "leaf block %llu\n", inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
}
-
- return 0;
}
- return -ENOENT;
-}
-
-int ocfs2_extent_map_init(struct inode *inode)
-{
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-
- em->em_extents = RB_ROOT;
- em->em_clusters = 0;
-
- return 0;
-}
-
-/* Needs the lock */
-static void __ocfs2_extent_map_drop(struct inode *inode,
- u32 new_clusters,
- struct rb_node **free_head,
- struct ocfs2_extent_map_entry **tail_ent)
-{
- struct rb_node *node, *next;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *ent;
+ i = ocfs2_search_extent_list(el, v_cluster);
+ if (i == -1) {
+ /*
+ * A hole was found. Return some canned values that
+ * callers can key on. If asked for, num_clusters will
+ * be populated with the size of the hole.
+ */
+ *p_cluster = 0;
+ if (num_clusters) {
+ ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
+ v_cluster,
+ num_clusters);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+ } else {
+ rec = &el->l_recs[i];
- *free_head = NULL;
+ BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
- ent = NULL;
- node = rb_last(&em->em_extents);
- while (node)
- {
- next = rb_prev(node);
+ if (!rec->e_blkno) {
+ ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+ "record (%u, %u, 0)", inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
+ ret = -EROFS;
+ goto out;
+ }
- ent = rb_entry(node, struct ocfs2_extent_map_entry,
- e_node);
- if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
- break;
+ coff = v_cluster - le32_to_cpu(rec->e_cpos);
- rb_erase(&ent->e_node, &em->em_extents);
+ *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
+ le64_to_cpu(rec->e_blkno));
+ *p_cluster = *p_cluster + coff;
- node->rb_right = *free_head;
- *free_head = node;
+ if (num_clusters)
+ *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
- ent = NULL;
- node = next;
- }
+ flags = rec->e_flags;
- /* Do we have an entry straddling new_clusters? */
- if (tail_ent) {
- if (ent &&
- ((le32_to_cpu(ent->e_rec.e_cpos) +
- le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
- *tail_ent = ent;
- else
- *tail_ent = NULL;
+ ocfs2_extent_map_insert_rec(inode, rec);
}
-}
-
-static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
-{
- struct rb_node *node;
- struct ocfs2_extent_map_entry *ent;
- while (free_head) {
- node = free_head;
- free_head = node->rb_right;
+ if (extent_flags)
+ *extent_flags = flags;
- ent = rb_entry(node, struct ocfs2_extent_map_entry,
- e_node);
- kmem_cache_free(ocfs2_em_ent_cachep, ent);
- }
+out:
+ brelse(di_bh);
+ brelse(eb_bh);
+ return ret;
}
/*
- * Remove all entries past new_clusters, inclusive of an entry that
- * contains new_clusters. This is effectively a cache forget.
- *
- * If you want to also clip the last extent by some number of clusters,
- * you need to call ocfs2_extent_map_trunc().
- * This code does not check or modify ip_clusters.
+ * This expects alloc_sem to be held. The allocation cannot change at
+ * all while the map is in the process of being updated.
*/
-int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
+int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
+ u64 *ret_count, unsigned int *extent_flags)
{
- struct rb_node *free_head = NULL;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *ent;
-
- spin_lock(&OCFS2_I(inode)->ip_lock);
+ int ret;
+ int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ u32 cpos, num_clusters, p_cluster;
+ u64 boff = 0;
- __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
+ cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
- if (ent) {
- rb_erase(&ent->e_node, &em->em_extents);
- ent->e_node.rb_right = free_head;
- free_head = &ent->e_node;
+ ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
+ extent_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
}
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
- if (free_head)
- __ocfs2_extent_map_drop_cleanup(free_head);
-
- return 0;
-}
-
-/*
- * Remove all entries past new_clusters and also clip any extent
- * straddling new_clusters, if there is one. This does not check
- * or modify ip_clusters
- */
-int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
-{
- struct rb_node *free_head = NULL;
- struct ocfs2_extent_map_entry *ent = NULL;
-
- spin_lock(&OCFS2_I(inode)->ip_lock);
-
- __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
-
- if (ent)
- ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
- le32_to_cpu(ent->e_rec.e_cpos));
-
- OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
-
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
- if (free_head)
- __ocfs2_extent_map_drop_cleanup(free_head);
-
- return 0;
-}
+ /*
+ * p_cluster == 0 indicates a hole.
+ */
+ if (p_cluster) {
+ boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+ boff += (v_blkno & (u64)(bpc - 1));
+ }
-int __init init_ocfs2_extent_maps(void)
-{
- ocfs2_em_ent_cachep =
- kmem_cache_create("ocfs2_em_ent",
- sizeof(struct ocfs2_extent_map_entry),
- 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
- if (!ocfs2_em_ent_cachep)
- return -ENOMEM;
+ *p_blkno = boff;
- return 0;
-}
+ if (ret_count) {
+ *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+ *ret_count -= v_blkno & (u64)(bpc - 1);
+ }
-void exit_ocfs2_extent_maps(void)
-{
- kmem_cache_destroy(ocfs2_em_ent_cachep);
+out:
+ return ret;
}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index fa3745efa886..de91e3e41a22 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -25,22 +25,29 @@
#ifndef _EXTENT_MAP_H
#define _EXTENT_MAP_H
-int init_ocfs2_extent_maps(void);
-void exit_ocfs2_extent_maps(void);
+struct ocfs2_extent_map_item {
+ unsigned int ei_cpos;
+ unsigned int ei_phys;
+ unsigned int ei_clusters;
+ unsigned int ei_flags;
-/*
- * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
- * to be held. The allocation cannot change at all while the map is
- * in the process of being updated.
- */
-int ocfs2_extent_map_init(struct inode *inode);
-int ocfs2_extent_map_append(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- u32 new_clusters);
-int ocfs2_extent_map_get_blocks(struct inode *inode,
- u64 v_blkno, int count,
- u64 *p_blkno, int *ret_count);
-int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
-int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
+ struct list_head ei_list;
+};
+
+#define OCFS2_MAX_EXTENT_MAP_ITEMS 3
+struct ocfs2_extent_map {
+ unsigned int em_num_items;
+ struct list_head em_list;
+};
+
+void ocfs2_extent_map_init(struct inode *inode);
+void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster);
+void ocfs2_extent_map_insert_rec(struct inode *inode,
+ struct ocfs2_extent_rec *rec);
+
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
+ u32 *num_clusters, unsigned int *extent_flags);
+int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
+ u64 *ret_count, unsigned int *extent_flags);
#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f2cd3bf9efb2..520a2a6d7670 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -33,6 +33,7 @@
#include <linux/sched.h>
#include <linux/pipe_fs_i.h>
#include <linux/mount.h>
+#include <linux/writeback.h>
#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
@@ -215,7 +216,7 @@ int ocfs2_set_inode_size(handle_t *handle,
mlog_entry_void();
i_size_write(inode, new_i_size);
- inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
@@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
{
int status;
handle_t *handle;
+ struct ocfs2_dinode *di;
mlog_entry_void();
@@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
goto out;
}
- status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
+ status = ocfs2_journal_access(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ /*
+ * Do this before setting i_size.
+ */
+ status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
+ if (status) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ i_size_write(inode, new_i_size);
+ inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+ di = (struct ocfs2_dinode *) fe_bh->b_data;
+ di->i_size = cpu_to_le64(new_i_size);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+ status = ocfs2_journal_dirty(handle, fe_bh);
if (status < 0)
mlog_errno(status);
+out_commit:
ocfs2_commit_trans(osb, handle);
out:
+
mlog_exit(status);
return status;
}
@@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
mlog_errno(status);
goto bail;
}
- ocfs2_data_unlock(inode, 1);
-
- if (le32_to_cpu(fe->i_clusters) ==
- ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
- mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
- fe->i_clusters);
- /* No allocation change is required, so lets fast path
- * this truncate. */
- status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
- if (status < 0)
- mlog_errno(status);
- goto bail;
- }
/* alright, we're going to need to do a full blown alloc size
* change. Orphan the inode so that recovery can complete the
@@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto bail_unlock_data;
}
status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto bail_unlock_data;
}
status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto bail_unlock_data;
}
/* TODO: orphan dir cleanup here. */
+bail_unlock_data:
+ ocfs2_data_unlock(inode, 1);
+
bail:
mlog_exit(status);
@@ -397,6 +416,7 @@ bail:
*/
int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
struct inode *inode,
+ u32 *logical_offset,
u32 clusters_to_add,
struct buffer_head *fe_bh,
handle_t *handle,
@@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
- num_bits, meta_ac);
+ status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
+ *logical_offset, block, num_bits,
+ meta_ac);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- le32_add_cpu(&fe->i_clusters, num_bits);
- spin_lock(&OCFS2_I(inode)->ip_lock);
- OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
status = ocfs2_journal_dirty(handle, fe_bh);
if (status < 0) {
mlog_errno(status);
@@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
}
clusters_to_add -= num_bits;
+ *logical_offset += num_bits;
if (clusters_to_add) {
mlog(0, "need to alloc once more, clusters = %u, wanted = "
@@ -494,14 +511,87 @@ leave:
return status;
}
+/*
+ * For a given allocation, determine which allocators will need to be
+ * accessed, and lock them, reserving the appropriate number of bits.
+ *
+ * Called from ocfs2_extend_allocation() for file systems which don't
+ * support holes, and from ocfs2_write() for file systems which
+ * understand sparse inodes.
+ */
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
+ u32 clusters_to_add,
+ struct ocfs2_alloc_context **data_ac,
+ struct ocfs2_alloc_context **meta_ac)
+{
+ int ret, num_free_extents;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ *meta_ac = NULL;
+ *data_ac = NULL;
+
+ mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
+ "clusters_to_add = %u\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
+ le32_to_cpu(di->i_clusters), clusters_to_add);
+
+ num_free_extents = ocfs2_num_free_extents(osb, inode, di);
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Sparse allocation file systems need to be more conservative
+ * with reserving room for expansion - the actual allocation
+ * happens while we've got a journal handle open so re-taking
+ * a cluster lock (because we ran out of room for another
+ * extent) will violate ordering rules.
+ *
+ * Most of the time we'll only be seeing this 1 cluster at a time
+ * anyway.
+ */
+ if (!num_free_extents ||
+ (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
+ ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (ret) {
+ if (*meta_ac) {
+ ocfs2_free_alloc_context(*meta_ac);
+ *meta_ac = NULL;
+ }
+
+ /*
+ * We cannot have an error and a non null *data_ac.
+ */
+ }
+
+ return ret;
+}
+
static int ocfs2_extend_allocation(struct inode *inode,
u32 clusters_to_add)
{
int status = 0;
int restart_func = 0;
int drop_alloc_sem = 0;
- int credits, num_free_extents;
- u32 prev_clusters;
+ int credits;
+ u32 prev_clusters, logical_start;
struct buffer_head *bh = NULL;
struct ocfs2_dinode *fe = NULL;
handle_t *handle = NULL;
@@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode,
mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
+ /*
+ * This function only exists for file systems which don't
+ * support holes.
+ */
+ BUG_ON(ocfs2_sparse_alloc(osb));
+
status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
OCFS2_BH_CACHED, inode);
if (status < 0) {
@@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode,
goto leave;
}
+ logical_start = OCFS2_I(inode)->ip_clusters;
+
restart_all:
BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
- mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
- "clusters_to_add = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
- fe->i_clusters, clusters_to_add);
-
- num_free_extents = ocfs2_num_free_extents(osb,
- inode,
- fe);
- if (num_free_extents < 0) {
- status = num_free_extents;
- mlog_errno(status);
- goto leave;
- }
-
- if (!num_free_extents) {
- status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
- if (status < 0) {
- if (status != -ENOSPC)
- mlog_errno(status);
- goto leave;
- }
- }
-
- status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac);
- if (status < 0) {
- if (status != -ENOSPC)
- mlog_errno(status);
- goto leave;
- }
-
/* blocks peope in read/write from reading our allocation
* until we're done changing it. We depend on i_mutex to block
* other extend/truncate calls while we're here. Ordering wrt
@@ -566,6 +634,13 @@ restart_all:
down_write(&OCFS2_I(inode)->ip_alloc_sem);
drop_alloc_sem = 1;
+ status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
+ &meta_ac);
+ if (status) {
+ mlog_errno(status);
+ goto leave;
+ }
+
credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
@@ -590,6 +665,7 @@ restarted_transaction:
status = ocfs2_do_extend_allocation(osb,
inode,
+ &logical_start,
clusters_to_add,
bh,
handle,
@@ -778,7 +854,7 @@ static int ocfs2_extend_file(struct inode *inode,
size_t tail_to_skip)
{
int ret = 0;
- u32 clusters_to_add;
+ u32 clusters_to_add = 0;
BUG_ON(!tail_to_skip && !di_bh);
@@ -790,6 +866,11 @@ static int ocfs2_extend_file(struct inode *inode,
goto out;
BUG_ON(new_i_size < i_size_read(inode));
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+ BUG_ON(tail_to_skip != 0);
+ goto out_update_size;
+ }
+
clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
OCFS2_I(inode)->ip_clusters;
@@ -825,6 +906,7 @@ static int ocfs2_extend_file(struct inode *inode,
goto out_unlock;
}
+out_update_size:
if (!tail_to_skip) {
/* We're being called from ocfs2_setattr() which wants
* us to update i_size */
@@ -834,7 +916,8 @@ static int ocfs2_extend_file(struct inode *inode,
}
out_unlock:
- ocfs2_data_unlock(inode, 1);
+ if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ocfs2_data_unlock(inode, 1);
out:
return ret;
@@ -972,7 +1055,8 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
ret = ocfs2_meta_lock(inode, NULL, 0);
if (ret) {
- mlog_errno(ret);
+ if (ret != -ENOENT)
+ mlog_errno(ret);
goto out;
}
@@ -1035,10 +1119,49 @@ out:
return ret;
}
+/*
+ * Will look for holes and unwritten extents in the range starting at
+ * pos for count bytes (inclusive).
+ */
+static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
+ size_t count)
+{
+ int ret = 0;
+ unsigned int extent_flags;
+ u32 cpos, clusters, extent_len, phys_cpos;
+ struct super_block *sb = inode->i_sb;
+
+ cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+ clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
+
+ while (clusters) {
+ ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+ &extent_flags);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
+ ret = 1;
+ break;
+ }
+
+ if (extent_len > clusters)
+ extent_len = clusters;
+
+ clusters -= extent_len;
+ cpos += extent_len;
+ }
+out:
+ return ret;
+}
+
static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
loff_t *ppos,
size_t count,
- int appending)
+ int appending,
+ int *direct_io)
{
int ret = 0, meta_level = appending;
struct inode *inode = dentry->d_inode;
@@ -1089,6 +1212,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
} else {
saved_pos = *ppos;
}
+
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+ loff_t end = saved_pos + count;
+
+ /*
+ * Skip the O_DIRECT checks if we don't need
+ * them.
+ */
+ if (!direct_io || !(*direct_io))
+ break;
+
+ /*
+ * Allowing concurrent direct writes means
+ * i_size changes wouldn't be synchronized, so
+ * one node could wind up truncating another
+ * nodes writes.
+ */
+ if (end > i_size_read(inode)) {
+ *direct_io = 0;
+ break;
+ }
+
+ /*
+ * We don't fill holes during direct io, so
+ * check for them here. If any are found, the
+ * caller will have to retake some cluster
+ * locks and initiate the io as buffered.
+ */
+ ret = ocfs2_check_range_for_holes(inode, saved_pos,
+ count);
+ if (ret == 1) {
+ *direct_io = 0;
+ ret = 0;
+ } else if (ret < 0)
+ mlog_errno(ret);
+ break;
+ }
+
+ /*
+ * The rest of this loop is concerned with legacy file
+ * systems which don't support sparse files.
+ */
+
newsize = count + saved_pos;
mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
@@ -1141,55 +1307,264 @@ out:
return ret;
}
+static inline void
+ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
+{
+ const struct iovec *iov = *iovp;
+ size_t base = *basep;
+
+ do {
+ int copy = min(bytes, iov->iov_len - base);
+
+ bytes -= copy;
+ base += copy;
+ if (iov->iov_len == base) {
+ iov++;
+ base = 0;
+ }
+ } while (bytes);
+ *iovp = iov;
+ *basep = base;
+}
+
+static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
+ const struct iovec *cur_iov,
+ size_t iov_offset)
+{
+ int ret;
+ char *buf;
+ struct page *src_page = NULL;
+
+ buf = cur_iov->iov_base + iov_offset;
+
+ if (!segment_eq(get_fs(), KERNEL_DS)) {
+ /*
+ * Pull in the user page. We want to do this outside
+ * of the meta data locks in order to preserve locking
+ * order in case of page fault.
+ */
+ ret = get_user_pages(current, current->mm,
+ (unsigned long)buf & PAGE_CACHE_MASK, 1,
+ 0, 0, &src_page, NULL);
+ if (ret == 1)
+ bp->b_src_buf = kmap(src_page);
+ else
+ src_page = ERR_PTR(-EFAULT);
+ } else {
+ bp->b_src_buf = buf;
+ }
+
+ return src_page;
+}
+
+static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
+ struct page *page)
+{
+ if (page) {
+ kunmap(page);
+ page_cache_release(page);
+ }
+}
+
+static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
+ const struct iovec *iov,
+ unsigned long nr_segs,
+ size_t count,
+ ssize_t o_direct_written)
+{
+ int ret = 0;
+ ssize_t copied, total = 0;
+ size_t iov_offset = 0;
+ const struct iovec *cur_iov = iov;
+ struct ocfs2_buffered_write_priv bp;
+ struct page *page;
+
+ /*
+ * handle partial DIO write. Adjust cur_iov if needed.
+ */
+ ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
+
+ do {
+ bp.b_cur_off = iov_offset;
+ bp.b_cur_iov = cur_iov;
+
+ page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
+ if (IS_ERR(page)) {
+ ret = PTR_ERR(page);
+ goto out;
+ }
+
+ copied = ocfs2_buffered_write_cluster(file, *ppos, count,
+ ocfs2_map_and_write_user_data,
+ &bp);
+
+ ocfs2_put_write_source(&bp, page);
+
+ if (copied < 0) {
+ mlog_errno(copied);
+ ret = copied;
+ goto out;
+ }
+
+ total += copied;
+ *ppos = *ppos + copied;
+ count -= copied;
+
+ ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
+ } while(count);
+
+out:
+ return total ? total : ret;
+}
+
+static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted,
+ unsigned long *nr_segs)
+{
+ size_t ocount; /* original count */
+ unsigned long seg;
+
+ ocount = 0;
+ for (seg = 0; seg < *nr_segs; seg++) {
+ const struct iovec *iv = &iov[seg];
+
+ /*
+ * If any segment has a negative length, or the cumulative
+ * length ever wraps negative then return -EINVAL.
+ */
+ ocount += iv->iov_len;
+ if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
+ return -EINVAL;
+ if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
+ continue;
+ if (seg == 0)
+ return -EFAULT;
+ *nr_segs = seg;
+ ocount -= iv->iov_len; /* This segment is no good */
+ break;
+ }
+
+ *counted = ocount;
+ return 0;
+}
+
static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
const struct iovec *iov,
unsigned long nr_segs,
loff_t pos)
{
- int ret, rw_level, have_alloc_sem = 0;
- struct file *filp = iocb->ki_filp;
- struct inode *inode = filp->f_path.dentry->d_inode;
- int appending = filp->f_flags & O_APPEND ? 1 : 0;
-
- mlog_entry("(0x%p, %u, '%.*s')\n", filp,
+ int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
+ int can_do_direct, sync = 0;
+ ssize_t written = 0;
+ size_t ocount; /* original count */
+ size_t count; /* after file limit checks */
+ loff_t *ppos = &iocb->ki_pos;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_path.dentry->d_inode;
+
+ mlog_entry("(0x%p, %u, '%.*s')\n", file,
(unsigned int)nr_segs,
- filp->f_path.dentry->d_name.len,
- filp->f_path.dentry->d_name.name);
+ file->f_path.dentry->d_name.len,
+ file->f_path.dentry->d_name.name);
- /* happy write of zero bytes */
if (iocb->ki_left == 0)
return 0;
+ ret = ocfs2_check_iovec(iov, &ocount, &nr_segs);
+ if (ret)
+ return ret;
+
+ count = ocount;
+
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+ appending = file->f_flags & O_APPEND ? 1 : 0;
+ direct_io = file->f_flags & O_DIRECT ? 1 : 0;
+
mutex_lock(&inode->i_mutex);
+
+relock:
/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
- if (filp->f_flags & O_DIRECT) {
- have_alloc_sem = 1;
+ if (direct_io) {
down_read(&inode->i_alloc_sem);
+ have_alloc_sem = 1;
}
/* concurrent O_DIRECT writes are allowed */
- rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
+ rw_level = !direct_io;
ret = ocfs2_rw_lock(inode, rw_level);
if (ret < 0) {
- rw_level = -1;
mlog_errno(ret);
- goto out;
+ goto out_sems;
}
- ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos,
- iocb->ki_left, appending);
+ can_do_direct = direct_io;
+ ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
+ iocb->ki_left, appending,
+ &can_do_direct);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- /* communicate with ocfs2_dio_end_io */
- ocfs2_iocb_set_rw_locked(iocb);
+ /*
+ * We can't complete the direct I/O as requested, fall back to
+ * buffered I/O.
+ */
+ if (direct_io && !can_do_direct) {
+ ocfs2_rw_unlock(inode, rw_level);
+ up_read(&inode->i_alloc_sem);
+
+ have_alloc_sem = 0;
+ rw_level = -1;
- ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos);
+ direct_io = 0;
+ sync = 1;
+ goto relock;
+ }
+
+ if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
+ sync = 1;
+
+ /*
+ * XXX: Is it ok to execute these checks a second time?
+ */
+ ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
+ if (ret)
+ goto out;
+
+ /*
+ * Set pos so that sync_page_range_nolock() below understands
+ * where to start from. We might've moved it around via the
+ * calls above. The range we want to actually sync starts from
+ * *ppos here.
+ *
+ */
+ pos = *ppos;
+
+ /* communicate with ocfs2_dio_end_io */
+ ocfs2_iocb_set_rw_locked(iocb, rw_level);
+
+ if (direct_io) {
+ written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
+ ppos, count, ocount);
+ if (written < 0) {
+ ret = written;
+ goto out_dio;
+ }
+ } else {
+ written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
+ count, written);
+ if (written < 0) {
+ ret = written;
+ if (ret != -EFAULT || ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+out_dio:
/* buffered aio wouldn't have proper lock coverage today */
- BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+ BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
/*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -1207,13 +1582,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
}
out:
+ if (rw_level != -1)
+ ocfs2_rw_unlock(inode, rw_level);
+
+out_sems:
if (have_alloc_sem)
up_read(&inode->i_alloc_sem);
- if (rw_level != -1)
- ocfs2_rw_unlock(inode, rw_level);
+
+ if (written > 0 && sync) {
+ ssize_t err;
+
+ err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
+ if (err < 0)
+ written = err;
+ }
+
mutex_unlock(&inode->i_mutex);
mlog_exit(ret);
+ return written ? written : ret;
+}
+
+static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf,
+ struct splice_desc *sd)
+{
+ int ret, count, total = 0;
+ ssize_t copied = 0;
+ struct ocfs2_splice_write_priv sp;
+
+ ret = buf->ops->pin(pipe, buf);
+ if (ret)
+ goto out;
+
+ sp.s_sd = sd;
+ sp.s_buf = buf;
+ sp.s_pipe = pipe;
+ sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
+ sp.s_buf_offset = buf->offset;
+
+ count = sd->len;
+ if (count + sp.s_offset > PAGE_CACHE_SIZE)
+ count = PAGE_CACHE_SIZE - sp.s_offset;
+
+ do {
+ /*
+ * splice wants us to copy up to one page at a
+ * time. For pagesize > cluster size, this means we
+ * might enter ocfs2_buffered_write_cluster() more
+ * than once, so keep track of our progress here.
+ */
+ copied = ocfs2_buffered_write_cluster(sd->file,
+ (loff_t)sd->pos + total,
+ count,
+ ocfs2_map_and_write_splice_data,
+ &sp);
+ if (copied < 0) {
+ mlog_errno(copied);
+ ret = copied;
+ goto out;
+ }
+
+ count -= copied;
+ sp.s_offset += copied;
+ sp.s_buf_offset += copied;
+ total += copied;
+ } while (count);
+
+ ret = 0;
+out:
+
+ return total ? total : ret;
+}
+
+static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
+ struct file *out,
+ loff_t *ppos,
+ size_t len,
+ unsigned int flags)
+{
+ int ret, err;
+ struct address_space *mapping = out->f_mapping;
+ struct inode *inode = mapping->host;
+
+ ret = __splice_from_pipe(pipe, out, ppos, len, flags,
+ ocfs2_splice_write_actor);
+ if (ret > 0) {
+ *ppos += ret;
+
+ if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ err = generic_osync_inode(inode, mapping,
+ OSYNC_METADATA|OSYNC_DATA);
+ if (err)
+ ret = err;
+ }
+ }
+
return ret;
}
@@ -1239,14 +1703,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
goto out;
}
- ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0);
+ ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
+ NULL);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock;
}
/* ok, we're done with i_size and alloc work */
- ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
+ ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags);
out_unlock:
ocfs2_rw_unlock(inode, 1);
@@ -1323,7 +1788,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
}
rw_level = 0;
/* communicate with ocfs2_dio_end_io */
- ocfs2_iocb_set_rw_locked(iocb);
+ ocfs2_iocb_set_rw_locked(iocb, rw_level);
}
/*
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index cc973f01f6ce..2c4460fced52 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -39,12 +39,17 @@ enum ocfs2_alloc_restarted {
};
int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
struct inode *inode,
+ u32 *cluster_start,
u32 clusters_to_add,
struct buffer_head *fe_bh,
handle_t *handle,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason);
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
+ u32 clusters_to_add,
+ struct ocfs2_alloc_context **data_ac,
+ struct ocfs2_alloc_context **meta_ac);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 28ab56f2b98c..21a605079c62 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -89,24 +89,6 @@ void ocfs2_set_inode_flags(struct inode *inode)
inode->i_flags |= S_DIRSYNC;
}
-struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
- u64 blkno,
- int delete_vote)
-{
- struct ocfs2_find_inode_args args;
-
- /* ocfs2_ilookup_for_vote should *only* be called from the
- * vote thread */
- BUG_ON(current != osb->vote_task);
-
- args.fi_blkno = blkno;
- args.fi_flags = OCFS2_FI_FLAG_NOWAIT;
- if (delete_vote)
- args.fi_flags |= OCFS2_FI_FLAG_DELETE;
- args.fi_ino = ino_from_blkno(osb->sb, blkno);
- return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
-}
-
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
{
struct inode *inode = NULL;
@@ -182,28 +164,6 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
if (oi->ip_blkno != args->fi_blkno)
goto bail;
- /* OCFS2_FI_FLAG_NOWAIT is *only* set from
- * ocfs2_ilookup_for_vote which won't create an inode for one
- * that isn't found. The vote thread which doesn't want to get
- * an inode which is in the process of going away - otherwise
- * the call to __wait_on_freeing_inode in find_inode_fast will
- * cause it to deadlock on an inode which may be waiting on a
- * vote (or lock release) in delete_inode */
- if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
- (inode->i_state & (I_FREEING|I_CLEAR))) {
- /* As stated above, we're not going to return an
- * inode. In the case of a delete vote, the voting
- * code is going to signal the other node to go
- * ahead. Mark that state here, so this freeing inode
- * has the state when it gets to delete_inode. */
- if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
- spin_lock(&oi->ip_lock);
- ocfs2_mark_inode_remotely_deleted(inode);
- spin_unlock(&oi->ip_lock);
- }
- goto bail;
- }
-
ret = 1;
bail:
mlog_exit(ret);
@@ -261,6 +221,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
goto bail;
}
+ OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+ OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
+
inode->i_version = 1;
inode->i_generation = le32_to_cpu(fe->i_generation);
inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
@@ -272,8 +235,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
inode->i_blocks = 0;
else
- inode->i_blocks =
- ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_mapping->a_ops = &ocfs2_aops;
inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
@@ -288,10 +250,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)fe->i_blkno);
- OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
- OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
- OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
-
inode->i_nlink = le16_to_cpu(fe->i_links_count);
if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
@@ -347,6 +305,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
OCFS2_LOCK_TYPE_META, 0, inode);
+
+ ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+ OCFS2_LOCK_TYPE_OPEN, 0, inode);
}
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
@@ -421,7 +382,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
* cluster lock before trusting anything anyway.
*/
can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
- && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK)
+ && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
&& !ocfs2_mount_local(osb);
/*
@@ -438,7 +399,17 @@ static int ocfs2_read_locked_inode(struct inode *inode,
OCFS2_LOCK_TYPE_META,
generation, inode);
+ ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+ OCFS2_LOCK_TYPE_OPEN,
+ 0, inode);
+
if (can_lock) {
+ status = ocfs2_open_lock(inode);
+ if (status) {
+ make_bad_inode(inode);
+ mlog_errno(status);
+ return status;
+ }
status = ocfs2_meta_lock(inode, NULL, 0);
if (status) {
make_bad_inode(inode);
@@ -447,6 +418,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
}
+ if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
+ status = ocfs2_try_open_lock(inode, 0);
+ if (status) {
+ make_bad_inode(inode);
+ return status;
+ }
+ }
+
status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
can_lock ? inode : NULL);
if (status < 0) {
@@ -507,50 +486,56 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
struct buffer_head *fe_bh)
{
int status = 0;
- handle_t *handle = NULL;
struct ocfs2_truncate_context *tc = NULL;
struct ocfs2_dinode *fe;
+ handle_t *handle = NULL;
mlog_entry_void();
fe = (struct ocfs2_dinode *) fe_bh->b_data;
- /* zero allocation, zero truncate :) */
- if (!fe->i_clusters)
- goto bail;
+ if (fe->i_clusters) {
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out;
+ }
- handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- handle = NULL;
- mlog_errno(status);
- goto bail;
- }
+ status = ocfs2_journal_access(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
- status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ i_size_write(inode, 0);
- ocfs2_commit_trans(osb, handle);
- handle = NULL;
+ status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
- status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_commit_trans(osb, handle);
+ handle = NULL;
- status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
}
-bail:
+
+out:
if (handle)
ocfs2_commit_trans(osb, handle);
-
mlog_exit(status);
return status;
}
@@ -678,10 +663,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
struct inode *orphan_dir_inode = NULL;
struct buffer_head *orphan_dir_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_dinode *di;
- /* We've already voted on this so it should be readonly - no
- * spinlock needed. */
- orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+ di = (struct ocfs2_dinode *) di_bh->b_data;
+ orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
if (status)
@@ -839,11 +824,20 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
goto bail;
}
- status = ocfs2_request_delete_vote(inode);
- /* -EBUSY means that other nodes are still using the
- * inode. We're done here though, so avoid doing anything on
- * disk and let them worry about deleting it. */
- if (status == -EBUSY) {
+ /*
+ * This is how ocfs2 determines whether an inode is still live
+ * within the cluster. Every node takes a shared read lock on
+ * the inode open lock in ocfs2_read_locked_inode(). When we
+ * get to ->delete_inode(), each node tries to convert it's
+ * lock to an exclusive. Trylocks are serialized by the inode
+ * meta data lock. If the upconvert suceeds, we know the inode
+ * is no longer live and can be deleted.
+ *
+ * Though we call this with the meta data lock held, the
+ * trylock keeps us from ABBA deadlock.
+ */
+ status = ocfs2_try_open_lock(inode, 1);
+ if (status == -EAGAIN) {
status = 0;
mlog(0, "Skipping delete of %llu because it is in use on"
"other nodes\n", (unsigned long long)oi->ip_blkno);
@@ -854,21 +848,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
goto bail;
}
- spin_lock(&oi->ip_lock);
- if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
- /* Nobody knew which slot this inode was orphaned
- * into. This may happen during node death and
- * recovery knows how to clean it up so we can safely
- * ignore this inode for now on. */
- mlog(0, "Nobody knew where inode %llu was orphaned!\n",
- (unsigned long long)oi->ip_blkno);
- } else {
- *wipe = 1;
-
- mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n",
- (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot);
- }
- spin_unlock(&oi->ip_lock);
+ *wipe = 1;
+ mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
+ (unsigned long long)oi->ip_blkno,
+ le16_to_cpu(di->i_orphaned_slot));
bail:
return status;
@@ -1001,11 +984,16 @@ void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
"Inode=%lu\n", inode->i_ino);
+ /* For remove delete_inode vote, we hold open lock before,
+ * now it is time to unlock PR and EX open locks. */
+ ocfs2_open_unlock(inode);
+
/* Do these before all the other work so that we don't bounce
* the vote thread while waiting to destroy the locks. */
ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
+ ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
/* We very well may get a clear_inode before all an inodes
* metadata has hit disk. Of course, we can't drop any cluster
@@ -1020,8 +1008,7 @@ void ocfs2_clear_inode(struct inode *inode)
"Clear inode of %llu, inode has io markers\n",
(unsigned long long)oi->ip_blkno);
- ocfs2_extent_map_drop(inode, 0);
- ocfs2_extent_map_init(inode);
+ ocfs2_extent_map_trunc(inode, 0);
status = ocfs2_drop_inode_locks(inode);
if (status < 0)
@@ -1030,6 +1017,7 @@ void ocfs2_clear_inode(struct inode *inode)
ocfs2_lock_res_free(&oi->ip_rw_lockres);
ocfs2_lock_res_free(&oi->ip_meta_lockres);
ocfs2_lock_res_free(&oi->ip_data_lockres);
+ ocfs2_lock_res_free(&oi->ip_open_lockres);
ocfs2_metadata_cache_purge(inode);
@@ -1086,9 +1074,6 @@ void ocfs2_drop_inode(struct inode *inode)
mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
(unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
- /* Testing ip_orphaned_slot here wouldn't work because we may
- * not have gotten a delete_inode vote from any other nodes
- * yet. */
if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
generic_delete_inode(inode);
else
@@ -1121,8 +1106,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
return NULL;
}
- tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
- &p_blkno, NULL);
+ tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
+ NULL);
if (tmperr < 0) {
mlog_errno(tmperr);
goto fail;
@@ -1259,7 +1244,7 @@ void ocfs2_refresh_inode(struct inode *inode,
if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
inode->i_blocks = 0;
else
- inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode));
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1a7dd2945b34..03ae075869ee 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -26,6 +26,8 @@
#ifndef OCFS2_INODE_H
#define OCFS2_INODE_H
+#include "extent_map.h"
+
/* OCFS2 Inode Private Data */
struct ocfs2_inode_info
{
@@ -34,6 +36,7 @@ struct ocfs2_inode_info
struct ocfs2_lock_res ip_rw_lockres;
struct ocfs2_lock_res ip_meta_lockres;
struct ocfs2_lock_res ip_data_lockres;
+ struct ocfs2_lock_res ip_open_lockres;
/* protects allocation changes on this inode. */
struct rw_semaphore ip_alloc_sem;
@@ -42,9 +45,7 @@ struct ocfs2_inode_info
spinlock_t ip_lock;
u32 ip_open_count;
u32 ip_clusters;
- struct ocfs2_extent_map ip_map;
struct list_head ip_io_markers;
- int ip_orphaned_slot;
struct mutex ip_io_mutex;
@@ -64,6 +65,8 @@ struct ocfs2_inode_info
struct ocfs2_caching_info ip_metadata_cache;
+ struct ocfs2_extent_map ip_extent_map;
+
struct inode vfs_inode;
};
@@ -117,14 +120,9 @@ void ocfs2_delete_inode(struct inode *inode);
void ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
-#define OCFS2_FI_FLAG_NOWAIT 0x1
-#define OCFS2_FI_FLAG_DELETE 0x2
-#define OCFS2_FI_FLAG_SYSFILE 0x4
-#define OCFS2_FI_FLAG_NOLOCK 0x8
+#define OCFS2_FI_FLAG_SYSFILE 0x4
+#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
-struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
- u64 blkno,
- int delete_vote);
int ocfs2_inode_init_private(struct inode *inode);
int ocfs2_inode_revalidate(struct dentry *dentry);
int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
@@ -144,4 +142,11 @@ int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
void ocfs2_set_inode_flags(struct inode *inode);
+static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
+{
+ int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
+
+ return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
+}
+
#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 825cb0ae1b4c..5a8a90d1c787 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -649,29 +649,20 @@ bail:
static int ocfs2_force_read_journal(struct inode *inode)
{
int status = 0;
- int i, p_blocks;
- u64 v_blkno, p_blkno;
-#define CONCURRENT_JOURNAL_FILL 32
+ int i;
+ u64 v_blkno, p_blkno, p_blocks, num_blocks;
+#define CONCURRENT_JOURNAL_FILL 32ULL
struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
mlog_entry_void();
- BUG_ON(inode->i_blocks !=
- ocfs2_align_bytes_to_sectors(i_size_read(inode)));
-
memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
- mlog(0, "Force reading %llu blocks\n",
- (unsigned long long)(inode->i_blocks >>
- (inode->i_sb->s_blocksize_bits - 9)));
-
+ num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
v_blkno = 0;
- while (v_blkno <
- (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
-
+ while (v_blkno < num_blocks) {
status = ocfs2_extent_map_get_blocks(inode, v_blkno,
- 1, &p_blkno,
- &p_blocks);
+ &p_blkno, &p_blocks, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1306,7 +1297,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
continue;
iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
- OCFS2_FI_FLAG_NOLOCK);
+ OCFS2_FI_FLAG_ORPHAN_RECOVERY);
if (IS_ERR(iter))
continue;
@@ -1418,7 +1409,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
/* Set the proper information to get us going into
* ocfs2_delete_inode. */
oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
- oi->ip_orphaned_slot = slot;
spin_unlock(&oi->ip_lock);
iput(inode);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d026b4f27757..3db5de4506da 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
/* We may be deleting metadata blocks, so metadata alloc dinode +
one desc. block for each possible delete. */
if (tree_depth && next_free == 1 &&
- le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
+ ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
credits += 1 + tree_depth;
/* update to the truncate log. */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 51b020447683..af01158b39f5 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -85,8 +85,11 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
int ret = 0, lock_level = 0;
struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb);
- /* We don't want to support shared writable mappings yet. */
- if (!ocfs2_mount_local(osb) &&
+ /*
+ * Only support shared writeable mmap for local mounts which
+ * don't know about holes.
+ */
+ if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) &&
((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) &&
((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 28dd757ff67d..2bcf353fd7c5 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -175,8 +175,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
if (IS_ERR(inode)) {
- mlog(ML_ERROR, "Unable to create inode %llu\n",
- (unsigned long long)blkno);
ret = ERR_PTR(-EACCES);
goto bail_unlock;
}
@@ -189,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
* unlink. */
spin_lock(&oi->ip_lock);
oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
- oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
spin_unlock(&oi->ip_lock);
bail_add:
@@ -288,7 +285,7 @@ static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
i_size_write(inode, inode->i_sb->s_blocksize);
inode->i_nlink = 2;
- inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
mlog_errno(status);
@@ -1486,8 +1483,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
struct buffer_head **bhs = NULL;
const char *c;
struct super_block *sb = osb->sb;
- u64 p_blkno;
- int p_blocks;
+ u64 p_blkno, p_blocks;
int virtual, blocks, status, i, bytes_left;
bytes_left = i_size_read(inode) + 1;
@@ -1514,8 +1510,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno,
- &p_blocks);
+ status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks,
+ NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1674,8 +1670,11 @@ static int ocfs2_symlink(struct inode *dir,
inode->i_rdev = 0;
newsize = l - 1;
if (l > ocfs2_fast_symlink_chars(sb)) {
+ u32 offset = 0;
+
inode->i_op = &ocfs2_symlink_inode_operations;
- status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
+ status = ocfs2_do_extend_allocation(osb, inode, &offset, 1,
+ new_fe_bh,
handle, data_ac, NULL,
NULL);
if (status < 0) {
@@ -1689,7 +1688,7 @@ static int ocfs2_symlink(struct inode *dir,
goto bail;
}
i_size_write(inode, newsize);
- inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
} else {
inode->i_op = &ocfs2_fast_symlink_inode_operations;
memcpy((char *) fe->id2.i_symlink, symname, l);
@@ -2222,9 +2221,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
/* Record which orphan dir our inode now resides
* in. delete_inode will use this to determine which orphan
* dir to lock. */
- spin_lock(&OCFS2_I(inode)->ip_lock);
- OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
+ fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
mlog(0, "Inode %llu orphaned in slot %d\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index db8e77cd35d3..82cc92dcf8a6 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -46,11 +46,6 @@
#include "endian.h"
#include "ocfs2_lockid.h"
-struct ocfs2_extent_map {
- u32 em_clusters;
- struct rb_root em_extents;
-};
-
/* Most user visible OCFS2 inodes will have very few pieces of
* metadata, but larger files (including bitmaps, etc) must be taken
* into account when designing an access scheme. We allow a small
@@ -303,6 +298,13 @@ static inline int ocfs2_should_order_data(struct inode *inode)
return 1;
}
+static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
+ return 1;
+ return 0;
+}
+
/* set / clear functions because cluster events can make these happen
* in parallel so we want the transitions to be atomic. this also
* means that any future flags osb_flags must be protected by spinlock
@@ -461,6 +463,49 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
return (unsigned long)((bytes + 511) >> 9);
}
+static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
+ unsigned long pg_index)
+{
+ u32 clusters = pg_index;
+ unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+
+ if (unlikely(PAGE_CACHE_SHIFT > cbits))
+ clusters = pg_index << (PAGE_CACHE_SHIFT - cbits);
+ else if (PAGE_CACHE_SHIFT < cbits)
+ clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT);
+
+ return clusters;
+}
+
+/*
+ * Find the 1st page index which covers the given clusters.
+ */
+static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb,
+ u32 clusters)
+{
+ unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+ unsigned long index = clusters;
+
+ if (PAGE_CACHE_SHIFT > cbits) {
+ index = clusters >> (PAGE_CACHE_SHIFT - cbits);
+ } else if (PAGE_CACHE_SHIFT < cbits) {
+ index = clusters << (cbits - PAGE_CACHE_SHIFT);
+ }
+
+ return index;
+}
+
+static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
+{
+ unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+ unsigned int pages_per_cluster = 1;
+
+ if (PAGE_CACHE_SHIFT < cbits)
+ pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT);
+
+ return pages_per_cluster;
+}
+
#define ocfs2_set_bit ext2_set_bit
#define ocfs2_clear_bit ext2_clear_bit
#define ocfs2_test_bit ext2_test_bit
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index e61e218f5e0b..71306479c68f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -86,7 +86,8 @@
OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
-#define OCFS2_FEATURE_INCOMPAT_SUPP OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT
+#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
+ | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
#define OCFS2_FEATURE_RO_COMPAT_SUPP 0
/*
@@ -155,6 +156,12 @@
#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */
/*
+ * Extent record flags (e_node.leaf.flags)
+ */
+#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but
+ * unwritten */
+
+/*
* ioctl commands
*/
#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
@@ -282,10 +289,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
/*
* On disk extent record for OCFS2
* It describes a range of clusters on disk.
+ *
+ * Length fields are divided into interior and leaf node versions.
+ * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes.
*/
struct ocfs2_extent_rec {
/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */
- __le32 e_clusters; /* Clusters covered by this extent */
+ union {
+ __le32 e_int_clusters; /* Clusters covered by all children */
+ struct {
+ __le16 e_leaf_clusters; /* Clusters covered by this
+ extent */
+ __u8 e_reserved1;
+ __u8 e_flags; /* Extent flags */
+ };
+ };
__le64 e_blkno; /* Physical disk offset, in blocks */
/*10*/
};
@@ -311,7 +329,10 @@ struct ocfs2_extent_list {
/*00*/ __le16 l_tree_depth; /* Extent tree depth from this
point. 0 means data extents
hang directly off this
- header (a leaf) */
+ header (a leaf)
+ NOTE: The high 8 bits cannot be
+ used - tree_depth is never that big.
+ */
__le16 l_count; /* Number of extent records */
__le16 l_next_free_rec; /* Next unused extent slot */
__le16 l_reserved1;
@@ -446,7 +467,9 @@ struct ocfs2_dinode {
__le32 i_ctime_nsec;
__le32 i_mtime_nsec;
__le32 i_attr;
- __le32 i_reserved1;
+ __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
+ was set in i_flags */
+ __le16 i_reserved1;
/*70*/ __le64 i_reserved2[8];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4d5d5655c185..4ca02b1c38ac 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -44,6 +44,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_RENAME,
OCFS2_LOCK_TYPE_RW,
OCFS2_LOCK_TYPE_DENTRY,
+ OCFS2_LOCK_TYPE_OPEN,
OCFS2_NUM_LOCK_TYPES
};
@@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_DENTRY:
c = 'N';
break;
+ case OCFS2_LOCK_TYPE_OPEN:
+ c = 'O';
+ break;
default:
c = '\0';
}
@@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = {
* important job it does, anyway. */
[OCFS2_LOCK_TYPE_RW] = "Write/Read",
[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
+ [OCFS2_LOCK_TYPE_OPEN] = "Open",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 2d3ac32cb74e..d921a28329dc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
goto bail;
}
- status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
+ status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 6dbb11762759..0da655ae5d6f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -381,8 +381,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
le32_to_cpu(fe->i_clusters)));
spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
- alloc_inode->i_blocks =
- ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
+ alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
status = 0;
bail:
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 6534f92424dd..5c9e8243691f 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -806,9 +806,6 @@ static int __init ocfs2_init(void)
ocfs2_print_version();
- if (init_ocfs2_extent_maps())
- return -ENOMEM;
-
status = init_ocfs2_uptodate_cache();
if (status < 0) {
mlog_errno(status);
@@ -837,7 +834,6 @@ leave:
if (status < 0) {
ocfs2_free_mem_caches();
exit_ocfs2_uptodate_cache();
- exit_ocfs2_extent_maps();
}
mlog_exit(status);
@@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void)
unregister_filesystem(&ocfs2_fs_type);
- exit_ocfs2_extent_maps();
-
exit_ocfs2_uptodate_cache();
mlog_exit_void();
@@ -963,6 +957,7 @@ static void ocfs2_inode_init_once(void *data,
ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+ ocfs2_lock_res_init_once(&oi->ip_open_lockres);
ocfs2_metadata_cache_init(&oi->vfs_inode);
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index f30e63b9910c..4f82a2f0efef 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -63,17 +63,10 @@ struct ocfs2_msg_hdr
__be32 h_node_num; /* node sending this particular message. */
};
-/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
- * for the network. */
-#define OCFS2_VOTE_FILENAME_LEN 256
struct ocfs2_vote_msg
{
struct ocfs2_msg_hdr v_hdr;
- union {
- __be32 v_generic1;
- __be32 v_orphaned_slot; /* Used during delete votes */
- __be32 v_nlink; /* Used during unlink votes */
- } md1; /* Message type dependant 1 */
+ __be32 v_reserved1;
};
/* Responses are given these values to maintain backwards
@@ -86,7 +79,6 @@ struct ocfs2_response_msg
{
struct ocfs2_msg_hdr r_hdr;
__be32 r_response;
- __be32 r_orphaned_slot;
};
struct ocfs2_vote_work {
@@ -96,7 +88,6 @@ struct ocfs2_vote_work {
enum ocfs2_vote_request {
OCFS2_VOTE_REQ_INVALID = 0,
- OCFS2_VOTE_REQ_DELETE,
OCFS2_VOTE_REQ_MOUNT,
OCFS2_VOTE_REQ_UMOUNT,
OCFS2_VOTE_REQ_LAST
@@ -151,135 +142,23 @@ static void ocfs2_process_umount_request(struct ocfs2_super *osb,
ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
}
-void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
-{
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
-
- assert_spin_locked(&oi->ip_lock);
- /* We set the SKIP_DELETE flag on the inode so we don't try to
- * delete it in delete_inode ourselves, thus avoiding
- * unecessary lock pinging. If the other node failed to wipe
- * the inode as a result of a crash, then recovery will pick
- * up the slack. */
- oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE;
-}
-
-static int ocfs2_process_delete_request(struct inode *inode,
- int *orphaned_slot)
-{
- int response = OCFS2_RESPONSE_BUSY;
-
- mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
- inode->i_ino, inode->i_nlink, *orphaned_slot);
-
- spin_lock(&OCFS2_I(inode)->ip_lock);
-
- /* Whatever our vote response is, we want to make sure that
- * the orphaned slot is recorded properly on this node *and*
- * on the requesting node. Technically, if the requesting node
- * did not know which slot the inode is orphaned in but we
- * respond with BUSY he doesn't actually need the orphaned
- * slot, but it doesn't hurt to do it here anyway. */
- if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
- mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
- OCFS2_INVALID_SLOT &&
- OCFS2_I(inode)->ip_orphaned_slot !=
- (*orphaned_slot),
- "Inode %llu: This node thinks it's "
- "orphaned in slot %d, messaged it's in %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- OCFS2_I(inode)->ip_orphaned_slot,
- *orphaned_slot);
-
- mlog(0, "Setting orphaned slot for inode %llu to %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- *orphaned_slot);
-
- OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
- } else {
- mlog(0, "Sending back orphaned slot %d for inode %llu\n",
- OCFS2_I(inode)->ip_orphaned_slot,
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
-
- *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
- }
-
- /* vote no if the file is still open. */
- if (OCFS2_I(inode)->ip_open_count) {
- mlog(0, "open count = %u\n",
- OCFS2_I(inode)->ip_open_count);
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- goto done;
- }
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
- /* directories are a bit ugly... What if someone is sitting in
- * it? We want to make sure the inode is removed completely as
- * a result of the iput in process_vote. */
- if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
- mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
- goto done;
- }
-
- if (filemap_fdatawrite(inode->i_mapping)) {
- mlog(ML_ERROR, "Could not sync inode %llu for delete!\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- goto done;
- }
- sync_mapping_buffers(inode->i_mapping);
- truncate_inode_pages(inode->i_mapping, 0);
- ocfs2_extent_map_trunc(inode, 0);
-
- spin_lock(&OCFS2_I(inode)->ip_lock);
- /* double check open count - someone might have raced this
- * thread into ocfs2_file_open while we were writing out
- * data. If we're to allow a wipe of this inode now, we *must*
- * hold the spinlock until we've marked it. */
- if (OCFS2_I(inode)->ip_open_count) {
- mlog(0, "Raced to wipe! open count = %u\n",
- OCFS2_I(inode)->ip_open_count);
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- goto done;
- }
-
- /* Mark the inode as being wiped from disk. */
- ocfs2_mark_inode_remotely_deleted(inode);
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
- /* Not sure this is necessary anymore. */
- d_prune_aliases(inode);
-
- /* If we get here, then we're voting 'yes', so commit the
- * delete on our side. */
- response = OCFS2_RESPONSE_OK;
-done:
- return response;
-}
-
static void ocfs2_process_vote(struct ocfs2_super *osb,
struct ocfs2_vote_msg *msg)
{
int net_status, vote_response;
- int orphaned_slot = 0;
- unsigned int node_num, generation;
+ unsigned int node_num;
u64 blkno;
enum ocfs2_vote_request request;
- struct inode *inode = NULL;
struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
struct ocfs2_response_msg response;
/* decode the network mumbo jumbo into local variables. */
request = be32_to_cpu(hdr->h_request);
blkno = be64_to_cpu(hdr->h_blkno);
- generation = be32_to_cpu(hdr->h_generation);
node_num = be32_to_cpu(hdr->h_node_num);
- if (request == OCFS2_VOTE_REQ_DELETE)
- orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);
- mlog(0, "processing vote: request = %u, blkno = %llu, "
- "generation = %u, node_num = %u, priv1 = %u\n", request,
- (unsigned long long)blkno, generation, node_num,
- be32_to_cpu(msg->md1.v_generic1));
+ mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
+ request, (unsigned long long)blkno, node_num);
if (!ocfs2_is_valid_vote_request(request)) {
mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
@@ -302,52 +181,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb,
break;
}
- /* We cannot process the remaining message types before we're
- * fully mounted. It's perfectly safe however to send a 'yes'
- * response as we can't possibly have any of the state they're
- * asking us to modify yet. */
- if (atomic_read(&osb->vol_state) == VOLUME_INIT)
- goto respond;
-
- /* If we get here, then the request is against an inode. */
- inode = ocfs2_ilookup_for_vote(osb, blkno,
- request == OCFS2_VOTE_REQ_DELETE);
-
- /* Not finding the inode is perfectly valid - it means we're
- * not interested in what the other node is about to do to it
- * so in those cases we automatically respond with an
- * affirmative. Cluster locking ensures that we won't race
- * interest in the inode with this vote request. */
- if (!inode)
- goto respond;
-
- /* Check generation values. It's possible for us to get a
- * request against a stale inode. If so then we proceed as if
- * we had not found an inode in the first place. */
- if (inode->i_generation != generation) {
- mlog(0, "generation passed %u != inode generation = %u, "
- "ip_flags = %x, ip_blkno = %llu, msg %llu, i_count = %u, "
- "message type = %u\n", generation, inode->i_generation,
- OCFS2_I(inode)->ip_flags,
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)blkno, atomic_read(&inode->i_count),
- request);
- iput(inode);
- inode = NULL;
- goto respond;
- }
-
- switch (request) {
- case OCFS2_VOTE_REQ_DELETE:
- vote_response = ocfs2_process_delete_request(inode,
- &orphaned_slot);
- break;
- default:
- mlog(ML_ERROR, "node %u, invalid request: %u\n",
- node_num, request);
- vote_response = OCFS2_RESPONSE_BAD_MSG;
- }
-
respond:
/* Response struture is small so we just put it on the stack
* and stuff it inline. */
@@ -357,7 +190,6 @@ respond:
response.r_hdr.h_generation = hdr->h_generation;
response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
response.r_response = cpu_to_be32(vote_response);
- response.r_orphaned_slot = cpu_to_be32(orphaned_slot);
net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
osb->net_key,
@@ -373,9 +205,6 @@ respond:
&& net_status != -ENOTCONN)
mlog(ML_ERROR, "message to node %u fails with error %d!\n",
node_num, net_status);
-
- if (inode)
- iput(inode);
}
static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
@@ -634,8 +463,7 @@ bail:
static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
u64 blkno,
unsigned int generation,
- enum ocfs2_vote_request type,
- u32 priv)
+ enum ocfs2_vote_request type)
{
struct ocfs2_vote_msg *request;
struct ocfs2_msg_hdr *hdr;
@@ -651,8 +479,6 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
hdr->h_request = cpu_to_be32(type);
hdr->h_blkno = cpu_to_be64(blkno);
hdr->h_generation = cpu_to_be32(generation);
-
- request->md1.v_generic1 = cpu_to_be32(priv);
}
return request;
@@ -664,7 +490,7 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb,
struct ocfs2_vote_msg *request,
struct ocfs2_net_response_cb *callback)
{
- int status, response;
+ int status, response = -EBUSY;
unsigned int response_id;
struct ocfs2_msg_hdr *hdr;
@@ -686,109 +512,12 @@ bail:
return status;
}
-static int ocfs2_request_vote(struct inode *inode,
- struct ocfs2_vote_msg *request,
- struct ocfs2_net_response_cb *callback)
-{
- int status;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
- if (ocfs2_inode_is_new(inode))
- return 0;
-
- status = -EAGAIN;
- while (status == -EAGAIN) {
- if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
- signal_pending(current))
- return -ERESTARTSYS;
-
- status = ocfs2_super_lock(osb, 0);
- if (status < 0) {
- mlog_errno(status);
- break;
- }
-
- status = 0;
- if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
- osb->node_num))
- status = ocfs2_do_request_vote(osb, request, callback);
-
- ocfs2_super_unlock(osb, 0);
- }
- return status;
-}
-
-static void ocfs2_delete_response_cb(void *priv,
- struct ocfs2_response_msg *resp)
-{
- int orphaned_slot, node;
- struct inode *inode = priv;
-
- orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
- node = be32_to_cpu(resp->r_hdr.h_node_num);
- mlog(0, "node %d tells us that inode %llu is orphaned in slot %d\n",
- node, (unsigned long long)OCFS2_I(inode)->ip_blkno,
- orphaned_slot);
-
- /* The other node may not actually know which slot the inode
- * is orphaned in. */
- if (orphaned_slot == OCFS2_INVALID_SLOT)
- return;
-
- /* Ok, the responding node knows which slot this inode is
- * orphaned in. We verify that the information is correct and
- * then record this in the inode. ocfs2_delete_inode will use
- * this information to determine which lock to take. */
- spin_lock(&OCFS2_I(inode)->ip_lock);
- mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
- OCFS2_I(inode)->ip_orphaned_slot
- != OCFS2_INVALID_SLOT, "Inode %llu: Node %d says it's "
- "orphaned in slot %d, we think it's in %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- be32_to_cpu(resp->r_hdr.h_node_num),
- orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);
-
- OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-}
-
-int ocfs2_request_delete_vote(struct inode *inode)
-{
- int orphaned_slot, status;
- struct ocfs2_net_response_cb delete_cb;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_vote_msg *request;
-
- spin_lock(&OCFS2_I(inode)->ip_lock);
- orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
- delete_cb.rc_cb = ocfs2_delete_response_cb;
- delete_cb.rc_priv = inode;
-
- mlog(0, "Inode %llu, we start thinking orphaned slot is %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, orphaned_slot);
-
- status = -ENOMEM;
- request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
- inode->i_generation,
- OCFS2_VOTE_REQ_DELETE, orphaned_slot);
- if (request) {
- status = ocfs2_request_vote(inode, request, &delete_cb);
-
- kfree(request);
- }
-
- return status;
-}
-
int ocfs2_request_mount_vote(struct ocfs2_super *osb)
{
int status;
struct ocfs2_vote_msg *request = NULL;
- request = ocfs2_new_vote_request(osb, 0ULL, 0,
- OCFS2_VOTE_REQ_MOUNT, 0);
+ request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
if (!request) {
status = -ENOMEM;
goto bail;
@@ -821,8 +550,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb)
int status;
struct ocfs2_vote_msg *request = NULL;
- request = ocfs2_new_vote_request(osb, 0ULL, 0,
- OCFS2_VOTE_REQ_UMOUNT, 0);
+ request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
if (!request) {
status = -ENOMEM;
goto bail;
@@ -969,7 +697,6 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg,
be32_to_cpu(work->w_msg.v_hdr.h_generation));
mlog(0, "h_node_num = %u\n",
be32_to_cpu(work->w_msg.v_hdr.h_node_num));
- mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));
spin_lock(&osb->vote_task_lock);
list_add_tail(&work->w_list, &osb->vote_list);
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h
index 53ebc1c69e56..9ea46f62de31 100644
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/vote.h
@@ -38,14 +38,11 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
wake_up(&osb->vote_event);
}
-int ocfs2_request_delete_vote(struct inode *inode);
int ocfs2_request_mount_vote(struct ocfs2_super *osb);
int ocfs2_request_umount_vote(struct ocfs2_super *osb);
int ocfs2_register_net_handlers(struct ocfs2_super *osb);
void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
-void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
-
void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
int node_num);
#endif
diff --git a/fs/sync.c b/fs/sync.c
index d0feff61e6aa..5cb9e7e43383 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -239,13 +239,11 @@ out:
/*
* `endbyte' is inclusive
*/
-int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
- unsigned int flags)
+int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
+ loff_t endbyte, unsigned int flags)
{
int ret;
- struct address_space *mapping;
- mapping = file->f_mapping;
if (!mapping) {
ret = -EINVAL;
goto out;
@@ -275,4 +273,4 @@ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
out:
return ret;
}
-EXPORT_SYMBOL_GPL(do_sync_file_range);
+EXPORT_SYMBOL_GPL(do_sync_mapping_range);