diff options
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r-- | Documentation/filesystems/Locking | 8 | ||||
-rw-r--r-- | Documentation/filesystems/ceph.txt | 18 | ||||
-rw-r--r-- | Documentation/filesystems/configfs/configfs.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/debugfs.txt | 61 | ||||
-rw-r--r-- | Documentation/filesystems/ext4.txt | 19 | ||||
-rw-r--r-- | Documentation/filesystems/files.txt | 4 | ||||
-rw-r--r-- | Documentation/filesystems/gfs2-uevents.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/nfs/00-INDEX | 2 | ||||
-rw-r--r-- | Documentation/filesystems/nfs/fault_injection.txt | 69 | ||||
-rw-r--r-- | Documentation/filesystems/nfs/idmapper.txt | 20 | ||||
-rw-r--r-- | Documentation/filesystems/nfs/pnfs.txt | 54 | ||||
-rw-r--r-- | Documentation/filesystems/pohmelfs/network_protocol.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/porting | 6 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 74 | ||||
-rw-r--r-- | Documentation/filesystems/qnx6.txt | 174 | ||||
-rw-r--r-- | Documentation/filesystems/ramfs-rootfs-initramfs.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/squashfs.txt | 6 | ||||
-rw-r--r-- | Documentation/filesystems/sysfs.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/vfs.txt | 10 |
19 files changed, 479 insertions, 56 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index d819ba16a0c7..4fca82e5276e 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -37,15 +37,15 @@ d_manage: no no yes (ref-walk) maybe --------------------------- inode_operations --------------------------- prototypes: - int (*create) (struct inode *,struct dentry *,int, struct nameidata *); + int (*create) (struct inode *,struct dentry *,umode_t, struct nameidata *); struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameid ata *); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct inode *,struct dentry *,const char *); - int (*mkdir) (struct inode *,struct dentry *,int); + int (*mkdir) (struct inode *,struct dentry *,umode_t); int (*rmdir) (struct inode *,struct dentry *); - int (*mknod) (struct inode *,struct dentry *,int,dev_t); + int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); int (*readlink) (struct dentry *, char __user *,int); @@ -117,7 +117,7 @@ prototypes: int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*umount_begin) (struct super_block *); - int (*show_options)(struct seq_file *, struct vfsmount *); + int (*show_options)(struct seq_file *, struct dentry *); ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index 763d8ebbbebd..d6030aa33376 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt @@ -119,12 +119,20 @@ Mount Options must rely on TCP's error correction to detect data corruption in the data payload. - noasyncreaddir - Disable client's use its local cache to satisfy readdir - requests. (This does not change correctness; the client uses - cached metadata only when a lease or capability ensures it is - valid.) + dcache + Use the dcache contents to perform negative lookups and + readdir when the client has the entire directory contents in + its cache. (This does not change correctness; the client uses + cached metadata only when a lease or capability ensures it is + valid.) + + nodcache + Do not use the dcache as above. This avoids a significant amount of + complex code, sacrificing performance without affecting correctness, + and is useful for tracking down bugs. + noasyncreaddir + Do not use the dcache as above for readdir. More Information ================ diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt index dd57bb6bb390..b40fec9d3f53 100644 --- a/Documentation/filesystems/configfs/configfs.txt +++ b/Documentation/filesystems/configfs/configfs.txt @@ -192,7 +192,7 @@ attribute value uses the store_attribute() method. struct configfs_attribute { char *ca_name; struct module *ca_owner; - mode_t ca_mode; + umode_t ca_mode; }; When a config_item wants an attribute to appear as a file in the item's diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt index 742cc06e138f..7a34f827989c 100644 --- a/Documentation/filesystems/debugfs.txt +++ b/Documentation/filesystems/debugfs.txt @@ -14,7 +14,10 @@ Debugfs is typically mounted with a command like: mount -t debugfs none /sys/kernel/debug -(Or an equivalent /etc/fstab line). +(Or an equivalent /etc/fstab line). +The debugfs root directory is accessible by anyone by default. To +restrict access to the tree the "uid", "gid" and "mode" mount +options can be used. Note that the debugfs API is exported GPL-only to modules. @@ -35,7 +38,7 @@ described below will work. The most general way to create a file within a debugfs directory is with: - struct dentry *debugfs_create_file(const char *name, mode_t mode, + struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); @@ -53,13 +56,13 @@ actually necessary; the debugfs code provides a number of helper functions for simple situations. Files containing a single integer value can be created with any of: - struct dentry *debugfs_create_u8(const char *name, mode_t mode, + struct dentry *debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value); - struct dentry *debugfs_create_u16(const char *name, mode_t mode, + struct dentry *debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent, u16 *value); - struct dentry *debugfs_create_u32(const char *name, mode_t mode, + struct dentry *debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent, u32 *value); - struct dentry *debugfs_create_u64(const char *name, mode_t mode, + struct dentry *debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent, u64 *value); These files support both reading and writing the given value; if a specific @@ -67,13 +70,13 @@ file should not be written to, simply set the mode bits accordingly. The values in these files are in decimal; if hexadecimal is more appropriate, the following functions can be used instead: - struct dentry *debugfs_create_x8(const char *name, mode_t mode, + struct dentry *debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent, u8 *value); - struct dentry *debugfs_create_x16(const char *name, mode_t mode, + struct dentry *debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent, u16 *value); - struct dentry *debugfs_create_x32(const char *name, mode_t mode, + struct dentry *debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent, u32 *value); - struct dentry *debugfs_create_x64(const char *name, mode_t mode, + struct dentry *debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent, u64 *value); These functions are useful as long as the developer knows the size of the @@ -81,7 +84,7 @@ value to be exported. Some types can have different widths on different architectures, though, complicating the situation somewhat. There is a function meant to help out in one special case: - struct dentry *debugfs_create_size_t(const char *name, mode_t mode, + struct dentry *debugfs_create_size_t(const char *name, umode_t mode, struct dentry *parent, size_t *value); @@ -90,21 +93,22 @@ a variable of type size_t. Boolean values can be placed in debugfs with: - struct dentry *debugfs_create_bool(const char *name, mode_t mode, + struct dentry *debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, u32 *value); A read on the resulting file will yield either Y (for non-zero values) or N, followed by a newline. If written to, it will accept either upper- or lower-case values, or 1 or 0. Any other input will be silently ignored. -Finally, a block of arbitrary binary data can be exported with: +Another option is exporting a block of arbitrary binary data, with +this structure and function: struct debugfs_blob_wrapper { void *data; unsigned long size; }; - struct dentry *debugfs_create_blob(const char *name, mode_t mode, + struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob); @@ -115,6 +119,35 @@ can be used to export binary information, but there does not appear to be any code which does so in the mainline. Note that all files created with debugfs_create_blob() are read-only. +If you want to dump a block of registers (something that happens quite +often during development, even if little such code reaches mainline. +Debugfs offers two functions: one to make a registers-only file, and +another to insert a register block in the middle of another sequential +file. + + struct debugfs_reg32 { + char *name; + unsigned long offset; + }; + + struct debugfs_regset32 { + struct debugfs_reg32 *regs; + int nregs; + void __iomem *base; + }; + + struct dentry *debugfs_create_regset32(const char *name, umode_t mode, + struct dentry *parent, + struct debugfs_regset32 *regset); + + int debugfs_print_regs32(struct seq_file *s, struct debugfs_reg32 *regs, + int nregs, void __iomem *base, char *prefix); + +The "base" argument may be 0, but you may want to build the reg32 array +using __stringify, and a number of register names (macros) are actually +byte offsets over a base for the register block. + + There are a couple of other directory-oriented helper functions: struct dentry *debugfs_rename(struct dentry *old_dir, diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 4917cf24a5e0..1b7f9acbcbbe 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -144,9 +144,6 @@ journal_async_commit Commit block can be written to disk without waiting mount the device. This will enable 'journal_checksum' internally. -journal=update Update the ext4 file system's journal to the current - format. - journal_dev=devnum When the external journal device's major/minor numbers have changed, this option allows the user to specify the new journal location. The journal device is @@ -308,7 +305,7 @@ min_batch_time=usec This parameter sets the commit time (as fast disks, at the cost of increasing latency. journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the - highest priorty) which should be used for I/O + highest priority) which should be used for I/O operations submitted by kjournald2 during a commit operation. This defaults to 3, which is a slightly higher priority than the default I/O @@ -343,7 +340,7 @@ noinit_itable Do not initialize any uninitialized inode table init_itable=n The lazy itable init code will wait n times the number of milliseconds it took to zero out the previous block group's inode table. This - minimizes the impact on the systme performance + minimizes the impact on the system performance while file system's inode table is being initialized. discard Controls whether ext4 should issue discard/TRIM @@ -356,11 +353,6 @@ nouid32 Disables 32-bit UIDs and GIDs. This is for interoperability with older kernels which only store and expect 16-bit values. -resize Allows to resize filesystem to the end of the last - existing block group, further resize has to be done - with resize2fs either online, or offline. It can be - used only with conjunction with remount. - block_validity This options allows to enables/disables the in-kernel noblock_validity facility for tracking filesystem metadata blocks within internal data structures. This allows multi- @@ -581,6 +573,13 @@ Table of Ext4 specific ioctls behaviour may change in the future as it is not necessary and has been done this way only for sake of simplicity. + + EXT4_IOC_RESIZE_FS Resize the filesystem to a new size. The number + of blocks of resized filesystem is passed in via + 64 bit integer argument. The kernel allocates + bitmaps and inode table, the userspace tool thus + just passes the new number of blocks. + .............................................................................. References diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.txt index ac2facc50d2a..46dfc6b038c3 100644 --- a/Documentation/filesystems/files.txt +++ b/Documentation/filesystems/files.txt @@ -113,8 +113,8 @@ the fdtable structure - if (fd >= 0) { /* locate_fd() may have expanded fdtable, load the ptr */ fdt = files_fdtable(files); - FD_SET(fd, fdt->open_fds); - FD_CLR(fd, fdt->close_on_exec); + __set_open_fd(fd, fdt); + __clear_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); ..... diff --git a/Documentation/filesystems/gfs2-uevents.txt b/Documentation/filesystems/gfs2-uevents.txt index d81889669293..19a19ebebc34 100644 --- a/Documentation/filesystems/gfs2-uevents.txt +++ b/Documentation/filesystems/gfs2-uevents.txt @@ -62,7 +62,7 @@ be fixed. The REMOVE uevent is generated at the end of an unsuccessful mount or at the end of a umount of the filesystem. All REMOVE uevents will -have been preceded by at least an ADD uevent for the same fileystem, +have been preceded by at least an ADD uevent for the same filesystem, and unlike the other uevents is generated automatically by the kernel's kobject subsystem. diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX index a57e12411d2a..1716874a651e 100644 --- a/Documentation/filesystems/nfs/00-INDEX +++ b/Documentation/filesystems/nfs/00-INDEX @@ -2,6 +2,8 @@ - this file (nfs-related documentation). Exporting - explanation of how to make filesystems exportable. +fault_injection.txt + - information for using fault injection on the server knfsd-stats.txt - statistics which the NFS server makes available to user space. nfs.txt diff --git a/Documentation/filesystems/nfs/fault_injection.txt b/Documentation/filesystems/nfs/fault_injection.txt new file mode 100644 index 000000000000..426d166089a3 --- /dev/null +++ b/Documentation/filesystems/nfs/fault_injection.txt @@ -0,0 +1,69 @@ + +Fault Injection +=============== +Fault injection is a method for forcing errors that may not normally occur, or +may be difficult to reproduce. Forcing these errors in a controlled environment +can help the developer find and fix bugs before their code is shipped in a +production system. Injecting an error on the Linux NFS server will allow us to +observe how the client reacts and if it manages to recover its state correctly. + +NFSD_FAULT_INJECTION must be selected when configuring the kernel to use this +feature. + + +Using Fault Injection +===================== +On the client, mount the fault injection server through NFS v4.0+ and do some +work over NFS (open files, take locks, ...). + +On the server, mount the debugfs filesystem to <debug_dir> and ls +<debug_dir>/nfsd. This will show a list of files that will be used for +injecting faults on the NFS server. As root, write a number n to the file +corresponding to the action you want the server to take. The server will then +process the first n items it finds. So if you want to forget 5 locks, echo '5' +to <debug_dir>/nfsd/forget_locks. A value of 0 will tell the server to forget +all corresponding items. A log message will be created containing the number +of items forgotten (check dmesg). + +Go back to work on the client and check if the client recovered from the error +correctly. + + +Available Faults +================ +forget_clients: + The NFS server keeps a list of clients that have placed a mount call. If + this list is cleared, the server will have no knowledge of who the client + is, forcing the client to reauthenticate with the server. + +forget_openowners: + The NFS server keeps a list of what files are currently opened and who + they were opened by. Clearing this list will force the client to reopen + its files. + +forget_locks: + The NFS server keeps a list of what files are currently locked in the VFS. + Clearing this list will force the client to reclaim its locks (files are + unlocked through the VFS as they are cleared from this list). + +forget_delegations: + A delegation is used to assure the client that a file, or part of a file, + has not changed since the delegation was awarded. Clearing this list will + force the client to reaquire its delegation before accessing the file + again. + +recall_delegations: + Delegations can be recalled by the server when another client attempts to + access a file. This test will notify the client that its delegation has + been revoked, forcing the client to reaquire the delegation before using + the file again. + + +tools/nfs/inject_faults.sh script +================================= +This script has been created to ease the fault injection process. This script +will detect the mounted debugfs directory and write to the files located there +based on the arguments passed by the user. For example, running +`inject_faults.sh forget_locks 1` as root will instruct the server to forget +one lock. Running `inject_faults forget_locks` will instruct the server to +forgetall locks. diff --git a/Documentation/filesystems/nfs/idmapper.txt b/Documentation/filesystems/nfs/idmapper.txt index 120fd3cf7fd9..fe03d10bb79a 100644 --- a/Documentation/filesystems/nfs/idmapper.txt +++ b/Documentation/filesystems/nfs/idmapper.txt @@ -4,13 +4,21 @@ ID Mapper ========= Id mapper is used by NFS to translate user and group ids into names, and to translate user and group names into ids. Part of this translation involves -performing an upcall to userspace to request the information. Id mapper will -user request-key to perform this upcall and cache the result. The program -/usr/sbin/nfs.idmap should be called by request-key, and will perform the -translation and initialize a key with the resulting information. +performing an upcall to userspace to request the information. There are two +ways NFS could obtain this information: placing a call to /sbin/request-key +or by placing a call to the rpc.idmap daemon. + +NFS will attempt to call /sbin/request-key first. If this succeeds, the +result will be cached using the generic request-key cache. This call should +only fail if /etc/request-key.conf is not configured for the id_resolver key +type, see the "Configuring" section below if you wish to use the request-key +method. + +If the call to /sbin/request-key fails (if /etc/request-key.conf is not +configured with the id_resolver key type), then the idmapper will ask the +legacy rpc.idmap daemon for the id mapping. This result will be stored +in a custom NFS idmap cache. - NFS_USE_NEW_IDMAPPER must be selected when configuring the kernel to use this - feature. =========== Configuring diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt index 983e14abe7e9..c7919c6e3bea 100644 --- a/Documentation/filesystems/nfs/pnfs.txt +++ b/Documentation/filesystems/nfs/pnfs.txt @@ -53,3 +53,57 @@ lseg maintains an extra reference corresponding to the NFS_LSEG_VALID bit which holds it in the pnfs_layout_hdr's list. When the final lseg is removed from the pnfs_layout_hdr's list, the NFS_LAYOUT_DESTROYED bit is set, preventing any new lsegs from being added. + +layout drivers +-------------- + +PNFS utilizes what is called layout drivers. The STD defines 3 basic +layout types: "files" "objects" and "blocks". For each of these types +there is a layout-driver with a common function-vectors table which +are called by the nfs-client pnfs-core to implement the different layout +types. + +Files-layout-driver code is in: fs/nfs/nfs4filelayout.c && nfs4filelayoutdev.c +Objects-layout-deriver code is in: fs/nfs/objlayout/.. directory +Blocks-layout-deriver code is in: fs/nfs/blocklayout/.. directory + +objects-layout setup +-------------------- + +As part of the full STD implementation the objlayoutdriver.ko needs, at times, +to automatically login to yet undiscovered iscsi/osd devices. For this the +driver makes up-calles to a user-mode script called *osd_login* + +The path_name of the script to use is by default: + /sbin/osd_login. +This name can be overridden by the Kernel module parameter: + objlayoutdriver.osd_login_prog + +If Kernel does not find the osd_login_prog path it will zero it out +and will not attempt farther logins. An admin can then write new value +to the objlayoutdriver.osd_login_prog Kernel parameter to re-enable it. + +The /sbin/osd_login is part of the nfs-utils package, and should usually +be installed on distributions that support this Kernel version. + +The API to the login script is as follows: + Usage: $0 -u <URI> -o <OSDNAME> -s <SYSTEMID> + Options: + -u target uri e.g. iscsi://<ip>:<port> + (allways exists) + (More protocols can be defined in the future. + The client does not interpret this string it is + passed unchanged as recieved from the Server) + -o osdname of the requested target OSD + (Might be empty) + (A string which denotes the OSD name, there is a + limit of 64 chars on this string) + -s systemid of the requested target OSD + (Might be empty) + (This string, if not empty is always an hex + representation of the 20 bytes osd_system_id) + +blocks-layout setup +------------------- + +TODO: Document the setup needs of the blocks layout driver diff --git a/Documentation/filesystems/pohmelfs/network_protocol.txt b/Documentation/filesystems/pohmelfs/network_protocol.txt index 65e03dd44823..c680b4b5353d 100644 --- a/Documentation/filesystems/pohmelfs/network_protocol.txt +++ b/Documentation/filesystems/pohmelfs/network_protocol.txt @@ -20,7 +20,7 @@ Commands can be embedded into transaction command (which in turn has own command so one can extend protocol as needed without breaking backward compatibility as long as old commands are supported. All string lengths include tail 0 byte. -All commands are transferred over the network in big-endian. CPU endianess is used at the end peers. +All commands are transferred over the network in big-endian. CPU endianness is used at the end peers. @cmd - command number, which specifies command to be processed. Following commands are used currently: diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index b4a3d765ff9a..74acd9618819 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -429,3 +429,9 @@ filemap_write_and_wait_range() so that all dirty pages are synced out properly. You must also keep in mind that ->fsync() is not called with i_mutex held anymore, so if you require i_mutex locking you must make sure to take it and release it yourself. + +-- +[mandatory] + d_alloc_root() is gone, along with a lot of bugs caused by code +misusing it. Replacement: d_make_root(inode). The difference is, +d_make_root() drops the reference to inode if dentry allocation fails. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0ec91f03422e..b7413cb46dcb 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -41,6 +41,8 @@ Table of Contents 3.5 /proc/<pid>/mountinfo - Information about mounts 3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm + 4 Configuring procfs + 4.1 Mount options ------------------------------------------------------------------------------ Preface @@ -288,7 +290,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7) rsslim current limit in bytes on the rss start_code address above which program text can run end_code address below which program text can run - start_stack address of the start of the stack + start_stack address of the start of the main process stack esp current value of ESP eip current value of EIP pending bitmap of pending signals @@ -305,6 +307,9 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7) blkio_ticks time spent waiting for block IO gtime guest time of the task in jiffies cgtime guest time of the task children in jiffies + start_data address above which program data+bss is placed + end_data address below which program data+bss is placed + start_brk address above which program heap can be expanded with brk() .............................................................................. The /proc/PID/maps file containing the currently mapped memory regions and @@ -320,7 +325,7 @@ address perms offset dev inode pathname a7cb1000-a7cb2000 ---p 00000000 00:00 0 a7cb2000-a7eb2000 rw-p 00000000 00:00 0 a7eb2000-a7eb3000 ---p 00000000 00:00 0 -a7eb3000-a7ed5000 rw-p 00000000 00:00 0 +a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack:1001] a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6 a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6 @@ -352,11 +357,39 @@ is not associated with a file: [heap] = the heap of the program [stack] = the stack of the main process + [stack:1001] = the stack of the thread with tid 1001 [vdso] = the "virtual dynamic shared object", the kernel system call handler or if empty, the mapping is anonymous. +The /proc/PID/task/TID/maps is a view of the virtual memory from the viewpoint +of the individual tasks of a process. In this file you will see a mapping marked +as [stack] if that task sees it as a stack. This is a key difference from the +content of /proc/PID/maps, where you will see all mappings that are being used +as stack by all of those tasks. Hence, for the example above, the task-level +map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this: + +08048000-08049000 r-xp 00000000 03:00 8312 /opt/test +08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test +0804a000-0806b000 rw-p 00000000 00:00 0 [heap] +a7cb1000-a7cb2000 ---p 00000000 00:00 0 +a7cb2000-a7eb2000 rw-p 00000000 00:00 0 +a7eb2000-a7eb3000 ---p 00000000 00:00 0 +a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack] +a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 +a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6 +a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6 +a800b000-a800e000 rw-p 00000000 00:00 0 +a800e000-a8022000 r-xp 00000000 03:00 14462 /lib/libpthread.so.0 +a8022000-a8023000 r--p 00013000 03:00 14462 /lib/libpthread.so.0 +a8023000-a8024000 rw-p 00014000 03:00 14462 /lib/libpthread.so.0 +a8024000-a8027000 rw-p 00000000 00:00 0 +a8027000-a8043000 r-xp 00000000 03:00 8317 /lib/ld-linux.so.2 +a8043000-a8044000 r--p 0001b000 03:00 8317 /lib/ld-linux.so.2 +a8044000-a8045000 rw-p 0001c000 03:00 8317 /lib/ld-linux.so.2 +aff35000-aff4a000 rw-p 00000000 00:00 0 +ffffe000-fffff000 r-xp 00000000 00:00 0 [vdso] The /proc/PID/smaps is an extension based on maps, showing the memory consumption for each of the process's mappings. For each of mappings there @@ -1542,3 +1575,40 @@ a task to set its own or one of its thread siblings comm value. The comm value is limited in size compared to the cmdline value, so writing anything longer then the kernel's TASK_COMM_LEN (currently 16 chars) will result in a truncated comm value. + + +------------------------------------------------------------------------------ +Configuring procfs +------------------------------------------------------------------------------ + +4.1 Mount options +--------------------- + +The following mount options are supported: + + hidepid= Set /proc/<pid>/ access mode. + gid= Set the group authorized to learn processes information. + +hidepid=0 means classic mode - everybody may access all /proc/<pid>/ directories +(default). + +hidepid=1 means users may not access any /proc/<pid>/ directories but their +own. Sensitive files like cmdline, sched*, status are now protected against +other users. This makes it impossible to learn whether any user runs +specific program (given the program doesn't reveal itself by its behaviour). +As an additional bonus, as /proc/<pid>/cmdline is unaccessible for other users, +poorly written programs passing sensitive information via program arguments are +now protected against local eavesdroppers. + +hidepid=2 means hidepid=1 plus all /proc/<pid>/ will be fully invisible to other +users. It doesn't mean that it hides a fact whether a process with a specific +pid value exists (it can be learned by other means, e.g. by "kill -0 $PID"), +but it hides process' uid and gid, which may be learned by stat()'ing +/proc/<pid>/ otherwise. It greatly complicates an intruder's task of gathering +information about running processes, whether some daemon runs with elevated +privileges, whether other user runs some sensitive program, whether other users +run any program at all, etc. + +gid= defines a group authorized to learn processes information otherwise +prohibited by hidepid=. If you use some daemon like identd which needs to learn +information about processes information, just add identd to this group. diff --git a/Documentation/filesystems/qnx6.txt b/Documentation/filesystems/qnx6.txt new file mode 100644 index 000000000000..050223ea03c7 --- /dev/null +++ b/Documentation/filesystems/qnx6.txt @@ -0,0 +1,174 @@ +The QNX6 Filesystem +=================== + +The qnx6fs is used by newer QNX operating system versions. (e.g. Neutrino) +It got introduced in QNX 6.4.0 and is used default since 6.4.1. + +Option +====== + +mmi_fs Mount filesystem as used for example by Audi MMI 3G system + +Specification +============= + +qnx6fs shares many properties with traditional Unix filesystems. It has the +concepts of blocks, inodes and directories. +On QNX it is possible to create little endian and big endian qnx6 filesystems. +This feature makes it possible to create and use a different endianness fs +for the target (QNX is used on quite a range of embedded systems) plattform +running on a different endianess. +The Linux driver handles endianness transparently. (LE and BE) + +Blocks +------ + +The space in the device or file is split up into blocks. These are a fixed +size of 512, 1024, 2048 or 4096, which is decided when the filesystem is +created. +Blockpointers are 32bit, so the maximum space that can be adressed is +2^32 * 4096 bytes or 16TB + +The superblocks +--------------- + +The superblock contains all global information about the filesystem. +Each qnx6fs got two superblocks, each one having a 64bit serial number. +That serial number is used to identify the "active" superblock. +In write mode with reach new snapshot (after each synchronous write), the +serial of the new master superblock is increased (old superblock serial + 1) + +So basically the snapshot functionality is realized by an atomic final +update of the serial number. Before updating that serial, all modifications +are done by copying all modified blocks during that specific write request +(or period) and building up a new (stable) filesystem structure under the +inactive superblock. + +Each superblock holds a set of root inodes for the different filesystem +parts. (Inode, Bitmap and Longfilenames) +Each of these root nodes holds information like total size of the stored +data and the adressing levels in that specific tree. +If the level value is 0, up to 16 direct blocks can be adressed by each +node. +Level 1 adds an additional indirect adressing level where each indirect +adressing block holds up to blocksize / 4 bytes pointers to data blocks. +Level 2 adds an additional indirect adressig block level (so, already up +to 16 * 256 * 256 = 1048576 blocks that can be adressed by such a tree)a + +Unused block pointers are always set to ~0 - regardless of root node, +indirect adressing blocks or inodes. +Data leaves are always on the lowest level. So no data is stored on upper +tree levels. + +The first Superblock is located at 0x2000. (0x2000 is the bootblock size) +The Audi MMI 3G first superblock directly starts at byte 0. +Second superblock position can either be calculated from the superblock +information (total number of filesystem blocks) or by taking the highest +device address, zeroing the last 3 bytes and then substracting 0x1000 from +that address. + +0x1000 is the size reserved for each superblock - regardless of the +blocksize of the filesystem. + +Inodes +------ + +Each object in the filesystem is represented by an inode. (index node) +The inode structure contains pointers to the filesystem blocks which contain +the data held in the object and all of the metadata about an object except +its longname. (filenames longer than 27 characters) +The metadata about an object includes the permissions, owner, group, flags, +size, number of blocks used, access time, change time and modification time. + +Object mode field is POSIX format. (which makes things easier) + +There are also pointers to the first 16 blocks, if the object data can be +adressed with 16 direct blocks. +For more than 16 blocks an indirect adressing in form of another tree is +used. (scheme is the same as the one used for the superblock root nodes) + +The filesize is stored 64bit. Inode counting starts with 1. (whilst long +filename inodes start with 0) + +Directories +----------- + +A directory is a filesystem object and has an inode just like a file. +It is a specially formatted file containing records which associate each +name with an inode number. +'.' inode number points to the directory inode +'..' inode number points to the parent directory inode +Eeach filename record additionally got a filename length field. + +One special case are long filenames or subdirectory names. +These got set a filename length field of 0xff in the corresponding directory +record plus the longfile inode number also stored in that record. +With that longfilename inode number, the longfilename tree can be walked +starting with the superblock longfilename root node pointers. + +Special files +------------- + +Symbolic links are also filesystem objects with inodes. They got a specific +bit in the inode mode field identifying them as symbolic link. +The directory entry file inode pointer points to the target file inode. + +Hard links got an inode, a directory entry, but a specific mode bit set, +no block pointers and the directory file record pointing to the target file +inode. + +Character and block special devices do not exist in QNX as those files +are handled by the QNX kernel/drivers and created in /dev independant of the +underlaying filesystem. + +Long filenames +-------------- + +Long filenames are stored in a seperate adressing tree. The staring point +is the longfilename root node in the active superblock. +Each data block (tree leaves) holds one long filename. That filename is +limited to 510 bytes. The first two starting bytes are used as length field +for the actual filename. +If that structure shall fit for all allowed blocksizes, it is clear why there +is a limit of 510 bytes for the actual filename stored. + +Bitmap +------ + +The qnx6fs filesystem allocation bitmap is stored in a tree under bitmap +root node in the superblock and each bit in the bitmap represents one +filesystem block. +The first block is block 0, which starts 0x1000 after superblock start. +So for a normal qnx6fs 0x3000 (bootblock + superblock) is the physical +address at which block 0 is located. + +Bits at the end of the last bitmap block are set to 1, if the device is +smaller than addressing space in the bitmap. + +Bitmap system area +------------------ + +The bitmap itself is devided into three parts. +First the system area, that is split into two halfs. +Then userspace. + +The requirement for a static, fixed preallocated system area comes from how +qnx6fs deals with writes. +Each superblock got it's own half of the system area. So superblock #1 +always uses blocks from the lower half whilst superblock #2 just writes to +blocks represented by the upper half bitmap system area bits. + +Bitmap blocks, Inode blocks and indirect addressing blocks for those two +tree structures are treated as system blocks. + +The rational behind that is that a write request can work on a new snapshot +(system area of the inactive - resp. lower serial numbered superblock) while +at the same time there is still a complete stable filesystem structer in the +other half of the system area. + +When finished with writing (a sync write is completed, the maximum sync leap +time or a filesystem sync is requested), serial of the previously inactive +superblock atomically is increased and the fs switches over to that - then +stable declared - superblock. + +For all data outside the system area, blocks are just copied while writing. diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt index a8273d5fad20..59b4a0962e0f 100644 --- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt +++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt @@ -297,7 +297,7 @@ the above threads) is: either way about the archive format, and there are alternative tools, such as: - http://freshmeat.net/projects/afio/ + http://freecode.com/projects/afio 2) The cpio archive format chosen by the kernel is simpler and cleaner (and thus easier to create and parse) than any of the (literally dozens of) diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt index 7db3ebda5a4c..403c090aca39 100644 --- a/Documentation/filesystems/squashfs.txt +++ b/Documentation/filesystems/squashfs.txt @@ -93,8 +93,8 @@ byte alignment: Compressed data blocks are written to the filesystem as files are read from the source directory, and checked for duplicates. Once all file data has been -written the completed inode, directory, fragment, export and uid/gid lookup -tables are written. +written the completed inode, directory, fragment, export, uid/gid lookup and +xattr tables are written. 3.1 Compression options ----------------------- @@ -151,7 +151,7 @@ in each metadata block. Directories are sorted in alphabetical order, and at lookup the index is scanned linearly looking for the first filename alphabetically larger than the filename being looked up. At this point the location of the metadata block the filename is in has been found. -The general idea of the index is ensure only one metadata block needs to be +The general idea of the index is to ensure only one metadata block needs to be decompressed to do a lookup irrespective of the length of the directory. This scheme has the advantage that it doesn't require extra memory overhead and doesn't require much extra storage on disk. diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index 07235caec22c..a6619b7064b9 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt @@ -70,7 +70,7 @@ An attribute definition is simply: struct attribute { char * name; struct module *owner; - mode_t mode; + umode_t mode; }; diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 43cbd0821721..e916e3d36488 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -225,7 +225,7 @@ struct super_operations { void (*clear_inode) (struct inode *); void (*umount_begin) (struct super_block *); - int (*show_options)(struct seq_file *, struct vfsmount *); + int (*show_options)(struct seq_file *, struct dentry *); ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); @@ -341,14 +341,14 @@ This describes how the VFS can manipulate an inode in your filesystem. As of kernel 2.6.22, the following members are defined: struct inode_operations { - int (*create) (struct inode *,struct dentry *,int, struct nameidata *); + int (*create) (struct inode *,struct dentry *, umode_t, struct nameidata *); struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct inode *,struct dentry *,const char *); - int (*mkdir) (struct inode *,struct dentry *,int); + int (*mkdir) (struct inode *,struct dentry *,umode_t); int (*rmdir) (struct inode *,struct dentry *); - int (*mknod) (struct inode *,struct dentry *,int,dev_t); + int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); int (*readlink) (struct dentry *, char __user *,int); @@ -993,7 +993,7 @@ struct dentry_operations { If the 'rcu_walk' parameter is true, then the caller is doing a pathwalk in RCU-walk mode. Sleeping is not permitted in this mode, - and the caller can be asked to leave it and call again by returing + and the caller can be asked to leave it and call again by returning -ECHILD. This function is only used if DCACHE_MANAGE_TRANSIT is set on the |