diff options
Diffstat (limited to 'Documentation')
40 files changed, 1819 insertions, 423 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss index f5bb0a3bb8c0..53d99edd1d75 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss +++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss @@ -71,3 +71,10 @@ Description: Value of 1 indicates the controller can honor the reset_devices a dump device, as kdump requires resetting the device in order to work reliably. +Where: /sys/bus/pci/devices/<dev>/ccissX/transport_mode +Date: July 2011 +Kernel Version: 3.0 +Contact: iss_storagedev@hp.com +Description: Value of "simple" indicates that the controller has been placed + in "simple mode". Value of "performant" indicates that the + controller has been placed in "performant mode". diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle index c940239d9678..2b90d328b3ba 100644 --- a/Documentation/CodingStyle +++ b/Documentation/CodingStyle @@ -166,8 +166,8 @@ if (condition) else do_that(); -This does not apply if one branch of a conditional statement is a single -statement. Use braces in both branches. +This does not apply if only one branch of a conditional statement is a single +statement; in the latter case use braces in both branches: if (condition) { do_this(); diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index fe2326906610..66bd97a95f10 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -50,6 +50,13 @@ specify the GFP_ flags (see kmalloc) for the allocation (the implementation may choose to ignore flags that affect the location of the returned memory, like GFP_DMA). +void * +dma_zalloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) + +Wraps dma_alloc_coherent() and also zeroes the returned memory if the +allocation attempt succeeded. + void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle) diff --git a/Documentation/DocBook/media/v4l/compat.xml b/Documentation/DocBook/media/v4l/compat.xml index 91410b6e7e08..b68698f96e7f 100644 --- a/Documentation/DocBook/media/v4l/compat.xml +++ b/Documentation/DocBook/media/v4l/compat.xml @@ -2486,6 +2486,9 @@ ioctls.</para> <listitem> <para>Flash API. <xref linkend="flash-controls" /></para> </listitem> + <listitem> + <para>&VIDIOC-CREATE-BUFS; and &VIDIOC-PREPARE-BUF; ioctls.</para> + </listitem> </itemizedlist> </section> diff --git a/Documentation/DocBook/media/v4l/controls.xml b/Documentation/DocBook/media/v4l/controls.xml index 23fdf79f8cf3..3bc5ee8b2c74 100644 --- a/Documentation/DocBook/media/v4l/controls.xml +++ b/Documentation/DocBook/media/v4l/controls.xml @@ -232,8 +232,9 @@ control is deprecated. New drivers and applications should use the <entry>Enables a power line frequency filter to avoid flicker. Possible values for <constant>enum v4l2_power_line_frequency</constant> are: <constant>V4L2_CID_POWER_LINE_FREQUENCY_DISABLED</constant> (0), -<constant>V4L2_CID_POWER_LINE_FREQUENCY_50HZ</constant> (1) and -<constant>V4L2_CID_POWER_LINE_FREQUENCY_60HZ</constant> (2).</entry> +<constant>V4L2_CID_POWER_LINE_FREQUENCY_50HZ</constant> (1), +<constant>V4L2_CID_POWER_LINE_FREQUENCY_60HZ</constant> (2) and +<constant>V4L2_CID_POWER_LINE_FREQUENCY_AUTO</constant> (3).</entry> </row> <row> <entry><constant>V4L2_CID_HUE_AUTO</constant></entry> diff --git a/Documentation/DocBook/media/v4l/io.xml b/Documentation/DocBook/media/v4l/io.xml index c57d1ec6291c..3f47df1aa54a 100644 --- a/Documentation/DocBook/media/v4l/io.xml +++ b/Documentation/DocBook/media/v4l/io.xml @@ -927,6 +927,33 @@ ioctl is called.</entry> Applications set or clear this flag before calling the <constant>VIDIOC_QBUF</constant> ioctl.</entry> </row> + <row> + <entry><constant>V4L2_BUF_FLAG_PREPARED</constant></entry> + <entry>0x0400</entry> + <entry>The buffer has been prepared for I/O and can be queued by the +application. Drivers set or clear this flag when the +<link linkend="vidioc-querybuf">VIDIOC_QUERYBUF</link>, <link + linkend="vidioc-qbuf">VIDIOC_PREPARE_BUF</link>, <link + linkend="vidioc-qbuf">VIDIOC_QBUF</link> or <link + linkend="vidioc-qbuf">VIDIOC_DQBUF</link> ioctl is called.</entry> + </row> + <row> + <entry><constant>V4L2_BUF_FLAG_NO_CACHE_INVALIDATE</constant></entry> + <entry>0x0400</entry> + <entry>Caches do not have to be invalidated for this buffer. +Typically applications shall use this flag if the data captured in the buffer +is not going to be touched by the CPU, instead the buffer will, probably, be +passed on to a DMA-capable hardware unit for further processing or output. +</entry> + </row> + <row> + <entry><constant>V4L2_BUF_FLAG_NO_CACHE_CLEAN</constant></entry> + <entry>0x0800</entry> + <entry>Caches do not have to be cleaned for this buffer. +Typically applications shall use this flag for output buffers if the data +in this buffer has not been created by the CPU but by some DMA-capable unit, +in which case caches have not been used.</entry> + </row> </tbody> </tgroup> </table> diff --git a/Documentation/DocBook/media/v4l/v4l2.xml b/Documentation/DocBook/media/v4l/v4l2.xml index 40132c277647..2ab365c10fb9 100644 --- a/Documentation/DocBook/media/v4l/v4l2.xml +++ b/Documentation/DocBook/media/v4l/v4l2.xml @@ -469,6 +469,7 @@ and discussions on the V4L mailing list.</revremark> &sub-close; &sub-ioctl; <!-- All ioctls go here. --> + &sub-create-bufs; &sub-cropcap; &sub-dbg-g-chip-ident; &sub-dbg-g-register; @@ -511,6 +512,7 @@ and discussions on the V4L mailing list.</revremark> &sub-queryctrl; &sub-query-dv-preset; &sub-querystd; + &sub-prepare-buf; &sub-reqbufs; &sub-s-hw-freq-seek; &sub-streamon; diff --git a/Documentation/DocBook/media/v4l/vidioc-create-bufs.xml b/Documentation/DocBook/media/v4l/vidioc-create-bufs.xml new file mode 100644 index 000000000000..73ae8a6cd004 --- /dev/null +++ b/Documentation/DocBook/media/v4l/vidioc-create-bufs.xml @@ -0,0 +1,139 @@ +<refentry id="vidioc-create-bufs"> + <refmeta> + <refentrytitle>ioctl VIDIOC_CREATE_BUFS</refentrytitle> + &manvol; + </refmeta> + + <refnamediv> + <refname>VIDIOC_CREATE_BUFS</refname> + <refpurpose>Create buffers for Memory Mapped or User Pointer I/O</refpurpose> + </refnamediv> + + <refsynopsisdiv> + <funcsynopsis> + <funcprototype> + <funcdef>int <function>ioctl</function></funcdef> + <paramdef>int <parameter>fd</parameter></paramdef> + <paramdef>int <parameter>request</parameter></paramdef> + <paramdef>struct v4l2_create_buffers *<parameter>argp</parameter></paramdef> + </funcprototype> + </funcsynopsis> + </refsynopsisdiv> + + <refsect1> + <title>Arguments</title> + + <variablelist> + <varlistentry> + <term><parameter>fd</parameter></term> + <listitem> + <para>&fd;</para> + </listitem> + </varlistentry> + <varlistentry> + <term><parameter>request</parameter></term> + <listitem> + <para>VIDIOC_CREATE_BUFS</para> + </listitem> + </varlistentry> + <varlistentry> + <term><parameter>argp</parameter></term> + <listitem> + <para></para> + </listitem> + </varlistentry> + </variablelist> + </refsect1> + + <refsect1> + <title>Description</title> + + <para>This ioctl is used to create buffers for <link linkend="mmap">memory +mapped</link> or <link linkend="userp">user pointer</link> +I/O. It can be used as an alternative or in addition to the +<constant>VIDIOC_REQBUFS</constant> ioctl, when a tighter control over buffers +is required. This ioctl can be called multiple times to create buffers of +different sizes.</para> + + <para>To allocate device buffers applications initialize relevant fields of +the <structname>v4l2_create_buffers</structname> structure. They set the +<structfield>type</structfield> field in the +<structname>v4l2_format</structname> structure, embedded in this +structure, to the respective stream or buffer type. +<structfield>count</structfield> must be set to the number of required buffers. +<structfield>memory</structfield> specifies the required I/O method. The +<structfield>format</structfield> field shall typically be filled in using +either the <constant>VIDIOC_TRY_FMT</constant> or +<constant>VIDIOC_G_FMT</constant> ioctl(). Additionally, applications can adjust +<structfield>sizeimage</structfield> fields to fit their specific needs. The +<structfield>reserved</structfield> array must be zeroed.</para> + + <para>When the ioctl is called with a pointer to this structure the driver +will attempt to allocate up to the requested number of buffers and store the +actual number allocated and the starting index in the +<structfield>count</structfield> and the <structfield>index</structfield> fields +respectively. On return <structfield>count</structfield> can be smaller than +the number requested. The driver may also increase buffer sizes if required, +however, it will not update <structfield>sizeimage</structfield> field values. +The user has to use <constant>VIDIOC_QUERYBUF</constant> to retrieve that +information.</para> + + <table pgwide="1" frame="none" id="v4l2-create-buffers"> + <title>struct <structname>v4l2_create_buffers</structname></title> + <tgroup cols="3"> + &cs-str; + <tbody valign="top"> + <row> + <entry>__u32</entry> + <entry><structfield>index</structfield></entry> + <entry>The starting buffer index, returned by the driver.</entry> + </row> + <row> + <entry>__u32</entry> + <entry><structfield>count</structfield></entry> + <entry>The number of buffers requested or granted.</entry> + </row> + <row> + <entry>&v4l2-memory;</entry> + <entry><structfield>memory</structfield></entry> + <entry>Applications set this field to +<constant>V4L2_MEMORY_MMAP</constant> or +<constant>V4L2_MEMORY_USERPTR</constant>.</entry> + </row> + <row> + <entry>&v4l2-format;</entry> + <entry><structfield>format</structfield></entry> + <entry>Filled in by the application, preserved by the driver.</entry> + </row> + <row> + <entry>__u32</entry> + <entry><structfield>reserved</structfield>[8]</entry> + <entry>A place holder for future extensions.</entry> + </row> + </tbody> + </tgroup> + </table> + </refsect1> + + <refsect1> + &return-value; + + <variablelist> + <varlistentry> + <term><errorcode>ENOMEM</errorcode></term> + <listitem> + <para>No memory to allocate buffers for <link linkend="mmap">memory +mapped</link> I/O.</para> + </listitem> + </varlistentry> + <varlistentry> + <term><errorcode>EINVAL</errorcode></term> + <listitem> + <para>The buffer type (<structfield>type</structfield> field) or the +requested I/O method (<structfield>memory</structfield>) is not +supported.</para> + </listitem> + </varlistentry> + </variablelist> + </refsect1> +</refentry> diff --git a/Documentation/DocBook/media/v4l/vidioc-prepare-buf.xml b/Documentation/DocBook/media/v4l/vidioc-prepare-buf.xml new file mode 100644 index 000000000000..7bde698760e4 --- /dev/null +++ b/Documentation/DocBook/media/v4l/vidioc-prepare-buf.xml @@ -0,0 +1,88 @@ +<refentry id="vidioc-prepare-buf"> + <refmeta> + <refentrytitle>ioctl VIDIOC_PREPARE_BUF</refentrytitle> + &manvol; + </refmeta> + + <refnamediv> + <refname>VIDIOC_PREPARE_BUF</refname> + <refpurpose>Prepare a buffer for I/O</refpurpose> + </refnamediv> + + <refsynopsisdiv> + <funcsynopsis> + <funcprototype> + <funcdef>int <function>ioctl</function></funcdef> + <paramdef>int <parameter>fd</parameter></paramdef> + <paramdef>int <parameter>request</parameter></paramdef> + <paramdef>struct v4l2_buffer *<parameter>argp</parameter></paramdef> + </funcprototype> + </funcsynopsis> + </refsynopsisdiv> + + <refsect1> + <title>Arguments</title> + + <variablelist> + <varlistentry> + <term><parameter>fd</parameter></term> + <listitem> + <para>&fd;</para> + </listitem> + </varlistentry> + <varlistentry> + <term><parameter>request</parameter></term> + <listitem> + <para>VIDIOC_PREPARE_BUF</para> + </listitem> + </varlistentry> + <varlistentry> + <term><parameter>argp</parameter></term> + <listitem> + <para></para> + </listitem> + </varlistentry> + </variablelist> + </refsect1> + + <refsect1> + <title>Description</title> + + <para>Applications can optionally call the +<constant>VIDIOC_PREPARE_BUF</constant> ioctl to pass ownership of the buffer +to the driver before actually enqueuing it, using the +<constant>VIDIOC_QBUF</constant> ioctl, and to prepare it for future I/O. +Such preparations may include cache invalidation or cleaning. Performing them +in advance saves time during the actual I/O. In case such cache operations are +not required, the application can use one of +<constant>V4L2_BUF_FLAG_NO_CACHE_INVALIDATE</constant> and +<constant>V4L2_BUF_FLAG_NO_CACHE_CLEAN</constant> flags to skip the respective +step.</para> + + <para>The <structname>v4l2_buffer</structname> structure is +specified in <xref linkend="buffer" />.</para> + </refsect1> + + <refsect1> + &return-value; + + <variablelist> + <varlistentry> + <term><errorcode>EBUSY</errorcode></term> + <listitem> + <para>File I/O is in progress.</para> + </listitem> + </varlistentry> + <varlistentry> + <term><errorcode>EINVAL</errorcode></term> + <listitem> + <para>The buffer <structfield>type</structfield> is not +supported, or the <structfield>index</structfield> is out of bounds, +or no buffers have been allocated yet, or the +<structfield>userptr</structfield> or +<structfield>length</structfield> are invalid.</para> + </listitem> + </varlistentry> + </variablelist> + </refsect1> +</refentry> diff --git a/Documentation/block/switching-sched.txt b/Documentation/block/switching-sched.txt index 71cfbdc0f74d..3b2612e342f1 100644 --- a/Documentation/block/switching-sched.txt +++ b/Documentation/block/switching-sched.txt @@ -1,6 +1,6 @@ To choose IO schedulers at boot time, use the argument 'elevator=deadline'. -'noop', 'as' and 'cfq' (the default) are also available. IO schedulers are -assigned globally at boot time only presently. +'noop' and 'cfq' (the default) are also available. IO schedulers are assigned +globally at boot time only presently. Each io queue has a set of io scheduler tunables associated with it. These tunables control how the io scheduler works. You can find these entries diff --git a/Documentation/blockdev/cciss.txt b/Documentation/blockdev/cciss.txt index c00c6a5ab21f..71464e09ec18 100644 --- a/Documentation/blockdev/cciss.txt +++ b/Documentation/blockdev/cciss.txt @@ -78,6 +78,16 @@ The device naming scheme is: /dev/cciss/c1d1p2 Controller 1, disk 1, partition 2 /dev/cciss/c1d1p3 Controller 1, disk 1, partition 3 +CCISS simple mode support +------------------------- + +The "cciss_simple_mode=1" boot parameter may be used to prevent the driver +from putting the controller into "performant" mode. The difference is that +with simple mode, each command completion requires an interrupt, while with +"performant mode" (the default, and ordinarily better performing) it is +possible to have multiple command completions indicated by a single +interrupt. + SCSI tape drive and medium changer support ------------------------------------------ diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index cd67e90003c0..9c452ef2328c 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -454,8 +454,8 @@ mounted hierarchy, to remove a task from its current cgroup you must move it into a new cgroup (possibly the root cgroup) by writing to the new cgroup's tasks file. -Note: If the ns cgroup is active, moving a process to another cgroup can -fail. +Note: Due to some restrictions enforced by some cgroup subsystems, moving +a process to another cgroup can fail. 2.3 Mounting hierarchies by name -------------------------------- diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 06eb6d957c83..cc0ebc5241b3 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -418,7 +418,6 @@ total_unevictable - sum of all children's "unevictable" # The following additional stats are dependent on CONFIG_DEBUG_VM. -inactive_ratio - VM internal parameter. (see mm/page_alloc.c) recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) recent_rotated_file - VM internal parameter. (see mm/vmscan.c) recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) diff --git a/Documentation/device-mapper/dm-log.txt b/Documentation/device-mapper/dm-log.txt index 994dd75475a6..c155ac569c44 100644 --- a/Documentation/device-mapper/dm-log.txt +++ b/Documentation/device-mapper/dm-log.txt @@ -48,7 +48,7 @@ kernel and userspace, 'connector' is used as the interface for communication. There are currently two userspace log implementations that leverage this -framework - "clustered_disk" and "clustered_core". These implementations +framework - "clustered-disk" and "clustered-core". These implementations provide a cluster-coherent log for shared-storage. Device-mapper mirroring can be used in a shared-storage environment when the cluster log implementations are employed. diff --git a/Documentation/device-mapper/persistent-data.txt b/Documentation/device-mapper/persistent-data.txt new file mode 100644 index 000000000000..0e5df9b04ad2 --- /dev/null +++ b/Documentation/device-mapper/persistent-data.txt @@ -0,0 +1,84 @@ +Introduction +============ + +The more-sophisticated device-mapper targets require complex metadata +that is managed in kernel. In late 2010 we were seeing that various +different targets were rolling their own data strutures, for example: + +- Mikulas Patocka's multisnap implementation +- Heinz Mauelshagen's thin provisioning target +- Another btree-based caching target posted to dm-devel +- Another multi-snapshot target based on a design of Daniel Phillips + +Maintaining these data structures takes a lot of work, so if possible +we'd like to reduce the number. + +The persistent-data library is an attempt to provide a re-usable +framework for people who want to store metadata in device-mapper +targets. It's currently used by the thin-provisioning target and an +upcoming hierarchical storage target. + +Overview +======== + +The main documentation is in the header files which can all be found +under drivers/md/persistent-data. + +The block manager +----------------- + +dm-block-manager.[hc] + +This provides access to the data on disk in fixed sized-blocks. There +is a read/write locking interface to prevent concurrent accesses, and +keep data that is being used in the cache. + +Clients of persistent-data are unlikely to use this directly. + +The transaction manager +----------------------- + +dm-transaction-manager.[hc] + +This restricts access to blocks and enforces copy-on-write semantics. +The only way you can get hold of a writable block through the +transaction manager is by shadowing an existing block (ie. doing +copy-on-write) or allocating a fresh one. Shadowing is elided within +the same transaction so performance is reasonable. The commit method +ensures that all data is flushed before it writes the superblock. +On power failure your metadata will be as it was when last committed. + +The Space Maps +-------------- + +dm-space-map.h +dm-space-map-metadata.[hc] +dm-space-map-disk.[hc] + +On-disk data structures that keep track of reference counts of blocks. +Also acts as the allocator of new blocks. Currently two +implementations: a simpler one for managing blocks on a different +device (eg. thinly-provisioned data blocks); and one for managing +the metadata space. The latter is complicated by the need to store +its own data within the space it's managing. + +The data structures +------------------- + +dm-btree.[hc] +dm-btree-remove.c +dm-btree-spine.c +dm-btree-internal.h + +Currently there is only one data structure, a hierarchical btree. +There are plans to add more. For example, something with an +array-like interface would see a lot of use. + +The btree is 'hierarchical' in that you can define it to be composed +of nested btrees, and take multiple keys. For example, the +thin-provisioning target uses a btree with two levels of nesting. +The first maps a device id to a mapping tree, and that in turn maps a +virtual block to a physical block. + +Values stored in the btrees can have arbitrary size. Keys are always +64bits, although nesting allows you to use multiple keys. diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt new file mode 100644 index 000000000000..801d9d1cf82b --- /dev/null +++ b/Documentation/device-mapper/thin-provisioning.txt @@ -0,0 +1,285 @@ +Introduction +============ + +This document descibes a collection of device-mapper targets that +between them implement thin-provisioning and snapshots. + +The main highlight of this implementation, compared to the previous +implementation of snapshots, is that it allows many virtual devices to +be stored on the same data volume. This simplifies administration and +allows the sharing of data between volumes, thus reducing disk usage. + +Another significant feature is support for an arbitrary depth of +recursive snapshots (snapshots of snapshots of snapshots ...). The +previous implementation of snapshots did this by chaining together +lookup tables, and so performance was O(depth). This new +implementation uses a single data structure to avoid this degradation +with depth. Fragmentation may still be an issue, however, in some +scenarios. + +Metadata is stored on a separate device from data, giving the +administrator some freedom, for example to: + +- Improve metadata resilience by storing metadata on a mirrored volume + but data on a non-mirrored one. + +- Improve performance by storing the metadata on SSD. + +Status +====== + +These targets are very much still in the EXPERIMENTAL state. Please +do not yet rely on them in production. But do experiment and offer us +feedback. Different use cases will have different performance +characteristics, for example due to fragmentation of the data volume. + +If you find this software is not performing as expected please mail +dm-devel@redhat.com with details and we'll try our best to improve +things for you. + +Userspace tools for checking and repairing the metadata are under +development. + +Cookbook +======== + +This section describes some quick recipes for using thin provisioning. +They use the dmsetup program to control the device-mapper driver +directly. End users will be advised to use a higher-level volume +manager such as LVM2 once support has been added. + +Pool device +----------- + +The pool device ties together the metadata volume and the data volume. +It maps I/O linearly to the data volume and updates the metadata via +two mechanisms: + +- Function calls from the thin targets + +- Device-mapper 'messages' from userspace which control the creation of new + virtual devices amongst other things. + +Setting up a fresh pool device +------------------------------ + +Setting up a pool device requires a valid metadata device, and a +data device. If you do not have an existing metadata device you can +make one by zeroing the first 4k to indicate empty metadata. + + dd if=/dev/zero of=$metadata_dev bs=4096 count=1 + +The amount of metadata you need will vary according to how many blocks +are shared between thin devices (i.e. through snapshots). If you have +less sharing than average you'll need a larger-than-average metadata device. + +As a guide, we suggest you calculate the number of bytes to use in the +metadata device as 48 * $data_dev_size / $data_block_size but round it up +to 2MB if the answer is smaller. The largest size supported is 16GB. + +If you're creating large numbers of snapshots which are recording large +amounts of change, you may need find you need to increase this. + +Reloading a pool table +---------------------- + +You may reload a pool's table, indeed this is how the pool is resized +if it runs out of space. (N.B. While specifying a different metadata +device when reloading is not forbidden at the moment, things will go +wrong if it does not route I/O to exactly the same on-disk location as +previously.) + +Using an existing pool device +----------------------------- + + dmsetup create pool \ + --table "0 20971520 thin-pool $metadata_dev $data_dev \ + $data_block_size $low_water_mark" + +$data_block_size gives the smallest unit of disk space that can be +allocated at a time expressed in units of 512-byte sectors. People +primarily interested in thin provisioning may want to use a value such +as 1024 (512KB). People doing lots of snapshotting may want a smaller value +such as 128 (64KB). If you are not zeroing newly-allocated data, +a larger $data_block_size in the region of 256000 (128MB) is suggested. +$data_block_size must be the same for the lifetime of the +metadata device. + +$low_water_mark is expressed in blocks of size $data_block_size. If +free space on the data device drops below this level then a dm event +will be triggered which a userspace daemon should catch allowing it to +extend the pool device. Only one such event will be sent. +Resuming a device with a new table itself triggers an event so the +userspace daemon can use this to detect a situation where a new table +already exceeds the threshold. + +Thin provisioning +----------------- + +i) Creating a new thinly-provisioned volume. + + To create a new thinly- provisioned volume you must send a message to an + active pool device, /dev/mapper/pool in this example. + + dmsetup message /dev/mapper/pool 0 "create_thin 0" + + Here '0' is an identifier for the volume, a 24-bit number. It's up + to the caller to allocate and manage these identifiers. If the + identifier is already in use, the message will fail with -EEXIST. + +ii) Using a thinly-provisioned volume. + + Thinly-provisioned volumes are activated using the 'thin' target: + + dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0" + + The last parameter is the identifier for the thinp device. + +Internal snapshots +------------------ + +i) Creating an internal snapshot. + + Snapshots are created with another message to the pool. + + N.B. If the origin device that you wish to snapshot is active, you + must suspend it before creating the snapshot to avoid corruption. + This is NOT enforced at the moment, so please be careful! + + dmsetup suspend /dev/mapper/thin + dmsetup message /dev/mapper/pool 0 "create_snap 1 0" + dmsetup resume /dev/mapper/thin + + Here '1' is the identifier for the volume, a 24-bit number. '0' is the + identifier for the origin device. + +ii) Using an internal snapshot. + + Once created, the user doesn't have to worry about any connection + between the origin and the snapshot. Indeed the snapshot is no + different from any other thinly-provisioned device and can be + snapshotted itself via the same method. It's perfectly legal to + have only one of them active, and there's no ordering requirement on + activating or removing them both. (This differs from conventional + device-mapper snapshots.) + + Activate it exactly the same way as any other thinly-provisioned volume: + + dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1" + +Deactivation +------------ + +All devices using a pool must be deactivated before the pool itself +can be. + + dmsetup remove thin + dmsetup remove snap + dmsetup remove pool + +Reference +========= + +'thin-pool' target +------------------ + +i) Constructor + + thin-pool <metadata dev> <data dev> <data block size (sectors)> \ + <low water mark (blocks)> [<number of feature args> [<arg>]*] + + Optional feature arguments: + - 'skip_block_zeroing': skips the zeroing of newly-provisioned blocks. + + Data block size must be between 64KB (128 sectors) and 1GB + (2097152 sectors) inclusive. + + +ii) Status + + <transaction id> <used metadata blocks>/<total metadata blocks> + <used data blocks>/<total data blocks> <held metadata root> + + + transaction id: + A 64-bit number used by userspace to help synchronise with metadata + from volume managers. + + used data blocks / total data blocks + If the number of free blocks drops below the pool's low water mark a + dm event will be sent to userspace. This event is edge-triggered and + it will occur only once after each resume so volume manager writers + should register for the event and then check the target's status. + + held metadata root: + The location, in sectors, of the metadata root that has been + 'held' for userspace read access. '-' indicates there is no + held root. This feature is not yet implemented so '-' is + always returned. + +iii) Messages + + create_thin <dev id> + + Create a new thinly-provisioned device. + <dev id> is an arbitrary unique 24-bit identifier chosen by + the caller. + + create_snap <dev id> <origin id> + + Create a new snapshot of another thinly-provisioned device. + <dev id> is an arbitrary unique 24-bit identifier chosen by + the caller. + <origin id> is the identifier of the thinly-provisioned device + of which the new device will be a snapshot. + + delete <dev id> + + Deletes a thin device. Irreversible. + + trim <dev id> <new size in sectors> + + Delete mappings from the end of a thin device. Irreversible. + You might want to use this if you're reducing the size of + your thinly-provisioned device. In many cases, due to the + sharing of blocks between devices, it is not possible to + determine in advance how much space 'trim' will release. (In + future a userspace tool might be able to perform this + calculation.) + + set_transaction_id <current id> <new id> + + Userland volume managers, such as LVM, need a way to + synchronise their external metadata with the internal metadata of the + pool target. The thin-pool target offers to store an + arbitrary 64-bit transaction id and return it on the target's + status line. To avoid races you must provide what you think + the current transaction id is when you change it with this + compare-and-swap message. + +'thin' target +------------- + +i) Constructor + + thin <pool dev> <dev id> + + pool dev: + the thin-pool device, e.g. /dev/mapper/my_pool or 253:0 + + dev id: + the internal device identifier of the device to be + activated. + +The pool doesn't store any size against the thin devices. If you +load a thin target that is smaller than you've been using previously, +then you'll have no access to blocks mapped beyond the end. If you +load a target that is bigger than before, then extra blocks will be +provisioned as and when needed. + +If you wish to reduce the size of your thin device and potentially +regain some space then send the 'trim' message to the pool. + +ii) Status + + <nr mapped sectors> <highest mapped sector> diff --git a/Documentation/devicetree/bindings/ata/calxeda-sata.txt b/Documentation/devicetree/bindings/ata/calxeda-sata.txt new file mode 100644 index 000000000000..79caa5651f53 --- /dev/null +++ b/Documentation/devicetree/bindings/ata/calxeda-sata.txt @@ -0,0 +1,17 @@ +* Calxeda SATA Controller + +SATA nodes are defined to describe on-chip Serial ATA controllers. +Each SATA controller should have its own node. + +Required properties: +- compatible : compatible list, contains "calxeda,hb-ahci" +- interrupts : <interrupt mapping for SATA IRQ> +- reg : <registers mapping> + +Example: + sata@ffe08000 { + compatible = "calxeda,hb-ahci"; + reg = <0xffe08000 0x1000>; + interrupts = <115>; + }; + diff --git a/Documentation/devicetree/bindings/powerpc/fsl/board.txt b/Documentation/devicetree/bindings/powerpc/fsl/board.txt index 39e941515a36..380914e965e0 100644 --- a/Documentation/devicetree/bindings/powerpc/fsl/board.txt +++ b/Documentation/devicetree/bindings/powerpc/fsl/board.txt @@ -1,3 +1,8 @@ +Freescale Reference Board Bindings + +This document describes device tree bindings for various devices that +exist on some Freescale reference boards. + * Board Control and Status (BCSR) Required properties: @@ -12,25 +17,26 @@ Example: reg = <f8000000 8000>; }; -* Freescale on board FPGA +* Freescale on-board FPGA This is the memory-mapped registers for on board FPGA. Required properities: -- compatible : should be "fsl,fpga-pixis". -- reg : should contain the address and the length of the FPPGA register - set. +- compatible: should be a board-specific string followed by a string + indicating the type of FPGA. Example: + "fsl,<board>-fpga", "fsl,fpga-pixis" +- reg: should contain the address and the length of the FPGA register set. - interrupt-parent: should specify phandle for the interrupt controller. -- interrupts : should specify event (wakeup) IRQ. +- interrupts: should specify event (wakeup) IRQ. -Example (MPC8610HPCD): +Example (P1022DS): - board-control@e8000000 { - compatible = "fsl,fpga-pixis"; - reg = <0xe8000000 32>; - interrupt-parent = <&mpic>; - interrupts = <8 8>; - }; + board-control@3,0 { + compatible = "fsl,p1022ds-fpga", "fsl,fpga-ngpixis"; + reg = <3 0 0x30>; + interrupt-parent = <&mpic>; + interrupts = <8 8 0 0>; + }; * Freescale BCSR GPIO banks diff --git a/Documentation/devicetree/bindings/powerpc/fsl/dcsr.txt b/Documentation/devicetree/bindings/powerpc/fsl/dcsr.txt new file mode 100644 index 000000000000..9d54eb5a295f --- /dev/null +++ b/Documentation/devicetree/bindings/powerpc/fsl/dcsr.txt @@ -0,0 +1,395 @@ +=================================================================== +Debug Control and Status Register (DCSR) Binding +Copyright 2011 Freescale Semiconductor Inc. + +NOTE: The bindings described in this document are preliminary and subject +to change. Some of the compatible strings that contain only generic names +may turn out to be inappropriate, or need additional properties to describe +the integration of the block with the rest of the chip. + +===================================================================== +Debug Control and Status Register Memory Map + +Description + +This node defines the base address and range for the +defined DCSR Memory Map. Child nodes will describe the individual +debug blocks defined within this memory space. + +PROPERTIES + + - compatible + Usage: required + Value type: <string> + Definition: Must include "fsl,dcsr" and "simple-bus". + The DCSR space exists in the memory-mapped bus. + + - #address-cells + Usage: required + Value type: <u32> + Definition: A standard property. Defines the number of cells + or representing physical addresses in child nodes. + + - #size-cells + Usage: required + Value type: <u32> + Definition: A standard property. Defines the number of cells + or representing the size of physical addresses in + child nodes. + + - ranges + Usage: required + Value type: <prop-encoded-array> + Definition: A standard property. Specifies the physical address + range of the DCSR space. + +EXAMPLE + dcsr: dcsr@f00000000 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,dcsr", "simple-bus"; + ranges = <0x00000000 0xf 0x00000000 0x01008000>; + }; + +===================================================================== +Event Processing Unit + +This node represents the region of DCSR space allocated to the EPU + +PROPERTIES + + - compatible + Usage: required + Value type: <string> + Definition: Must include "fsl,dcsr-epu" + + - interrupts + Usage: required + Value type: <prop_encoded-array> + Definition: Specifies the interrupts generated by the EPU. + The value of the interrupts property consists of three + interrupt specifiers. The format of the specifier is defined + by the binding document describing the node's interrupt parent. + + The EPU counters can be configured to assert the performance + monitor interrupt signal based on either counter overflow or value + match. Which counter asserted the interrupt is captured in an EPU + Counter Interrupt Status Register (EPCPUISR). + + The EPU unit can also be configured to assert either or both of + two interrupt signals based on debug event sources within the SoC. + The interrupt signals are epu_xt_int0 and epu_xt_int1. + Which event source asserted the interrupt is captured in an EPU + Interrupt Status Register (EPISR0,EPISR1). + + Interrupt numbers are lised in order (perfmon, event0, event1). + + - interrupt-parent + Usage: required + Value type: <phandle> + Definition: A single <phandle> value that points + to the interrupt parent to which the child domain + is being mapped. Value must be "&mpic" + + - reg + Usage: required + Value type: <prop-encoded-array> + Definition: A standard property. Specifies the physical address + offset and length of the DCSR space registers of the device + configuration block. + +EXAMPLE + dcsr-epu@0 { + compatible = "fsl,dcsr-epu"; + interrupts = <52 2 0 0 + 84 2 0 0 + 85 2 0 0>; + interrupt-parent = <&mpic>; + reg = <0x0 0x1000>; + }; + +======================================================================= +Nexus Port Controller + +This node represents the region of DCSR space allocated to the NPC + +PROPERTIES + + - compatible + Usage: required + Value type: <string> + Definition: Must include "fsl,dcsr-npc" + + - reg + Usage: required + Value type: <prop-encoded-array> + Definition: A standard property. Specifies the physical address + offset and length of the DCSR space registers of the device + configuration block. + The Nexus Port controller occupies two regions in the DCSR space + with distinct functionality. + + The first register range describes the Nexus Port Controller + control and status registers. + + The second register range describes the Nexus Port Controller + internal trace buffer. The NPC trace buffer is a small memory buffer + which stages the nexus trace data for transmission via the Aurora port + or to a DDR based trace buffer. In some configurations the NPC trace + buffer can be the only trace buffer used. + + +EXAMPLE + dcsr-npc { + compatible = "fsl,dcsr-npc"; + reg = <0x1000 0x1000 0x1000000 0x8000>; + }; + +======================================================================= +Nexus Concentrator + +This node represents the region of DCSR space allocated to the NXC + +PROPERTIES + + - compatible + Usage: required + Value type: <string> + Definition: Must include "fsl,dcsr-nxc" + + - reg + Usage: required + Value type: <prop-encoded-array> + Definition: A standard property. Specifies the physical address + offset and length of the DCSR space registers of the device + configuration block. + +EXAMPLE + dcsr-nxc@2000 { + compatible = "fsl,dcsr-nxc"; + reg = <0x2000 0x1000>; + }; +======================================================================= +CoreNet Debug Controller + +This node represents the region of DCSR space allocated to +the CoreNet Debug controller. + +PROPERTIES + + - compatible + Usage: required + Value type: <string> + Definition: Must include "fsl,dcsr-corenet" + + - reg + Usage: required + Value type: <prop-encoded-array> + Definition: A standard property. Specifies the physical address + offset and length of the DCSR space registers of the device + configuration block. + The CoreNet Debug controller occupies two regions in the DCSR space + with distinct functionality. + + The first register range describes the CoreNet Debug Controller + functionalty to perform transaction and transaction attribute matches. + + The second register range describes the CoreNet Debug Controller + functionalty to trigger event notifications and debug traces. + +EXAMPLE + dcsr-corenet { + compatible = "fsl,dcsr-corenet"; + reg = <0x8000 0x1000 0xB0000 0x1000>; + }; + +======================================================================= +Data Path Debug controller + +This node represents the region of DCSR space allocated to +the DPAA Debug Controller. This controller controls debug configuration +for the QMAN and FMAN blocks. + +PROPERTIES + + - compatible + Usage: required + Value type: <string> + Definition: Must include both an identifier specific to the SoC + or Debug IP of the form "fsl,<soc>-dcsr-dpaa" in addition to the + generic compatible string "fsl,dcsr-dpaa". + + - reg + Usage: required + Value type: <prop-encoded-array> + Definition: A standard property. Specifies the physical address + offset and length of the DCSR space registers of the device + configuration block. + +EXAMPLE + dcsr-dpaa@9000 { + compatible = "fsl,p4080-dcsr-dpaa", "fsl,dcsr-dpaa"; + reg = <0x9000 0x1000>; + }; + +======================================================================= +OCeaN Debug controller + +This node represents the region of DCSR space allocated to +the OCN Debug Controller. + +PROPERTIES + + - compatible + Usage: required + Value type: <string> + Definition: Must include both an identifier specific to the SoC + or Debug IP of the form "fsl,<soc>-dcsr-ocn" in addition to the + generic compatible string "fsl,dcsr-ocn". + + - reg + Usage: required + Value type: <prop-encoded-array> + Definition: A standard property. Specifies the physical address + offset and length of the DCSR space registers of the device + configuration block. + +EXAMPLE + dcsr-ocn@11000 { + compatible = "fsl,p4080-dcsr-ocn", "fsl,dcsr-ocn"; + reg = <0x11000 0x1000>; + }; + +======================================================================= +DDR Controller Debug controller + +This node represents the region of DCSR space allocated to +the OCN Debug Controller. + +PROPERTIES + + - compatible + Usage: required + Value type: <string> + Definition: Must include "fsl,dcsr-ddr" + + - dev-handle + Usage: required + Definition: A phandle to associate this debug node with its + component controller. + + - reg + Usage: required + Value type: <prop-encoded-array> + Definition: A standard property. Specifies the physical address + offset and length of the DCSR space registers of the device + configuration block. + +EXAMPLE + dcsr-ddr@12000 { + compatible = "fsl,dcsr-ddr"; + dev-handle = <&ddr1>; + reg = <0x12000 0x1000>; + }; + +======================================================================= +Nexus Aurora Link Controller + +This node represents the region of DCSR space allocated to +the NAL Controller. + +PROPERTIES + + - compatible + Usage: required + Value type: <string> + Definition: Must include both an identifier specific to the SoC + or Debug IP of the form "fsl,<soc>-dcsr-nal" in addition to the + generic compatible string "fsl,dcsr-nal". + + - reg + Usage: required + Value type: <prop-encoded-array> + Definition: A standard property. Specifies the physical address + offset and length of the DCSR space registers of the device + configuration block. + +EXAMPLE + dcsr-nal@18000 { + compatible = "fsl,p4080-dcsr-nal", "fsl,dcsr-nal"; + reg = <0x18000 0x1000>; + }; + + +======================================================================= +Run Control and Power Management + +This node represents the region of DCSR space allocated to +the RCPM Debug Controller. This functionlity is limited to the +control the debug operations of the SoC and cores. + +PROPERTIES + + - compatible + Usage: required + Value type: <string> + Definition: Must include both an identifier specific to the SoC + or Debug IP of the form "fsl,<soc>-dcsr-rcpm" in addition to the + generic compatible string "fsl,dcsr-rcpm". + + - reg + Usage: required + Value type: <prop-encoded-array> + Definition: A standard property. Specifies the physical address + offset and length of the DCSR space registers of the device + configuration block. + +EXAMPLE + dcsr-rcpm@22000 { + compatible = "fsl,p4080-dcsr-rcpm", "fsl,dcsr-rcpm"; + reg = <0x22000 0x1000>; + }; + +======================================================================= +Core Service Bridge Proxy + +This node represents the region of DCSR space allocated to +the Core Service Bridge Proxies. +There is one Core Service Bridge Proxy device for each CPU in the system. +This functionlity provides access to the debug operations of the CPU. + +PROPERTIES + + - compatible + Usage: required + Value type: <string> + Definition: Must include both an identifier specific to the cpu + of the form "fsl,dcsr-<cpu>-sb-proxy" in addition to the + generic compatible string "fsl,dcsr-cpu-sb-proxy". + + - cpu-handle + Usage: required + Definition: A phandle to associate this debug node with its cpu. + + - reg + Usage: required + Value type: <prop-encoded-array> + Definition: A standard property. Specifies the physical address + offset and length of the DCSR space registers of the device + configuration block. + +EXAMPLE + dcsr-cpu-sb-proxy@40000 { + compatible = "fsl,dcsr-e500mc-sb-proxy", + "fsl,dcsr-cpu-sb-proxy"; + cpu-handle = <&cpu0>; + reg = <0x40000 0x1000>; + }; + dcsr-cpu-sb-proxy@41000 { + compatible = "fsl,dcsr-e500mc-sb-proxy", + "fsl,dcsr-cpu-sb-proxy"; + cpu-handle = <&cpu1>; + reg = <0x41000 0x1000>; + }; + +======================================================================= diff --git a/Documentation/devicetree/bindings/powerpc/fsl/msi-pic.txt b/Documentation/devicetree/bindings/powerpc/fsl/msi-pic.txt index 70558c3f3682..5d586e1ccaf5 100644 --- a/Documentation/devicetree/bindings/powerpc/fsl/msi-pic.txt +++ b/Documentation/devicetree/bindings/powerpc/fsl/msi-pic.txt @@ -25,6 +25,16 @@ Required properties: are routed to IPIC, and for 85xx/86xx cpu the interrupts are routed to MPIC. +Optional properties: +- msi-address-64: 64-bit PCI address of the MSIIR register. The MSIIR register + is used for MSI messaging. The address of MSIIR in PCI address space is + the MSI message address. + + This property may be used in virtualized environments where the hypervisor + has created an alternate mapping for the MSIR block. See below for an + explanation. + + Example: msi@41600 { compatible = "fsl,mpc8610-msi", "fsl,mpic-msi"; @@ -41,3 +51,35 @@ Example: 0xe7 0>; interrupt-parent = <&mpic>; }; + +The Freescale hypervisor and msi-address-64 +------------------------------------------- +Normally, PCI devices have access to all of CCSR via an ATMU mapping. The +Freescale MSI driver calculates the address of MSIIR (in the MSI register +block) and sets that address as the MSI message address. + +In a virtualized environment, the hypervisor may need to create an IOMMU +mapping for MSIIR. The Freescale ePAPR hypervisor has this requirement +because of hardware limitations of the Peripheral Access Management Unit +(PAMU), which is currently the only IOMMU that the hypervisor supports. +The ATMU is programmed with the guest physical address, and the PAMU +intercepts transactions and reroutes them to the true physical address. + +In the PAMU, each PCI controller is given only one primary window. The +PAMU restricts DMA operations so that they can only occur within a window. +Because PCI devices must be able to DMA to memory, the primary window must +be used to cover all of the guest's memory space. + +PAMU primary windows can be divided into 256 subwindows, and each +subwindow can have its own address mapping ("guest physical" to "true +physical"). However, each subwindow has to have the same alignment, which +means they cannot be located at just any address. Because of these +restrictions, it is usually impossible to create a 4KB subwindow that +covers MSIIR where it's normally located. + +Therefore, the hypervisor has to create a subwindow inside the same +primary window used for memory, but mapped to the MSIR block (where MSIIR +lives). The first subwindow after the end of guest memory is used for +this. The address specified in the msi-address-64 property is the PCI +address of MSIIR. The hypervisor configures the PAMU to map that address to +the true physical address of MSIIR. diff --git a/Documentation/devicetree/bindings/virtio/mmio.txt b/Documentation/devicetree/bindings/virtio/mmio.txt new file mode 100644 index 000000000000..5069c1b8e193 --- /dev/null +++ b/Documentation/devicetree/bindings/virtio/mmio.txt @@ -0,0 +1,17 @@ +* virtio memory mapped device + +See http://ozlabs.org/~rusty/virtio-spec/ for more details. + +Required properties: + +- compatible: "virtio,mmio" compatibility string +- reg: control registers base address and size including configuration space +- interrupts: interrupt generated by the device + +Example: + + virtio_block@3000 { + compatible = "virtio,mmio"; + reg = <0x3000 0x100>; + interrupts = <41>; + } diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 7c799fc5b88e..3d849122b5b1 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -133,41 +133,6 @@ Who: Pavel Machek <pavel@ucw.cz> --------------------------- -What: sys_sysctl -When: September 2010 -Option: CONFIG_SYSCTL_SYSCALL -Why: The same information is available in a more convenient from - /proc/sys, and none of the sysctl variables appear to be - important performance wise. - - Binary sysctls are a long standing source of subtle kernel - bugs and security issues. - - When I looked several months ago all I could find after - searching several distributions were 5 user space programs and - glibc (which falls back to /proc/sys) using this syscall. - - The man page for sysctl(2) documents it as unusable for user - space programs. - - sysctl(2) is not generally ABI compatible to a 32bit user - space application on a 64bit and a 32bit kernel. - - For the last several months the policy has been no new binary - sysctls and no one has put forward an argument to use them. - - Binary sysctls issues seem to keep happening appearing so - properly deprecating them (with a warning to user space) and a - 2 year grace warning period will mean eventually we can kill - them and end the pain. - - In the mean time individual binary sysctls can be dealt with - in a piecewise fashion. - -Who: Eric Biederman <ebiederm@xmission.com> - ---------------------------- - What: /proc/<pid>/oom_adj When: August 2012 Why: /proc/<pid>/oom_adj allows userspace to influence the oom killer's diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 653380793a6c..d819ba16a0c7 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -29,6 +29,7 @@ d_hash no no no maybe d_compare: yes no no maybe d_delete: no yes no no d_release: no no yes no +d_prune: no yes no no d_iput: no no yes no d_dname: no no no no d_automount: no no yes no diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index 22f3a0eda1d2..b100adc38adb 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt @@ -73,14 +73,6 @@ nobarrier (*) This also requires an IO stack which can support also be used to enable or disable barriers, for consistency with other ext3 mount options. -orlov (*) This enables the new Orlov block allocator. It is - enabled by default. - -oldalloc This disables the Orlov block allocator and enables - the old block allocator. Orlov should have better - performance - we'd like to get some feedback if it's - the contrary for you. - user_xattr Enables Extended User Attributes. Additionally, you need to have extended attribute support enabled in the kernel configuration (CONFIG_EXT3_FS_XATTR). See the diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 232a575a0c48..4917cf24a5e0 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -160,7 +160,9 @@ noload if the filesystem was not unmounted cleanly, lead to any number of problems. data=journal All data are committed into the journal prior to being - written into the main file system. + written into the main file system. Enabling + this mode will disable delayed allocation and + O_DIRECT support. data=ordered (*) All data are forced directly out to the main file system prior to its metadata being committed to the @@ -201,30 +203,19 @@ inode_readahead_blks=n This tuning parameter controls the maximum table readahead algorithm will pre-read into the buffer cache. The default value is 32 blocks. -orlov (*) This enables the new Orlov block allocator. It is - enabled by default. - -oldalloc This disables the Orlov block allocator and enables - the old block allocator. Orlov should have better - performance - we'd like to get some feedback if it's - the contrary for you. - -user_xattr Enables Extended User Attributes. Additionally, you - need to have extended attribute support enabled in the - kernel configuration (CONFIG_EXT4_FS_XATTR). See the - attr(5) manual page and http://acl.bestbits.at/ to - learn more about extended attributes. - -nouser_xattr Disables Extended User Attributes. - -acl Enables POSIX Access Control Lists support. - Additionally, you need to have ACL support enabled in - the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL). - See the acl(5) manual page and http://acl.bestbits.at/ - for more information. +nouser_xattr Disables Extended User Attributes. If you have extended + attribute support enabled in the kernel configuration + (CONFIG_EXT4_FS_XATTR), extended attribute support + is enabled by default on mount. See the attr(5) manual + page and http://acl.bestbits.at/ for more information + about extended attributes. noacl This option disables POSIX Access Control List - support. + support. If ACL support is enabled in the kernel + configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL is + enabled by default on mount. See the acl(5) manual + page and http://acl.bestbits.at/ for more information + about acl. bsddf (*) Make 'df' act like BSD. minixdf Make 'df' act like Minix. @@ -419,8 +410,8 @@ written to the journal first, and then to its final location. In the event of a crash, the journal can be replayed, bringing both data and metadata into a consistent state. This mode is the slowest except when data needs to be read from and written to disk at the same time where it -outperforms all others modes. Currently ext4 does not have delayed -allocation support if this data journalling mode is selected. +outperforms all others modes. Enabling this mode will disable delayed +allocation and O_DIRECT support. /proc entries ============= diff --git a/Documentation/filesystems/hfs.txt b/Documentation/filesystems/hfs.txt index bd0fa7704035..d096df6db07a 100644 --- a/Documentation/filesystems/hfs.txt +++ b/Documentation/filesystems/hfs.txt @@ -1,3 +1,4 @@ +Note: This filesystem doesn't have a maintainer. Macintosh HFS Filesystem for Linux ================================== @@ -76,8 +77,6 @@ hformat that can be used to create HFS filesystem. See Credits ======= -The HFS drivers was written by Paul H. Hargrovea (hargrove@sccm.Stanford.EDU) -and is now maintained by Roman Zippel (roman@ardistech.com) at Ardis -Technologies. -Roman rewrote large parts of the code and brought in btree routines derived -from Brad Boyer's hfsplus driver (also maintained by Roman now). +The HFS drivers was written by Paul H. Hargrovea (hargrove@sccm.Stanford.EDU). +Roman Zippel (roman@ardistech.com) rewrote large parts of the code and brought +in btree routines derived from Brad Boyer's hfsplus driver. diff --git a/Documentation/filesystems/inotify.txt b/Documentation/filesystems/inotify.txt index 59a919f16144..cfd02712b83e 100644 --- a/Documentation/filesystems/inotify.txt +++ b/Documentation/filesystems/inotify.txt @@ -194,7 +194,8 @@ associated with the inotify_handle, and on which events are queued. Each watch is associated with an inotify_watch structure. Watches are chained off of each associated inotify_handle and each associated inode. -See fs/inotify.c and fs/inotify_user.c for the locking and lifetime rules. +See fs/notify/inotify/inotify_fsnotify.c and fs/notify/inotify/inotify_user.c +for the locking and lifetime rules. (vi) Rationale diff --git a/Documentation/hwmon/w83627ehf b/Documentation/hwmon/w83627ehf index 76ffef94ed75..3f44dbdfda70 100644 --- a/Documentation/hwmon/w83627ehf +++ b/Documentation/hwmon/w83627ehf @@ -14,6 +14,10 @@ Supported chips: Prefix: 'w83627dhg' Addresses scanned: ISA address retrieved from Super I/O registers Datasheet: not available + * Winbond W83627UHG + Prefix: 'w83627uhg' + Addresses scanned: ISA address retrieved from Super I/O registers + Datasheet: available from www.nuvoton.com * Winbond W83667HG Prefix: 'w83667hg' Addresses scanned: ISA address retrieved from Super I/O registers @@ -42,14 +46,13 @@ Description ----------- This driver implements support for the Winbond W83627EHF, W83627EHG, -W83627DHG, W83627DHG-P, W83667HG, W83667HG-B, W83667HG-I (NCT6775F), -and NCT6776F super I/O chips. We will refer to them collectively as -Winbond chips. - -The chips implement three temperature sensors (up to four for 667HG-B, and nine -for NCT6775F and NCT6776F), five fan rotation speed sensors, ten analog voltage -sensors (only nine for the 627DHG), one VID (6 pins for the 627EHF/EHG, 8 pins -for the 627DHG and 667HG), alarms with beep warnings (control unimplemented), +W83627DHG, W83627DHG-P, W83627UHG, W83667HG, W83667HG-B, W83667HG-I +(NCT6775F), and NCT6776F super I/O chips. We will refer to them collectively +as Winbond chips. + +The chips implement 2 to 4 temperature sensors (9 for NCT6775F and NCT6776F), +2 to 5 fan rotation speed sensors, 8 to 10 analog voltage sensors, one VID +(except for 627UHG), alarms with beep warnings (control unimplemented), and some automatic fan regulation strategies (plus manual fan control mode). The temperature sensor sources on W82677HG-B, NCT6775F, and NCT6776F are @@ -86,17 +89,16 @@ follows: temp1 -> pwm1 temp2 -> pwm2 -temp3 -> pwm3 +temp3 -> pwm3 (not on 627UHG) prog -> pwm4 (not on 667HG and 667HG-B; the programmable setting is not supported by the driver) /sys files ---------- -name - this is a standard hwmon device entry. For the W83627EHF and W83627EHG, - it is set to "w83627ehf", for the W83627DHG it is set to "w83627dhg", - for the W83667HG and W83667HG-B it is set to "w83667hg", for NCT6775F it - is set to "nct6775", and for NCT6776F it is set to "nct6776". +name - this is a standard hwmon device entry, it contains the name of + the device (see the prefix in the list of supported devices at + the top of this file) pwm[1-4] - this file stores PWM duty cycle or DC value (fan speed) in range: 0 (stop) to 255 (full) diff --git a/Documentation/hwspinlock.txt b/Documentation/hwspinlock.txt index 7dcd1a4e726c..a903ee5e9776 100644 --- a/Documentation/hwspinlock.txt +++ b/Documentation/hwspinlock.txt @@ -39,23 +39,20 @@ independent, drivers. in case an unused hwspinlock isn't available. Users of this API will usually want to communicate the lock's id to the remote core before it can be used to achieve synchronization. - Can be called from an atomic context (this function will not sleep) but - not from within interrupt context. + Should be called from a process context (might sleep). struct hwspinlock *hwspin_lock_request_specific(unsigned int id); - assign a specific hwspinlock id and return its address, or NULL if that hwspinlock is already in use. Usually board code will be calling this function in order to reserve specific hwspinlock ids for predefined purposes. - Can be called from an atomic context (this function will not sleep) but - not from within interrupt context. + Should be called from a process context (might sleep). int hwspin_lock_free(struct hwspinlock *hwlock); - free a previously-assigned hwspinlock; returns 0 on success, or an appropriate error code on failure (e.g. -EINVAL if the hwspinlock is already free). - Can be called from an atomic context (this function will not sleep) but - not from within interrupt context. + Should be called from a process context (might sleep). int hwspin_lock_timeout(struct hwspinlock *hwlock, unsigned int timeout); - lock a previously-assigned hwspinlock with a timeout limit (specified in @@ -230,45 +227,62 @@ int hwspinlock_example2(void) 4. API for implementors - int hwspin_lock_register(struct hwspinlock *hwlock); + int hwspin_lock_register(struct hwspinlock_device *bank, struct device *dev, + const struct hwspinlock_ops *ops, int base_id, int num_locks); - to be called from the underlying platform-specific implementation, in - order to register a new hwspinlock instance. Can be called from an atomic - context (this function will not sleep) but not from within interrupt - context. Returns 0 on success, or appropriate error code on failure. + order to register a new hwspinlock device (which is usually a bank of + numerous locks). Should be called from a process context (this function + might sleep). + Returns 0 on success, or appropriate error code on failure. - struct hwspinlock *hwspin_lock_unregister(unsigned int id); + int hwspin_lock_unregister(struct hwspinlock_device *bank); - to be called from the underlying vendor-specific implementation, in order - to unregister an existing (and unused) hwspinlock instance. - Can be called from an atomic context (will not sleep) but not from - within interrupt context. + to unregister an hwspinlock device (which is usually a bank of numerous + locks). + Should be called from a process context (this function might sleep). Returns the address of hwspinlock on success, or NULL on error (e.g. if the hwspinlock is sill in use). -5. struct hwspinlock +5. Important structs -This struct represents an hwspinlock instance. It is registered by the -underlying hwspinlock implementation using the hwspin_lock_register() API. +struct hwspinlock_device is a device which usually contains a bank +of hardware locks. It is registered by the underlying hwspinlock +implementation using the hwspin_lock_register() API. /** - * struct hwspinlock - vendor-specific hwspinlock implementation - * - * @dev: underlying device, will be used with runtime PM api - * @ops: vendor-specific hwspinlock handlers - * @id: a global, unique, system-wide, index of the lock. - * @lock: initialized and used by hwspinlock core - * @owner: underlying implementation module, used to maintain module ref count + * struct hwspinlock_device - a device which usually spans numerous hwspinlocks + * @dev: underlying device, will be used to invoke runtime PM api + * @ops: platform-specific hwspinlock handlers + * @base_id: id index of the first lock in this device + * @num_locks: number of locks in this device + * @lock: dynamically allocated array of 'struct hwspinlock' */ -struct hwspinlock { +struct hwspinlock_device { struct device *dev; const struct hwspinlock_ops *ops; - int id; + int base_id; + int num_locks; + struct hwspinlock lock[0]; +}; + +struct hwspinlock_device contains an array of hwspinlock structs, each +of which represents a single hardware lock: + +/** + * struct hwspinlock - this struct represents a single hwspinlock instance + * @bank: the hwspinlock_device structure which owns this lock + * @lock: initialized and used by hwspinlock core + * @priv: private data, owned by the underlying platform-specific hwspinlock drv + */ +struct hwspinlock { + struct hwspinlock_device *bank; spinlock_t lock; - struct module *owner; + void *priv; }; -The underlying implementation is responsible to assign the dev, ops, id and -owner members. The lock member, OTOH, is initialized and used by the hwspinlock -core. +When registering a bank of locks, the hwspinlock driver only needs to +set the priv members of the locks. The rest of the members are set and +initialized by the hwspinlock core itself. 6. Implementation callbacks diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt index 3ff0dad62d36..9d666828915a 100644 --- a/Documentation/laptops/thinkpad-acpi.txt +++ b/Documentation/laptops/thinkpad-acpi.txt @@ -411,9 +411,9 @@ event code Key Notes 0x1004 0x03 FN+F4 Sleep button (ACPI sleep button semantics, i.e. sleep-to-RAM). - It is always generate some kind + It always generates some kind of event, either the hot key - event or a ACPI sleep button + event or an ACPI sleep button event. The firmware may refuse to generate further FN+F4 key presses until a S3 or S4 ACPI diff --git a/Documentation/leds/leds-class.txt b/Documentation/leds/leds-class.txt index 4996586e27e8..79699c200766 100644 --- a/Documentation/leds/leds-class.txt +++ b/Documentation/leds/leds-class.txt @@ -61,8 +61,8 @@ Hardware accelerated blink of LEDs Some LEDs can be programmed to blink without any CPU interaction. To support this feature, a LED driver can optionally implement the blink_set() function (see <linux/leds.h>). To set an LED to blinking, -however, it is better to use use the API function led_blink_set(), -as it will check and implement software fallback if necessary. +however, it is better to use the API function led_blink_set(), as it +will check and implement software fallback if necessary. To turn off blinking again, use the API function led_brightness_set() as that will not just set the LED brightness but also stop any software diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt index 4ccdbca03811..f2a2488f1bf3 100644 --- a/Documentation/networking/ipvs-sysctl.txt +++ b/Documentation/networking/ipvs-sysctl.txt @@ -15,6 +15,23 @@ amemthresh - INTEGER enabled and the variable is automatically set to 2, otherwise the strategy is disabled and the variable is set to 1. +conntrack - BOOLEAN + 0 - disabled (default) + not 0 - enabled + + If set, maintain connection tracking entries for + connections handled by IPVS. + + This should be enabled if connections handled by IPVS are to be + also handled by stateful firewall rules. That is, iptables rules + that make use of connection tracking. It is a performance + optimisation to disable this setting otherwise. + + Connections handled by the IPVS FTP application module + will have connection tracking entries regardless of this setting. + + Only available when IPVS is compiled with CONFIG_IP_VS_NFCT enabled. + cache_bypass - BOOLEAN 0 - disabled (default) not 0 - enabled @@ -39,7 +56,7 @@ debug_level - INTEGER 11 - IPVS packet handling (ip_vs_in/ip_vs_out) 12 or more - packet traversal - Only available when IPVS is compiled with the CONFIG_IPVS_DEBUG + Only available when IPVS is compiled with CONFIG_IP_VS_DEBUG enabled. Higher debugging levels include the messages for lower debugging levels, so setting debug level 2, includes level 0, 1 and 2 @@ -123,13 +140,11 @@ nat_icmp_send - BOOLEAN secure_tcp - INTEGER 0 - disabled (default) - The secure_tcp defense is to use a more complicated state - transition table and some possible short timeouts of each - state. In the VS/NAT, it delays the entering the ESTABLISHED - until the real server starts to send data and ACK packet - (after 3-way handshake). + The secure_tcp defense is to use a more complicated TCP state + transition table. For VS/NAT, it also delays entering the + TCP ESTABLISHED state until the three way handshake is completed. - The value definition is the same as that of drop_entry or + The value definition is the same as that of drop_entry and drop_packet. sync_threshold - INTEGER @@ -141,3 +156,36 @@ sync_threshold - INTEGER synchronized, every time the number of its incoming packets modulus 50 equals the threshold. The range of the threshold is from 0 to 49. + +snat_reroute - BOOLEAN + 0 - disabled + not 0 - enabled (default) + + If enabled, recalculate the route of SNATed packets from + realservers so that they are routed as if they originate from the + director. Otherwise they are routed as if they are forwarded by the + director. + + If policy routing is in effect then it is possible that the route + of a packet originating from a director is routed differently to a + packet being forwarded by the director. + + If policy routing is not in effect then the recalculated route will + always be the same as the original route so it is an optimisation + to disable snat_reroute and avoid the recalculation. + +sync_version - INTEGER + default 1 + + The version of the synchronisation protocol used when sending + synchronisation messages. + + 0 selects the original synchronisation protocol (version 0). This + should be used when sending synchronisation messages to a legacy + system that only understands the original synchronisation protocol. + + 1 selects the current synchronisation protocol (version 1). This + should be used where possible. + + Kernels with this sync_version entry are able to receive messages + of both version 1 and version 2 of the synchronisation protocol. diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt index 6fe9001b9263..13032c0140d4 100644 --- a/Documentation/oops-tracing.txt +++ b/Documentation/oops-tracing.txt @@ -263,6 +263,8 @@ characters, each representing a particular tainted value. 12: 'I' if the kernel is working around a severe bug in the platform firmware (BIOS or similar). + 13: 'O' if an externally-built ("out-of-tree") module has been loaded. + The primary reason for the 'Tainted: ' string is to tell kernel debuggers if this is a clean kernel or if anything unusual has occurred. Tainting is permanent: even if an offending module is diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt index 38b57248fd61..316c2ba187f4 100644 --- a/Documentation/power/freezing-of-tasks.txt +++ b/Documentation/power/freezing-of-tasks.txt @@ -22,12 +22,12 @@ try_to_freeze_tasks() that sets TIF_FREEZE for all of the freezable tasks and either wakes them up, if they are kernel threads, or sends fake signals to them, if they are user space processes. A task that has TIF_FREEZE set, should react to it by calling the function called refrigerator() (defined in -kernel/power/process.c), which sets the task's PF_FROZEN flag, changes its state +kernel/freezer.c), which sets the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is cleared for it. Then, we say that the task is 'frozen' and therefore the set of functions handling this mechanism is referred to as 'the freezer' (these functions are -defined in kernel/power/process.c and include/linux/freezer.h). User space -processes are generally frozen before kernel threads. +defined in kernel/power/process.c, kernel/freezer.c & include/linux/freezer.h). +User space processes are generally frozen before kernel threads. It is not recommended to call refrigerator() directly. Instead, it is recommended to use the try_to_freeze() function (defined in @@ -95,7 +95,7 @@ after the memory for the image has been freed, we don't want tasks to allocate additional memory and we prevent them from doing that by freezing them earlier. [Of course, this also means that device drivers should not allocate substantial amounts of memory from their .suspend() callbacks before hibernation, but this -is e separate issue.] +is a separate issue.] 3. The third reason is to prevent user space processes and some kernel threads from interfering with the suspending and resuming of devices. A user space diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt index 0e856088db7c..5336149f831b 100644 --- a/Documentation/power/runtime_pm.txt +++ b/Documentation/power/runtime_pm.txt @@ -789,6 +789,16 @@ will behave normally, not taking the autosuspend delay into account. Similarly, if the power.use_autosuspend field isn't set then the autosuspend helper functions will behave just like the non-autosuspend counterparts. +Under some circumstances a driver or subsystem may want to prevent a device +from autosuspending immediately, even though the usage counter is zero and the +autosuspend delay time has expired. If the ->runtime_suspend() callback +returns -EAGAIN or -EBUSY, and if the next autosuspend delay expiration time is +in the future (as it normally would be if the callback invoked +pm_runtime_mark_last_busy()), the PM core will automatically reschedule the +autosuspend. The ->runtime_suspend() callback can't do this rescheduling +itself because no suspend requests of any kind are accepted while the device is +suspending (i.e., while the callback is running). + The implementation is well suited for asynchronous use in interrupt contexts. However such use inevitably involves races, because the PM core can't synchronize ->runtime_suspend() callbacks with the arrival of I/O requests. diff --git a/Documentation/rapidio/rapidio.txt b/Documentation/rapidio/rapidio.txt index be70ee15f8ca..c75694b35d08 100644 --- a/Documentation/rapidio/rapidio.txt +++ b/Documentation/rapidio/rapidio.txt @@ -144,7 +144,7 @@ and the default device ID in order to access the device on the active port. After the host has completed enumeration of the entire network it releases devices by clearing device ID locks (calls rio_clear_locks()). For each endpoint -in the system, it sets the Master Enable bit in the Port General Control CSR +in the system, it sets the Discovered bit in the Port General Control CSR to indicate that enumeration is completed and agents are allowed to execute passive discovery of the network. diff --git a/Documentation/rapidio/tsi721.txt b/Documentation/rapidio/tsi721.txt new file mode 100644 index 000000000000..335f3c6087dc --- /dev/null +++ b/Documentation/rapidio/tsi721.txt @@ -0,0 +1,49 @@ +RapidIO subsystem mport driver for IDT Tsi721 PCI Express-to-SRIO bridge. +========================================================================= + +I. Overview + +This driver implements all currently defined RapidIO mport callback functions. +It supports maintenance read and write operations, inbound and outbound RapidIO +doorbells, inbound maintenance port-writes and RapidIO messaging. + +To generate SRIO maintenance transactions this driver uses one of Tsi721 DMA +channels. This mechanism provides access to larger range of hop counts and +destination IDs without need for changes in outbound window translation. + +RapidIO messaging support uses dedicated messaging channels for each mailbox. +For inbound messages this driver uses destination ID matching to forward messages +into the corresponding message queue. Messaging callbacks are implemented to be +fully compatible with RIONET driver (Ethernet over RapidIO messaging services). + +II. Known problems + + None. + +III. To do + + Add DMA data transfers (non-messaging). + Add inbound region (SRIO-to-PCIe) mapping. + +IV. Version History + + 1.0.0 - Initial driver release. + +V. License +----------------------------------------------- + + Copyright(c) 2011 Integrated Device Technology, Inc. All rights reserved. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. diff --git a/Documentation/serial/computone.txt b/Documentation/serial/computone.txt index 60a6f657c37d..39ddcdbeeb85 100644 --- a/Documentation/serial/computone.txt +++ b/Documentation/serial/computone.txt @@ -20,8 +20,6 @@ Version: 1.2.14 Date: 11/01/2001 Historical Author: Andrew Manison <amanison@america.net> Primary Author: Doug McNash -Support: support@computone.com -Fixes and Updates: Mike Warfield <mhw@wittsend.com> This file assumes that you are using the Computone drivers which are integrated into the kernel sources. For updating the drivers or installing diff --git a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt b/Documentation/virtual/uml/UserModeLinux-HOWTO.txt index 5d0fc8bfcdb9..77dfecf4e2d6 100644 --- a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt +++ b/Documentation/virtual/uml/UserModeLinux-HOWTO.txt @@ -134,13 +134,13 @@ ______________________________________________________________________ - 11.. IInnttrroodduuccttiioonn + 1. Introduction Welcome to User Mode Linux. It's going to be fun. - 11..11.. HHooww iiss UUsseerr MMooddee LLiinnuuxx DDiiffffeerreenntt?? + 1.1. How is User Mode Linux Different? Normally, the Linux Kernel talks straight to your hardware (video card, keyboard, hard drives, etc), and any programs which run ask the @@ -181,7 +181,7 @@ - 11..22.. WWhhyy WWoouulldd II WWaanntt UUsseerr MMooddee LLiinnuuxx?? + 1.2. Why Would I Want User Mode Linux? 1. If User Mode Linux crashes, your host kernel is still fine. @@ -206,12 +206,12 @@ - 22.. CCoommppiilliinngg tthhee kkeerrnneell aanndd mmoodduulleess + 2. Compiling the kernel and modules - 22..11.. CCoommppiilliinngg tthhee kkeerrnneell + 2.1. Compiling the kernel Compiling the user mode kernel is just like compiling any other @@ -322,7 +322,7 @@ bug fixes and enhancements that have gone into subsequent releases. - 22..22.. CCoommppiilliinngg aanndd iinnssttaalllliinngg kkeerrnneell mmoodduulleess + 2.2. Compiling and installing kernel modules UML modules are built in the same way as the native kernel (with the exception of the 'ARCH=um' that you always need for UML): @@ -386,19 +386,19 @@ - 22..33.. CCoommppiilliinngg aanndd iinnssttaalllliinngg uummll__uuttiilliittiieess + 2.3. Compiling and installing uml_utilities Many features of the UML kernel require a user-space helper program, so a uml_utilities package is distributed separately from the kernel patch which provides these helpers. Included within this is: - +o port-helper - Used by consoles which connect to xterms or ports + o port-helper - Used by consoles which connect to xterms or ports - +o tunctl - Configuration tool to create and delete tap devices + o tunctl - Configuration tool to create and delete tap devices - +o uml_net - Setuid binary for automatic tap device configuration + o uml_net - Setuid binary for automatic tap device configuration - +o uml_switch - User-space virtual switch required for daemon + o uml_switch - User-space virtual switch required for daemon transport The uml_utilities tree is compiled with: @@ -423,11 +423,11 @@ - 33.. RRuunnnniinngg UUMMLL aanndd llooggggiinngg iinn + 3. Running UML and logging in - 33..11.. RRuunnnniinngg UUMMLL + 3.1. Running UML It runs on 2.2.15 or later, and all 2.4 kernels. @@ -454,7 +454,7 @@ - 33..22.. LLooggggiinngg iinn + 3.2. Logging in @@ -468,7 +468,7 @@ There are a couple of other ways to log in: - +o On a virtual console + o On a virtual console @@ -480,7 +480,7 @@ - +o Over the serial line + o Over the serial line In the boot output, find a line that looks like: @@ -503,7 +503,7 @@ - +o Over the net + o Over the net If the network is running, then you can telnet to the virtual @@ -514,13 +514,13 @@ down and the process will exit. - 33..33.. EExxaammpplleess + 3.3. Examples Here are some examples of UML in action: - +o A login session <http://user-mode-linux.sourceforge.net/login.html> + o A login session <http://user-mode-linux.sourceforge.net/login.html> - +o A virtual network <http://user-mode-linux.sourceforge.net/net.html> + o A virtual network <http://user-mode-linux.sourceforge.net/net.html> @@ -528,12 +528,12 @@ - 44.. UUMMLL oonn 22GG//22GG hhoossttss + 4. UML on 2G/2G hosts - 44..11.. IInnttrroodduuccttiioonn + 4.1. Introduction Most Linux machines are configured so that the kernel occupies the @@ -546,7 +546,7 @@ - 44..22.. TThhee pprroobblleemm + 4.2. The problem The prebuilt UML binaries on this site will not run on 2G/2G hosts @@ -558,7 +558,7 @@ - 44..33.. TThhee ssoolluuttiioonn + 4.3. The solution The fix for this is to rebuild UML from source after enabling @@ -576,7 +576,7 @@ - 55.. SSeettttiinngg uupp sseerriiaall lliinneess aanndd ccoonnssoolleess + 5. Setting up serial lines and consoles It is possible to attach UML serial lines and consoles to many types @@ -586,12 +586,12 @@ You can attach them to host ptys, ttys, file descriptors, and ports. This allows you to do things like - +o have a UML console appear on an unused host console, + o have a UML console appear on an unused host console, - +o hook two virtual machines together by having one attach to a pty + o hook two virtual machines together by having one attach to a pty and having the other attach to the corresponding tty - +o make a virtual machine accessible from the net by attaching a + o make a virtual machine accessible from the net by attaching a console to a port on the host. @@ -599,7 +599,7 @@ - 55..11.. SSppeecciiffyyiinngg tthhee ddeevviiccee + 5.1. Specifying the device Devices are specified with "con" or "ssl" (console or serial line, respectively), optionally with a device number if you are talking @@ -626,13 +626,13 @@ - 55..22.. SSppeecciiffyyiinngg tthhee cchhaannnneell + 5.2. Specifying the channel There are a number of different types of channels to attach a UML device to, each with a different way of specifying exactly what to attach to. - +o pseudo-terminals - device=pty pts terminals - device=pts + o pseudo-terminals - device=pty pts terminals - device=pts This will cause UML to allocate a free host pseudo-terminal for the @@ -640,20 +640,20 @@ log. You access it by attaching a terminal program to the corresponding tty: - +o screen /dev/pts/n + o screen /dev/pts/n - +o screen /dev/ttyxx + o screen /dev/ttyxx - +o minicom -o -p /dev/ttyxx - minicom seems not able to handle pts + o minicom -o -p /dev/ttyxx - minicom seems not able to handle pts devices - +o kermit - start it up, 'open' the device, then 'connect' + o kermit - start it up, 'open' the device, then 'connect' - +o terminals - device=tty:tty device file + o terminals - device=tty:tty device file This will make UML attach the device to the specified tty (i.e @@ -672,7 +672,7 @@ - +o xterms - device=xterm + o xterms - device=xterm UML will run an xterm and the device will be attached to it. @@ -681,7 +681,7 @@ - +o Port - device=port:port number + o Port - device=port:port number This will attach the UML devices to the specified host port. @@ -725,7 +725,7 @@ - +o already-existing file descriptors - device=file descriptor + o already-existing file descriptors - device=file descriptor If you set up a file descriptor on the UML command line, you can @@ -743,7 +743,7 @@ - +o Nothing - device=null + o Nothing - device=null This allows the device to be opened, in contrast to 'none', but @@ -754,7 +754,7 @@ - +o None - device=none + o None - device=none This causes the device to disappear. @@ -770,7 +770,7 @@ - will cause serial line 3 to accept input on the host's /dev/tty3 and + will cause serial line 3 to accept input on the host's /dev/tty2 and display output on an xterm. That's a silly example - the most common use of this syntax is to reattach the main console to stdin and stdout as shown above. @@ -785,7 +785,7 @@ - 55..33.. EExxaammpplleess + 5.3. Examples There are a number of interesting things you can do with this capability. @@ -838,7 +838,7 @@ prompt of the other virtual machine. - 66.. SSeettttiinngg uupp tthhee nneettwwoorrkk + 6. Setting up the network @@ -858,19 +858,19 @@ There are currently five transport types available for a UML virtual machine to exchange packets with other hosts: - +o ethertap + o ethertap - +o TUN/TAP + o TUN/TAP - +o Multicast + o Multicast - +o a switch daemon + o a switch daemon - +o slip + o slip - +o slirp + o slirp - +o pcap + o pcap The TUN/TAP, ethertap, slip, and slirp transports allow a UML instance to exchange packets with the host. They may be directed @@ -893,28 +893,28 @@ With so many host transports, which one should you use? Here's when you should use each one: - +o ethertap - if you want access to the host networking and it is + o ethertap - if you want access to the host networking and it is running 2.2 - +o TUN/TAP - if you want access to the host networking and it is + o TUN/TAP - if you want access to the host networking and it is running 2.4. Also, the TUN/TAP transport is able to use a preconfigured device, allowing it to avoid using the setuid uml_net helper, which is a security advantage. - +o Multicast - if you want a purely virtual network and you don't want + o Multicast - if you want a purely virtual network and you don't want to set up anything but the UML - +o a switch daemon - if you want a purely virtual network and you + o a switch daemon - if you want a purely virtual network and you don't mind running the daemon in order to get somewhat better performance - +o slip - there is no particular reason to run the slip backend unless + o slip - there is no particular reason to run the slip backend unless ethertap and TUN/TAP are just not available for some reason - +o slirp - if you don't have root access on the host to setup + o slirp - if you don't have root access on the host to setup networking, or if you don't want to allocate an IP to your UML - +o pcap - not much use for actual network connectivity, but great for + o pcap - not much use for actual network connectivity, but great for monitoring traffic on the host Ethertap is available on 2.4 and works fine. TUN/TAP is preferred @@ -926,7 +926,7 @@ exploit the helper's root privileges. - 66..11.. GGeenneerraall sseettuupp + 6.1. General setup First, you must have the virtual network enabled in your UML. If are running a prebuilt kernel from this site, everything is already @@ -995,7 +995,7 @@ - 66..22.. UUsseerrssppaaccee ddaaeemmoonnss + 6.2. Userspace daemons You will likely need the setuid helper, or the switch daemon, or both. They are both installed with the RPM and deb, so if you've installed @@ -1011,7 +1011,7 @@ - 66..33.. SSppeecciiffyyiinngg eetthheerrnneett aaddddrreesssseess + 6.3. Specifying ethernet addresses Below, you will see that the TUN/TAP, ethertap, and daemon interfaces allow you to specify hardware addresses for the virtual ethernet @@ -1023,11 +1023,11 @@ sufficient to guarantee a unique hardware address for the device. A couple of exceptions are: - +o Another set of virtual ethernet devices are on the same network and + o Another set of virtual ethernet devices are on the same network and they are assigned hardware addresses using a different scheme which may conflict with the UML IP address-based scheme - +o You aren't going to use the device for IP networking, so you don't + o You aren't going to use the device for IP networking, so you don't assign the device an IP address If you let the driver provide the hardware address, you should make @@ -1049,7 +1049,7 @@ - 66..44.. UUMMLL iinntteerrffaaccee sseettuupp + 6.4. UML interface setup Once the network devices have been described on the command line, you should boot UML and log in. @@ -1131,7 +1131,7 @@ - 66..55.. MMuullttiiccaasstt + 6.5. Multicast The simplest way to set up a virtual network between multiple UMLs is to use the mcast transport. This was written by Harald Welte and is @@ -1194,7 +1194,7 @@ - 66..66.. TTUUNN//TTAAPP wwiitthh tthhee uummll__nneett hheellppeerr + 6.6. TUN/TAP with the uml_net helper TUN/TAP is the preferred mechanism on 2.4 to exchange packets with the host. The TUN/TAP backend has been in UML since 2.4.9-3um. @@ -1247,10 +1247,10 @@ There are a couple potential problems with running the TUN/TAP transport on a 2.4 host kernel - +o TUN/TAP seems not to work on 2.4.3 and earlier. Upgrade the host + o TUN/TAP seems not to work on 2.4.3 and earlier. Upgrade the host kernel or use the ethertap transport. - +o With an upgraded kernel, TUN/TAP may fail with + o With an upgraded kernel, TUN/TAP may fail with File descriptor in bad state @@ -1269,7 +1269,7 @@ - 66..77.. TTUUNN//TTAAPP wwiitthh aa pprreeccoonnffiigguurreedd ttaapp ddeevviiccee + 6.7. TUN/TAP with a preconfigured tap device If you prefer not to have UML use uml_net (which is somewhat insecure), with UML 2.4.17-11, you can set up a TUN/TAP device @@ -1277,7 +1277,7 @@ there is no need for root assistance. Setting up the device is done as follows: - +o Create the device with tunctl (available from the UML utilities + o Create the device with tunctl (available from the UML utilities tarball) @@ -1291,7 +1291,7 @@ where uid is the user id or username that UML will be run as. This will tell you what device was created. - +o Configure the device IP (change IP addresses and device name to + o Configure the device IP (change IP addresses and device name to suit) @@ -1303,7 +1303,7 @@ - +o Set up routing and arping if desired - this is my recipe, there are + o Set up routing and arping if desired - this is my recipe, there are other ways of doing the same thing @@ -1338,7 +1338,7 @@ utility which reads the information from a config file and sets up devices at boot time. - +o Rather than using up two IPs and ARPing for one of them, you can + o Rather than using up two IPs and ARPing for one of them, you can also provide direct access to your LAN by the UML by using a bridge. @@ -1417,7 +1417,7 @@ Note that 'br0' should be setup using ifconfig with the existing IP address of eth0, as eth0 no longer has its own IP. - +o + o Also, the /dev/net/tun device must be writable by the user running @@ -1438,11 +1438,11 @@ devices and chgrp /dev/net/tun to that group with mode 664 or 660. - +o Once the device is set up, run UML with 'eth0=tuntap,device name' + o Once the device is set up, run UML with 'eth0=tuntap,device name' (i.e. 'eth0=tuntap,tap0') on the command line (or do it with the mconsole config command). - +o Bring the eth device up in UML and you're in business. + o Bring the eth device up in UML and you're in business. If you don't want that tap device any more, you can make it non- persistent with @@ -1465,7 +1465,7 @@ - 66..88.. EEtthheerrttaapp + 6.8. Ethertap Ethertap is the general mechanism on 2.2 for userspace processes to exchange packets with the kernel. @@ -1561,9 +1561,9 @@ - 66..99.. TThhee sswwiittcchh ddaaeemmoonn + 6.9. The switch daemon - NNoottee: This is the daemon formerly known as uml_router, but which was + Note: This is the daemon formerly known as uml_router, but which was renamed so the network weenies of the world would stop growling at me. @@ -1649,7 +1649,7 @@ - 66..1100.. SSlliipp + 6.10. Slip Slip is another, less general, mechanism for a process to communicate with the host networking. In contrast to the ethertap interface, @@ -1681,7 +1681,7 @@ - 66..1111.. SSlliirrpp + 6.11. Slirp slirp uses an external program, usually /usr/bin/slirp, to provide IP only networking connectivity through the host. This is similar to IP @@ -1737,7 +1737,7 @@ - 66..1122.. ppccaapp + 6.12. pcap The pcap transport is attached to a UML ethernet device on the command line or with uml_mconsole with the following syntax: @@ -1777,7 +1777,7 @@ - 66..1133.. SSeettttiinngg uupp tthhee hhoosstt yyoouurrsseellff + 6.13. Setting up the host yourself If you don't specify an address for the host side of the ethertap or slip device, UML won't do any setup on the host. So this is what is @@ -1785,7 +1785,7 @@ 192.168.0.251 and a UML-side IP of 192.168.0.250 - adjust to suit your own network): - +o The device needs to be configured with its IP address. Tap devices + o The device needs to be configured with its IP address. Tap devices are also configured with an mtu of 1484. Slip devices are configured with a point-to-point address pointing at the UML ip address. @@ -1805,7 +1805,7 @@ - +o If a tap device is being set up, a route is set to the UML IP. + o If a tap device is being set up, a route is set to the UML IP. UML# route add -host 192.168.0.250 gw 192.168.0.251 @@ -1814,7 +1814,7 @@ - +o To allow other hosts on your network to see the virtual machine, + o To allow other hosts on your network to see the virtual machine, proxy arp is set up for it. @@ -1824,7 +1824,7 @@ - +o Finally, the host is set up to route packets. + o Finally, the host is set up to route packets. host# echo 1 > /proc/sys/net/ipv4/ip_forward @@ -1838,12 +1838,12 @@ - 77.. SShhaarriinngg FFiilleessyysstteemmss bbeettwweeeenn VViirrttuuaall MMaacchhiinneess + 7. Sharing Filesystems between Virtual Machines - 77..11.. AA wwaarrnniinngg + 7.1. A warning Don't attempt to share filesystems simply by booting two UMLs from the same file. That's the same thing as booting two physical machines @@ -1851,7 +1851,7 @@ - 77..22.. UUssiinngg llaayyeerreedd bblloocckk ddeevviicceess + 7.2. Using layered block devices The way to share a filesystem between two virtual machines is to use the copy-on-write (COW) layering capability of the ubd block driver. @@ -1896,7 +1896,7 @@ - 77..33.. NNoottee!! + 7.3. Note! When checking the size of the COW file in order to see the gobs of space that you're saving, make sure you use 'ls -ls' to see the actual @@ -1926,7 +1926,7 @@ - 77..44.. AAnnootthheerr wwaarrnniinngg + 7.4. Another warning Once a filesystem is being used as a readonly backing file for a COW file, do not boot directly from it or modify it in any way. Doing so @@ -1952,7 +1952,7 @@ - 77..55.. uummll__mmoooo :: MMeerrggiinngg aa CCOOWW ffiillee wwiitthh iittss bbaacckkiinngg ffiillee + 7.5. uml_moo : Merging a COW file with its backing file Depending on how you use UML and COW devices, it may be advisable to merge the changes in the COW file into the backing file every once in @@ -2001,7 +2001,7 @@ - 88.. CCrreeaattiinngg ffiilleessyysstteemmss + 8. Creating filesystems You may want to create and mount new UML filesystems, either because @@ -2015,7 +2015,7 @@ should be easy to translate to the filesystem of your choice. - 88..11.. CCrreeaattee tthhee ffiilleessyysstteemm ffiillee + 8.1. Create the filesystem file dd is your friend. All you need to do is tell dd to create an empty file of the appropriate size. I usually make it sparse to save time @@ -2032,7 +2032,7 @@ - 88..22.. AAssssiiggnn tthhee ffiillee ttoo aa UUMMLL ddeevviiccee + 8.2. Assign the file to a UML device Add an argument like the following to the UML command line: @@ -2045,7 +2045,7 @@ - 88..33.. CCrreeaattiinngg aanndd mmoouunnttiinngg tthhee ffiilleessyysstteemm + 8.3. Creating and mounting the filesystem Make sure that the filesystem is available, either by being built into the kernel, or available as a module, then boot up UML and log in. If @@ -2096,7 +2096,7 @@ - 99.. HHoosstt ffiillee aacccceessss + 9. Host file access If you want to access files on the host machine from inside UML, you @@ -2112,7 +2112,7 @@ files contained in it just as you would on the host. - 99..11.. UUssiinngg hhoossttffss + 9.1. Using hostfs To begin with, make sure that hostfs is available inside the virtual machine with @@ -2151,7 +2151,7 @@ - 99..22.. hhoossttffss aass tthhee rroooott ffiilleessyysstteemm + 9.2. hostfs as the root filesystem It's possible to boot from a directory hierarchy on the host using hostfs rather than using the standard filesystem in a file. @@ -2194,20 +2194,20 @@ UML should then boot as it does normally. - 99..33.. BBuuiillddiinngg hhoossttffss + 9.3. Building hostfs If you need to build hostfs because it's not in your kernel, you have two choices: - +o Compiling hostfs into the kernel: + o Compiling hostfs into the kernel: Reconfigure the kernel and set the 'Host filesystem' option under - +o Compiling hostfs as a module: + o Compiling hostfs as a module: Reconfigure the kernel and set the 'Host filesystem' option under @@ -2228,7 +2228,7 @@ - 1100.. TThhee MMaannaaggeemmeenntt CCoonnssoollee + 10. The Management Console @@ -2240,15 +2240,15 @@ There are a number of things you can do with the mconsole interface: - +o get the kernel version + o get the kernel version - +o add and remove devices + o add and remove devices - +o halt or reboot the machine + o halt or reboot the machine - +o Send SysRq commands + o Send SysRq commands - +o Pause and resume the UML + o Pause and resume the UML You need the mconsole client (uml_mconsole) which is present in CVS @@ -2300,28 +2300,28 @@ You'll get a prompt, at which you can run one of these commands: - +o version + o version - +o halt + o halt - +o reboot + o reboot - +o config + o config - +o remove + o remove - +o sysrq + o sysrq - +o help + o help - +o cad + o cad - +o stop + o stop - +o go + o go - 1100..11.. vveerrssiioonn + 10.1. version This takes no arguments. It prints the UML version. @@ -2342,7 +2342,7 @@ - 1100..22.. hhaalltt aanndd rreebboooott + 10.2. halt and reboot These take no arguments. They shut the machine down immediately, with no syncing of disks and no clean shutdown of userspace. So, they are @@ -2357,7 +2357,7 @@ - 1100..33.. ccoonnffiigg + 10.3. config "config" adds a new device to the virtual machine. Currently the ubd and network drivers support this. It takes one argument, which is the @@ -2378,7 +2378,7 @@ - 1100..44.. rreemmoovvee + 10.4. remove "remove" deletes a device from the system. Its argument is just the name of the device to be removed. The device must be idle in whatever @@ -2397,7 +2397,7 @@ - 1100..55.. ssyyssrrqq + 10.5. sysrq This takes one argument, which is a single letter. It calls the generic kernel's SysRq driver, which does whatever is called for by @@ -2407,14 +2407,14 @@ - 1100..66.. hheellpp + 10.6. help "help" returns a string listing the valid commands and what each one does. - 1100..77.. ccaadd + 10.7. cad This invokes the Ctl-Alt-Del action on init. What exactly this ends up doing is up to /etc/inittab. Normally, it reboots the machine. @@ -2432,7 +2432,7 @@ - 1100..88.. ssttoopp + 10.8. stop This puts the UML in a loop reading mconsole requests until a 'go' mconsole command is received. This is very useful for making backups @@ -2448,7 +2448,7 @@ - 1100..99.. ggoo + 10.9. go This resumes a UML after being paused by a 'stop' command. Note that when the UML has resumed, TCP connections may have timed out and if @@ -2462,10 +2462,10 @@ - 1111.. KKeerrnneell ddeebbuuggggiinngg + 11. Kernel debugging - NNoottee:: The interface that makes debugging, as described here, possible + Note: The interface that makes debugging, as described here, possible is present in 2.4.0-test6 kernels and later. @@ -2485,7 +2485,7 @@ - 1111..11.. SSttaarrttiinngg tthhee kkeerrnneell uunnddeerr ggddbb + 11.1. Starting the kernel under gdb You can have the kernel running under the control of gdb from the beginning by putting 'debug' on the command line. You will get an @@ -2498,7 +2498,7 @@ There is a transcript of a debugging session here <debug- session.html> , with breakpoints being set in the scheduler and in an interrupt handler. - 1111..22.. EExxaammiinniinngg sslleeeeppiinngg pprroocceesssseess + 11.2. Examining sleeping processes Not every bug is evident in the currently running process. Sometimes, processes hang in the kernel when they shouldn't because they've @@ -2516,7 +2516,7 @@ Now what you do is this: - +o detach from the current thread + o detach from the current thread (UML gdb) det @@ -2525,7 +2525,7 @@ - +o attach to the thread you are interested in + o attach to the thread you are interested in (UML gdb) att <host pid> @@ -2534,7 +2534,7 @@ - +o look at its stack and anything else of interest + o look at its stack and anything else of interest (UML gdb) bt @@ -2545,7 +2545,7 @@ Note that you can't do anything at this point that requires that a process execute, e.g. calling a function - +o when you're done looking at that process, reattach to the current + o when you're done looking at that process, reattach to the current thread and continue it @@ -2569,12 +2569,12 @@ - 1111..33.. RRuunnnniinngg dddddd oonn UUMMLL + 11.3. Running ddd on UML ddd works on UML, but requires a special kludge. The process goes like this: - +o Start ddd + o Start ddd host% ddd linux @@ -2583,14 +2583,14 @@ - +o With ps, get the pid of the gdb that ddd started. You can ask the + o With ps, get the pid of the gdb that ddd started. You can ask the gdb to tell you, but for some reason that confuses things and causes a hang. - +o run UML with 'debug=parent gdb-pid=<pid>' added to the command line + o run UML with 'debug=parent gdb-pid=<pid>' added to the command line - it will just sit there after you hit return - +o type 'att 1' to the ddd gdb and you will see something like + o type 'att 1' to the ddd gdb and you will see something like 0xa013dc51 in __kill () @@ -2602,12 +2602,12 @@ - +o At this point, type 'c', UML will boot up, and you can use ddd just + o At this point, type 'c', UML will boot up, and you can use ddd just as you do on any other process. - 1111..44.. DDeebbuuggggiinngg mmoodduulleess + 11.4. Debugging modules gdb has support for debugging code which is dynamically loaded into the process. This support is what is needed to debug kernel modules @@ -2823,7 +2823,7 @@ - 1111..55.. AAttttaacchhiinngg ggddbb ttoo tthhee kkeerrnneell + 11.5. Attaching gdb to the kernel If you don't have the kernel running under gdb, you can attach gdb to it later by sending the tracing thread a SIGUSR1. The first line of @@ -2857,7 +2857,7 @@ - 1111..66.. UUssiinngg aalltteerrnnaattee ddeebbuuggggeerrss + 11.6. Using alternate debuggers UML has support for attaching to an already running debugger rather than starting gdb itself. This is present in CVS as of 17 Apr 2001. @@ -2886,7 +2886,7 @@ An example of an alternate debugger is strace. You can strace the actual kernel as follows: - +o Run the following in a shell + o Run the following in a shell host% @@ -2894,10 +2894,10 @@ - +o Run UML with 'debug' and 'gdb-pid=<pid>' with the pid printed out + o Run UML with 'debug' and 'gdb-pid=<pid>' with the pid printed out by the previous command - +o Hit return in the shell, and UML will start running, and strace + o Hit return in the shell, and UML will start running, and strace output will start accumulating in the output file. Note that this is different from running @@ -2917,9 +2917,9 @@ - 1122.. KKeerrnneell ddeebbuuggggiinngg eexxaammpplleess + 12. Kernel debugging examples - 1122..11.. TThhee ccaassee ooff tthhee hhuunngg ffsscckk + 12.1. The case of the hung fsck When booting up the kernel, fsck failed, and dropped me into a shell to fix things up. I ran fsck -y, which hung: @@ -3154,9 +3154,9 @@ The interesting things here are : - +o There are two segfaults on this stack (frames 9 and 14) + o There are two segfaults on this stack (frames 9 and 14) - +o The first faulting address (frame 11) is 0x50000800 + o The first faulting address (frame 11) is 0x50000800 (gdb) p (void *)1342179328 $16 = (void *) 0x50000800 @@ -3399,7 +3399,7 @@ on will be somewhat clearer. - 1122..22.. EEppiissooddee 22:: TThhee ccaassee ooff tthhee hhuunngg ffsscckk + 12.2. Episode 2: The case of the hung fsck After setting a trap in the SEGV handler for accesses to the signal thread's stack, I reran the kernel. @@ -3788,12 +3788,12 @@ - 1133.. WWhhaatt ttoo ddoo wwhheenn UUMMLL ddooeessnn''tt wwoorrkk + 13. What to do when UML doesn't work - 1133..11.. SSttrraannggee ccoommppiillaattiioonn eerrrroorrss wwhheenn yyoouu bbuuiilldd ffrroomm ssoouurrccee + 13.1. Strange compilation errors when you build from source As of test11, it is necessary to have "ARCH=um" in the environment or on the make command line for all steps in building UML, including @@ -3824,8 +3824,8 @@ - 1133..33.. AA vvaarriieettyy ooff ppaanniiccss aanndd hhaannggss wwiitthh //ttmmpp oonn aa rreeiisseerrffss ffiilleessyyss-- - tteemm + 13.3. A variety of panics and hangs with /tmp on a reiserfs filesys- + tem I saw this on reiserfs 3.5.21 and it seems to be fixed in 3.5.27. Panics preceded by @@ -3842,8 +3842,8 @@ - 1133..44.. TThhee ccoommppiillee ffaaiillss wwiitthh eerrrroorrss aabboouutt ccoonnfflliiccttiinngg ttyyppeess ffoorr - ''ooppeenn'',, ''dduupp'',, aanndd ''wwaaiittppiidd'' + 13.4. The compile fails with errors about conflicting types for + 'open', 'dup', and 'waitpid' This happens when you build in /usr/src/linux. The UML build makes the include/asm link point to include/asm-um. /usr/include/asm points @@ -3854,14 +3854,14 @@ - 1133..55.. UUMMLL ddooeessnn''tt wwoorrkk wwhheenn //ttmmpp iiss aann NNFFSS ffiilleessyysstteemm + 13.5. UML doesn't work when /tmp is an NFS filesystem This seems to be a similar situation with the ReiserFS problem above. Some versions of NFS seems not to handle mmap correctly, which UML depends on. The workaround is have /tmp be a non-NFS directory. - 1133..66.. UUMMLL hhaannggss oonn bboooott wwhheenn ccoommppiilleedd wwiitthh ggpprrooff ssuuppppoorrtt + 13.6. UML hangs on boot when compiled with gprof support If you build UML with gprof support and, early in the boot, it does this @@ -3878,7 +3878,7 @@ - 1133..77.. ssyyssllooggdd ddiieess wwiitthh aa SSIIGGTTEERRMM oonn ssttaarrttuupp + 13.7. syslogd dies with a SIGTERM on startup The exact boot error depends on the distribution that you're booting, but Debian produces this: @@ -3897,17 +3897,17 @@ - 1133..88.. TTUUNN//TTAAPP nneettwwoorrkkiinngg ddooeessnn''tt wwoorrkk oonn aa 22..44 hhoosstt + 13.8. TUN/TAP networking doesn't work on a 2.4 host There are a couple of problems which were <http://www.geocrawler.com/lists/3/SourceForge/597/0/> name="pointed out"> by Tim Robinson <timro at trkr dot net> - +o It doesn't work on hosts running 2.4.7 (or thereabouts) or earlier. + o It doesn't work on hosts running 2.4.7 (or thereabouts) or earlier. The fix is to upgrade to something more recent and then read the next item. - +o If you see + o If you see File descriptor in bad state @@ -3921,8 +3921,8 @@ - 1133..99.. YYoouu ccaann nneettwwoorrkk ttoo tthhee hhoosstt bbuutt nnoott ttoo ootthheerr mmaacchhiinneess oonn tthhee - nneett + 13.9. You can network to the host but not to other machines on the + net If you can connect to the host, and the host can connect to UML, but you cannot connect to any other machines, then you may need to enable @@ -3972,7 +3972,7 @@ - 1133..1100.. II hhaavvee nnoo rroooott aanndd II wwaanntt ttoo ssccrreeaamm + 13.10. I have no root and I want to scream Thanks to Birgit Wahlich for telling me about this strange one. It turns out that there's a limit of six environment variables on the @@ -3987,7 +3987,7 @@ - 1133..1111.. UUMMLL bbuuiilldd ccoonnfflliicctt bbeettwweeeenn ppttrraaccee..hh aanndd uuccoonntteexxtt..hh + 13.11. UML build conflict between ptrace.h and ucontext.h On some older systems, /usr/include/asm/ptrace.h and /usr/include/sys/ucontext.h define the same names. So, when they're @@ -4007,7 +4007,7 @@ - 1133..1122.. TThhee UUMMLL BBooggooMMiippss iiss eexxaaccttllyy hhaallff tthhee hhoosstt''ss BBooggooMMiippss + 13.12. The UML BogoMips is exactly half the host's BogoMips On i386 kernels, there are two ways of running the loop that is used to calculate the BogoMips rating, using the TSC if it's there or using @@ -4019,7 +4019,7 @@ - 1133..1133.. WWhheenn yyoouu rruunn UUMMLL,, iitt iimmmmeeddiiaatteellyy sseeggffaauullttss + 13.13. When you run UML, it immediately segfaults If the host is configured with the 2G/2G address space split, that's why. See ``UML on 2G/2G hosts'' for the details on getting UML to @@ -4027,7 +4027,7 @@ - 1133..1144.. xxtteerrmmss aappppeeaarr,, tthheenn iimmmmeeddiiaatteellyy ddiissaappppeeaarr + 13.14. xterms appear, then immediately disappear If you're running an up to date kernel with an old release of uml_utilities, the port-helper program will not work properly, so @@ -4039,7 +4039,7 @@ - 1133..1155.. AAnnyy ootthheerr ppaanniicc,, hhaanngg,, oorr ssttrraannggee bbeehhaavviioorr + 13.15. Any other panic, hang, or strange behavior If you're seeing truly strange behavior, such as hangs or panics that happen in random places, or you try running the debugger to see what's @@ -4059,7 +4059,7 @@ If you want to be super-helpful, read ``Diagnosing Problems'' and follow the instructions contained therein. - 1144.. DDiiaaggnnoossiinngg PPrroobblleemmss + 14. Diagnosing Problems If you get UML to crash, hang, or otherwise misbehave, you should @@ -4078,7 +4078,7 @@ ``Kernel debugging'' UML first. - 1144..11.. CCaassee 11 :: NNoorrmmaall kkeerrnneell ppaanniiccss + 14.1. Case 1 : Normal kernel panics The most common case is for a normal thread to panic. To debug this, you will need to run it under the debugger (add 'debug' to the command @@ -4128,7 +4128,7 @@ to get that information from the faulting ip. - 1144..22.. CCaassee 22 :: TTrraacciinngg tthhrreeaadd ppaanniiccss + 14.2. Case 2 : Tracing thread panics The less common and more painful case is when the tracing thread panics. In this case, the kernel debugger will be useless because it @@ -4161,7 +4161,7 @@ backtrace in and wait for our crack debugging team to fix the problem. - 1144..33.. CCaassee 33 :: TTrraacciinngg tthhrreeaadd ppaanniiccss ccaauusseedd bbyy ootthheerr tthhrreeaaddss + 14.3. Case 3 : Tracing thread panics caused by other threads However, there are cases where the misbehavior of another thread caused the problem. The most common panic of this type is: @@ -4227,7 +4227,7 @@ - 1144..44.. CCaassee 44 :: HHaannggss + 14.4. Case 4 : Hangs Hangs seem to be fairly rare, but they sometimes happen. When a hang happens, we need a backtrace from the offending process. Run the @@ -4257,7 +4257,7 @@ - 1155.. TThhaannkkss + 15. Thanks A number of people have helped this project in various ways, and this @@ -4274,20 +4274,20 @@ bookkeeping lapses and I forget about contributions. - 1155..11.. CCooddee aanndd DDooccuummeennttaattiioonn + 15.1. Code and Documentation Rusty Russell <rusty at linuxcare.com.au> - - +o wrote the HOWTO <http://user-mode- + o wrote the HOWTO <http://user-mode- linux.sourceforge.net/UserModeLinux-HOWTO.html> - +o prodded me into making this project official and putting it on + o prodded me into making this project official and putting it on SourceForge - +o came up with the way cool UML logo <http://user-mode- + o came up with the way cool UML logo <http://user-mode- linux.sourceforge.net/uml-small.png> - +o redid the config process + o redid the config process Peter Moulder <reiter at netspace.net.au> - Fixed my config and build @@ -4296,18 +4296,18 @@ Bill Stearns <wstearns at pobox.com> - - +o HOWTO updates + o HOWTO updates - +o lots of bug reports + o lots of bug reports - +o lots of testing + o lots of testing - +o dedicated a box (uml.ists.dartmouth.edu) to support UML development + o dedicated a box (uml.ists.dartmouth.edu) to support UML development - +o wrote the mkrootfs script, which allows bootable filesystems of + o wrote the mkrootfs script, which allows bootable filesystems of RPM-based distributions to be cranked out - +o cranked out a large number of filesystems with said script + o cranked out a large number of filesystems with said script Jim Leu <jleu at mindspring.com> - Wrote the virtual ethernet driver @@ -4375,176 +4375,176 @@ David Coulson <http://davidcoulson.net> - - +o Set up the usermodelinux.org <http://usermodelinux.org> site, + o Set up the usermodelinux.org <http://usermodelinux.org> site, which is a great way of keeping the UML user community on top of UML goings-on. - +o Site documentation and updates + o Site documentation and updates - +o Nifty little UML management daemon UMLd + o Nifty little UML management daemon UMLd <http://uml.openconsultancy.com/umld/> - +o Lots of testing and bug reports + o Lots of testing and bug reports - 1155..22.. FFlluusshhiinngg oouutt bbuuggss + 15.2. Flushing out bugs - +o Yuri Pudgorodsky + o Yuri Pudgorodsky - +o Gerald Britton + o Gerald Britton - +o Ian Wehrman + o Ian Wehrman - +o Gord Lamb + o Gord Lamb - +o Eugene Koontz + o Eugene Koontz - +o John H. Hartman + o John H. Hartman - +o Anders Karlsson + o Anders Karlsson - +o Daniel Phillips + o Daniel Phillips - +o John Fremlin + o John Fremlin - +o Rainer Burgstaller + o Rainer Burgstaller - +o James Stevenson + o James Stevenson - +o Matt Clay + o Matt Clay - +o Cliff Jefferies + o Cliff Jefferies - +o Geoff Hoff + o Geoff Hoff - +o Lennert Buytenhek + o Lennert Buytenhek - +o Al Viro + o Al Viro - +o Frank Klingenhoefer + o Frank Klingenhoefer - +o Livio Baldini Soares + o Livio Baldini Soares - +o Jon Burgess + o Jon Burgess - +o Petru Paler + o Petru Paler - +o Paul + o Paul - +o Chris Reahard + o Chris Reahard - +o Sverker Nilsson + o Sverker Nilsson - +o Gong Su + o Gong Su - +o johan verrept + o johan verrept - +o Bjorn Eriksson + o Bjorn Eriksson - +o Lorenzo Allegrucci + o Lorenzo Allegrucci - +o Muli Ben-Yehuda + o Muli Ben-Yehuda - +o David Mansfield + o David Mansfield - +o Howard Goff + o Howard Goff - +o Mike Anderson + o Mike Anderson - +o John Byrne + o John Byrne - +o Sapan J. Batia + o Sapan J. Batia - +o Iris Huang + o Iris Huang - +o Jan Hudec + o Jan Hudec - +o Voluspa + o Voluspa - 1155..33.. BBuugglleettss aanndd cclleeaann--uuppss + 15.3. Buglets and clean-ups - +o Dave Zarzycki + o Dave Zarzycki - +o Adam Lazur + o Adam Lazur - +o Boria Feigin + o Boria Feigin - +o Brian J. Murrell + o Brian J. Murrell - +o JS + o JS - +o Roman Zippel + o Roman Zippel - +o Wil Cooley + o Wil Cooley - +o Ayelet Shemesh + o Ayelet Shemesh - +o Will Dyson + o Will Dyson - +o Sverker Nilsson + o Sverker Nilsson - +o dvorak + o dvorak - +o v.naga srinivas + o v.naga srinivas - +o Shlomi Fish + o Shlomi Fish - +o Roger Binns + o Roger Binns - +o johan verrept + o johan verrept - +o MrChuoi + o MrChuoi - +o Peter Cleve + o Peter Cleve - +o Vincent Guffens + o Vincent Guffens - +o Nathan Scott + o Nathan Scott - +o Patrick Caulfield + o Patrick Caulfield - +o jbearce + o jbearce - +o Catalin Marinas + o Catalin Marinas - +o Shane Spencer + o Shane Spencer - +o Zou Min + o Zou Min - +o Ryan Boder + o Ryan Boder - +o Lorenzo Colitti + o Lorenzo Colitti - +o Gwendal Grignou + o Gwendal Grignou - +o Andre' Breiler + o Andre' Breiler - +o Tsutomu Yasuda + o Tsutomu Yasuda - 1155..44.. CCaassee SSttuuddiieess + 15.4. Case Studies - +o Jon Wright + o Jon Wright - +o William McEwan + o William McEwan - +o Michael Richardson + o Michael Richardson - 1155..55.. OOtthheerr ccoonnttrriibbuuttiioonnss + 15.5. Other contributions Bill Carr <Bill.Carr at compaq.com> made the Red Hat mkrootfs script diff --git a/Documentation/watchdog/convert_drivers_to_kernel_api.txt b/Documentation/watchdog/convert_drivers_to_kernel_api.txt new file mode 100644 index 000000000000..ae1e90036d06 --- /dev/null +++ b/Documentation/watchdog/convert_drivers_to_kernel_api.txt @@ -0,0 +1,195 @@ +Converting old watchdog drivers to the watchdog framework +by Wolfram Sang <w.sang@pengutronix.de> +========================================================= + +Before the watchdog framework came into the kernel, every driver had to +implement the API on its own. Now, as the framework factored out the common +components, those drivers can be lightened making it a user of the framework. +This document shall guide you for this task. The necessary steps are described +as well as things to look out for. + + +Remove the file_operations struct +--------------------------------- + +Old drivers define their own file_operations for actions like open(), write(), +etc... These are now handled by the framework and just call the driver when +needed. So, in general, the 'file_operations' struct and assorted functions can +go. Only very few driver-specific details have to be moved to other functions. +Here is a overview of the functions and probably needed actions: + +- open: Everything dealing with resource management (file-open checks, magic + close preparations) can simply go. Device specific stuff needs to go to the + driver specific start-function. Note that for some drivers, the start-function + also serves as the ping-function. If that is the case and you need start/stop + to be balanced (clocks!), you are better off refactoring a separate start-function. + +- close: Same hints as for open apply. + +- write: Can simply go, all defined behaviour is taken care of by the framework, + i.e. ping on write and magic char ('V') handling. + +- ioctl: While the driver is allowed to have extensions to the IOCTL interface, + the most common ones are handled by the framework, supported by some assistance + from the driver: + + WDIOC_GETSUPPORT: + Returns the mandatory watchdog_info struct from the driver + + WDIOC_GETSTATUS: + Needs the status-callback defined, otherwise returns 0 + + WDIOC_GETBOOTSTATUS: + Needs the bootstatus member properly set. Make sure it is 0 if you + don't have further support! + + WDIOC_SETOPTIONS: + No preparations needed + + WDIOC_KEEPALIVE: + If wanted, options in watchdog_info need to have WDIOF_KEEPALIVEPING + set + + WDIOC_SETTIMEOUT: + Options in watchdog_info need to have WDIOF_SETTIMEOUT set + and a set_timeout-callback has to be defined. The core will also + do limit-checking, if min_timeout and max_timeout in the watchdog + device are set. All is optional. + + WDIOC_GETTIMEOUT: + No preparations needed + + Other IOCTLs can be served using the ioctl-callback. Note that this is mainly + intended for porting old drivers; new drivers should not invent private IOCTLs. + Private IOCTLs are processed first. When the callback returns with + -ENOIOCTLCMD, the IOCTLs of the framework will be tried, too. Any other error + is directly given to the user. + +Example conversion: + +-static const struct file_operations s3c2410wdt_fops = { +- .owner = THIS_MODULE, +- .llseek = no_llseek, +- .write = s3c2410wdt_write, +- .unlocked_ioctl = s3c2410wdt_ioctl, +- .open = s3c2410wdt_open, +- .release = s3c2410wdt_release, +-}; + +Check the functions for device-specific stuff and keep it for later +refactoring. The rest can go. + + +Remove the miscdevice +--------------------- + +Since the file_operations are gone now, you can also remove the 'struct +miscdevice'. The framework will create it on watchdog_dev_register() called by +watchdog_register_device(). + +-static struct miscdevice s3c2410wdt_miscdev = { +- .minor = WATCHDOG_MINOR, +- .name = "watchdog", +- .fops = &s3c2410wdt_fops, +-}; + + +Remove obsolete includes and defines +------------------------------------ + +Because of the simplifications, a few defines are probably unused now. Remove +them. Includes can be removed, too. For example: + +- #include <linux/fs.h> +- #include <linux/miscdevice.h> (if MODULE_ALIAS_MISCDEV is not used) +- #include <linux/uaccess.h> (if no custom IOCTLs are used) + + +Add the watchdog operations +--------------------------- + +All possible callbacks are defined in 'struct watchdog_ops'. You can find it +explained in 'watchdog-kernel-api.txt' in this directory. start(), stop() and +owner must be set, the rest are optional. You will easily find corresponding +functions in the old driver. Note that you will now get a pointer to the +watchdog_device as a parameter to these functions, so you probably have to +change the function header. Other changes are most likely not needed, because +here simply happens the direct hardware access. If you have device-specific +code left from the above steps, it should be refactored into these callbacks. + +Here is a simple example: + ++static struct watchdog_ops s3c2410wdt_ops = { ++ .owner = THIS_MODULE, ++ .start = s3c2410wdt_start, ++ .stop = s3c2410wdt_stop, ++ .ping = s3c2410wdt_keepalive, ++ .set_timeout = s3c2410wdt_set_heartbeat, ++}; + +A typical function-header change looks like: + +-static void s3c2410wdt_keepalive(void) ++static int s3c2410wdt_keepalive(struct watchdog_device *wdd) + { +... ++ ++ return 0; + } + +... + +- s3c2410wdt_keepalive(); ++ s3c2410wdt_keepalive(&s3c2410_wdd); + + +Add the watchdog device +----------------------- + +Now we need to create a 'struct watchdog_device' and populate it with the +necessary information for the framework. The struct is also explained in detail +in 'watchdog-kernel-api.txt' in this directory. We pass it the mandatory +watchdog_info struct and the newly created watchdog_ops. Often, old drivers +have their own record-keeping for things like bootstatus and timeout using +static variables. Those have to be converted to use the members in +watchdog_device. Note that the timeout values are unsigned int. Some drivers +use signed int, so this has to be converted, too. + +Here is a simple example for a watchdog device: + ++static struct watchdog_device s3c2410_wdd = { ++ .info = &s3c2410_wdt_ident, ++ .ops = &s3c2410wdt_ops, ++}; + + +Register the watchdog device +---------------------------- + +Replace misc_register(&miscdev) with watchdog_register_device(&watchdog_dev). +Make sure the return value gets checked and the error message, if present, +still fits. Also convert the unregister case. + +- ret = misc_register(&s3c2410wdt_miscdev); ++ ret = watchdog_register_device(&s3c2410_wdd); + +... + +- misc_deregister(&s3c2410wdt_miscdev); ++ watchdog_unregister_device(&s3c2410_wdd); + + +Update the Kconfig-entry +------------------------ + +The entry for the driver now needs to select WATCHDOG_CORE: + ++ select WATCHDOG_CORE + + +Create a patch and send it to upstream +-------------------------------------- + +Make sure you understood Documentation/SubmittingPatches and send your patch to +linux-watchdog@vger.kernel.org. We are looking forward to it :) + |