summaryrefslogtreecommitdiffstats
path: root/Documentation/DocBook
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/DocBook')
-rw-r--r--Documentation/DocBook/.gitignore17
-rw-r--r--Documentation/DocBook/Makefile282
-rw-r--r--Documentation/DocBook/filesystems.tmpl381
-rw-r--r--Documentation/DocBook/kernel-hacking.tmpl1312
-rw-r--r--Documentation/DocBook/kernel-locking.tmpl2151
-rw-r--r--Documentation/DocBook/kgdb.tmpl918
-rw-r--r--Documentation/DocBook/libata.tmpl1625
-rw-r--r--Documentation/DocBook/librs.tmpl289
-rw-r--r--Documentation/DocBook/lsm.tmpl265
-rw-r--r--Documentation/DocBook/mtdnand.tmpl1291
-rw-r--r--Documentation/DocBook/networking.tmpl111
-rw-r--r--Documentation/DocBook/rapidio.tmpl155
-rw-r--r--Documentation/DocBook/s390-drivers.tmpl161
-rw-r--r--Documentation/DocBook/scsi.tmpl409
-rw-r--r--Documentation/DocBook/sh.tmpl105
-rw-r--r--Documentation/DocBook/stylesheet.xsl11
-rw-r--r--Documentation/DocBook/w1.tmpl101
-rw-r--r--Documentation/DocBook/z8530book.tmpl371
18 files changed, 0 insertions, 9955 deletions
diff --git a/Documentation/DocBook/.gitignore b/Documentation/DocBook/.gitignore
deleted file mode 100644
index e05da3f7aa21..000000000000
--- a/Documentation/DocBook/.gitignore
+++ /dev/null
@@ -1,17 +0,0 @@
-*.xml
-*.ps
-*.pdf
-*.html
-*.9.gz
-*.9
-*.aux
-*.dvi
-*.log
-*.out
-*.png
-*.gif
-*.svg
-*.proc
-*.db
-media-indices.tmpl
-media-entities.tmpl
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
deleted file mode 100644
index 85916f13d330..000000000000
--- a/Documentation/DocBook/Makefile
+++ /dev/null
@@ -1,282 +0,0 @@
-###
-# This makefile is used to generate the kernel documentation,
-# primarily based on in-line comments in various source files.
-# See Documentation/kernel-doc-nano-HOWTO.txt for instruction in how
-# to document the SRC - and how to read it.
-# To add a new book the only step required is to add the book to the
-# list of DOCBOOKS.
-
-DOCBOOKS := z8530book.xml \
- kernel-hacking.xml kernel-locking.xml \
- networking.xml \
- filesystems.xml lsm.xml kgdb.xml \
- libata.xml mtdnand.xml librs.xml rapidio.xml \
- s390-drivers.xml scsi.xml \
- sh.xml w1.xml
-
-ifeq ($(DOCBOOKS),)
-
-# Skip DocBook build if the user explicitly requested no DOCBOOKS.
-.DEFAULT:
- @echo " SKIP DocBook $@ target (DOCBOOKS=\"\" specified)."
-else
-ifneq ($(SPHINXDIRS),)
-
-# Skip DocBook build if the user explicitly requested a sphinx dir
-.DEFAULT:
- @echo " SKIP DocBook $@ target (SPHINXDIRS specified)."
-else
-
-
-###
-# The build process is as follows (targets):
-# (xmldocs) [by docproc]
-# file.tmpl --> file.xml +--> file.ps (psdocs) [by db2ps or xmlto]
-# +--> file.pdf (pdfdocs) [by db2pdf or xmlto]
-# +--> DIR=file (htmldocs) [by xmlto]
-# +--> man/ (mandocs) [by xmlto]
-
-
-# for PDF and PS output you can choose between xmlto and docbook-utils tools
-PDF_METHOD = $(prefer-db2x)
-PS_METHOD = $(prefer-db2x)
-
-
-targets += $(DOCBOOKS)
-BOOKS := $(addprefix $(obj)/,$(DOCBOOKS))
-xmldocs: $(BOOKS)
-sgmldocs: xmldocs
-
-PS := $(patsubst %.xml, %.ps, $(BOOKS))
-psdocs: $(PS)
-
-PDF := $(patsubst %.xml, %.pdf, $(BOOKS))
-pdfdocs: $(PDF)
-
-HTML := $(sort $(patsubst %.xml, %.html, $(BOOKS)))
-htmldocs: $(HTML)
- $(call cmd,build_main_index)
-
-MAN := $(patsubst %.xml, %.9, $(BOOKS))
-mandocs: $(MAN)
- find $(obj)/man -name '*.9' | xargs gzip -nf
-
-# Default location for installed man pages
-export INSTALL_MAN_PATH = $(objtree)/usr
-
-installmandocs: mandocs
- mkdir -p $(INSTALL_MAN_PATH)/man/man9/
- find $(obj)/man -name '*.9.gz' -printf '%h %f\n' | \
- sort -k 2 -k 1 | uniq -f 1 | sed -e 's: :/:' | \
- xargs install -m 644 -t $(INSTALL_MAN_PATH)/man/man9/
-
-# no-op for the DocBook toolchain
-epubdocs:
-latexdocs:
-linkcheckdocs:
-
-###
-#External programs used
-KERNELDOCXMLREF = $(srctree)/scripts/kernel-doc-xml-ref
-KERNELDOC = $(srctree)/scripts/kernel-doc
-DOCPROC = $(objtree)/scripts/docproc
-CHECK_LC_CTYPE = $(objtree)/scripts/check-lc_ctype
-
-# Use a fixed encoding - UTF-8 if the C library has support built-in
-# or ASCII if not
-LC_CTYPE := $(call try-run, LC_CTYPE=C.UTF-8 $(CHECK_LC_CTYPE),C.UTF-8,C)
-export LC_CTYPE
-
-XMLTOFLAGS = -m $(srctree)/$(src)/stylesheet.xsl
-XMLTOFLAGS += --skip-validation
-
-###
-# DOCPROC is used for two purposes:
-# 1) To generate a dependency list for a .tmpl file
-# 2) To preprocess a .tmpl file and call kernel-doc with
-# appropriate parameters.
-# The following rules are used to generate the .xml documentation
-# required to generate the final targets. (ps, pdf, html).
-quiet_cmd_docproc = DOCPROC $@
- cmd_docproc = SRCTREE=$(srctree)/ $(DOCPROC) doc $< >$@
-define rule_docproc
- set -e; \
- $(if $($(quiet)cmd_$(1)),echo ' $($(quiet)cmd_$(1))';) \
- $(cmd_$(1)); \
- ( \
- echo 'cmd_$@ := $(cmd_$(1))'; \
- echo $@: `SRCTREE=$(srctree) $(DOCPROC) depend $<`; \
- ) > $(dir $@).$(notdir $@).cmd
-endef
-
-%.xml: %.tmpl $(KERNELDOC) $(DOCPROC) $(KERNELDOCXMLREF) FORCE
- $(call if_changed_rule,docproc)
-
-# Tell kbuild to always build the programs
-always := $(hostprogs-y)
-
-notfoundtemplate = echo "*** You have to install docbook-utils or xmlto ***"; \
- exit 1
-db2xtemplate = db2TYPE -o $(dir $@) $<
-xmltotemplate = xmlto TYPE $(XMLTOFLAGS) -o $(dir $@) $<
-
-# determine which methods are available
-ifeq ($(shell which db2ps >/dev/null 2>&1 && echo found),found)
- use-db2x = db2x
- prefer-db2x = db2x
-else
- use-db2x = notfound
- prefer-db2x = $(use-xmlto)
-endif
-ifeq ($(shell which xmlto >/dev/null 2>&1 && echo found),found)
- use-xmlto = xmlto
- prefer-xmlto = xmlto
-else
- use-xmlto = notfound
- prefer-xmlto = $(use-db2x)
-endif
-
-# the commands, generated from the chosen template
-quiet_cmd_db2ps = PS $@
- cmd_db2ps = $(subst TYPE,ps, $($(PS_METHOD)template))
-%.ps : %.xml
- $(call cmd,db2ps)
-
-quiet_cmd_db2pdf = PDF $@
- cmd_db2pdf = $(subst TYPE,pdf, $($(PDF_METHOD)template))
-%.pdf : %.xml
- $(call cmd,db2pdf)
-
-
-index = index.html
-main_idx = $(obj)/$(index)
-quiet_cmd_build_main_index = HTML $(main_idx)
- cmd_build_main_index = rm -rf $(main_idx); \
- echo '<h1>Linux Kernel HTML Documentation</h1>' >> $(main_idx) && \
- echo '<h2>Kernel Version: $(KERNELVERSION)</h2>' >> $(main_idx) && \
- cat $(HTML) >> $(main_idx)
-
-quiet_cmd_db2html = HTML $@
- cmd_db2html = xmlto html $(XMLTOFLAGS) -o $(patsubst %.html,%,$@) $< && \
- echo '<a HREF="$(patsubst %.html,%,$(notdir $@))/index.html"> \
- $(patsubst %.html,%,$(notdir $@))</a><p>' > $@
-
-###
-# Rules to create an aux XML and .db, and use them to re-process the DocBook XML
-# to fill internal hyperlinks
- gen_aux_xml = :
- quiet_gen_aux_xml = echo ' XMLREF $@'
-silent_gen_aux_xml = :
-%.aux.xml: %.xml
- @$($(quiet)gen_aux_xml)
- @rm -rf $@
- @(cat $< | egrep "^<refentry id" | egrep -o "\".*\"" | cut -f 2 -d \" > $<.db)
- @$(KERNELDOCXMLREF) -db $<.db $< > $@
-.PRECIOUS: %.aux.xml
-
-%.html: %.aux.xml
- @(which xmlto > /dev/null 2>&1) || \
- (echo "*** You need to install xmlto ***"; \
- exit 1)
- @rm -rf $@ $(patsubst %.html,%,$@)
- $(call cmd,db2html)
- @if [ ! -z "$(PNG-$(basename $(notdir $@)))" ]; then \
- cp $(PNG-$(basename $(notdir $@))) $(patsubst %.html,%,$@); fi
-
-quiet_cmd_db2man = MAN $@
- cmd_db2man = if grep -q refentry $<; then xmlto man $(XMLTOFLAGS) -o $(obj)/man/$(*F) $< ; fi
-%.9 : %.xml
- @(which xmlto > /dev/null 2>&1) || \
- (echo "*** You need to install xmlto ***"; \
- exit 1)
- $(Q)mkdir -p $(obj)/man/$(*F)
- $(call cmd,db2man)
- @touch $@
-
-###
-# Rules to generate postscripts and PNG images from .fig format files
-quiet_cmd_fig2eps = FIG2EPS $@
- cmd_fig2eps = fig2dev -Leps $< $@
-
-%.eps: %.fig
- @(which fig2dev > /dev/null 2>&1) || \
- (echo "*** You need to install transfig ***"; \
- exit 1)
- $(call cmd,fig2eps)
-
-quiet_cmd_fig2png = FIG2PNG $@
- cmd_fig2png = fig2dev -Lpng $< $@
-
-%.png: %.fig
- @(which fig2dev > /dev/null 2>&1) || \
- (echo "*** You need to install transfig ***"; \
- exit 1)
- $(call cmd,fig2png)
-
-###
-# Rule to convert a .c file to inline XML documentation
- gen_xml = :
- quiet_gen_xml = echo ' GEN $@'
-silent_gen_xml = :
-%.xml: %.c
- @$($(quiet)gen_xml)
- @( \
- echo "<programlisting>"; \
- expand --tabs=8 < $< | \
- sed -e "s/&/\\&amp;/g" \
- -e "s/</\\&lt;/g" \
- -e "s/>/\\&gt;/g"; \
- echo "</programlisting>") > $@
-
-endif # DOCBOOKS=""
-endif # SPHINDIR=...
-
-###
-# Help targets as used by the top-level makefile
-dochelp:
- @echo ' Linux kernel internal documentation in different formats (DocBook):'
- @echo ' htmldocs - HTML'
- @echo ' pdfdocs - PDF'
- @echo ' psdocs - Postscript'
- @echo ' xmldocs - XML DocBook'
- @echo ' mandocs - man pages'
- @echo ' installmandocs - install man pages generated by mandocs to INSTALL_MAN_PATH'; \
- echo ' (default: $(INSTALL_MAN_PATH))'; \
- echo ''
- @echo ' cleandocs - clean all generated DocBook files'
- @echo
- @echo ' make DOCBOOKS="s1.xml s2.xml" [target] Generate only docs s1.xml s2.xml'
- @echo ' valid values for DOCBOOKS are: $(DOCBOOKS)'
- @echo
- @echo " make DOCBOOKS=\"\" [target] Don't generate docs from Docbook"
- @echo ' This is useful to generate only the ReST docs (Sphinx)'
-
-
-###
-# Temporary files left by various tools
-clean-files := $(DOCBOOKS) \
- $(patsubst %.xml, %.dvi, $(DOCBOOKS)) \
- $(patsubst %.xml, %.aux, $(DOCBOOKS)) \
- $(patsubst %.xml, %.tex, $(DOCBOOKS)) \
- $(patsubst %.xml, %.log, $(DOCBOOKS)) \
- $(patsubst %.xml, %.out, $(DOCBOOKS)) \
- $(patsubst %.xml, %.ps, $(DOCBOOKS)) \
- $(patsubst %.xml, %.pdf, $(DOCBOOKS)) \
- $(patsubst %.xml, %.html, $(DOCBOOKS)) \
- $(patsubst %.xml, %.9, $(DOCBOOKS)) \
- $(patsubst %.xml, %.aux.xml, $(DOCBOOKS)) \
- $(patsubst %.xml, %.xml.db, $(DOCBOOKS)) \
- $(patsubst %.xml, %.xml, $(DOCBOOKS)) \
- $(patsubst %.xml, .%.xml.cmd, $(DOCBOOKS)) \
- $(index)
-
-clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man
-
-cleandocs:
- $(Q)rm -f $(call objectify, $(clean-files))
- $(Q)rm -rf $(call objectify, $(clean-dirs))
-
-# Declare the contents of the .PHONY variable as phony. We keep that
-# information in a variable so we can use it in if_changed and friends.
-
-.PHONY: $(PHONY)
diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl
deleted file mode 100644
index 6006b6358c86..000000000000
--- a/Documentation/DocBook/filesystems.tmpl
+++ /dev/null
@@ -1,381 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
- "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
-
-<book id="Linux-filesystems-API">
- <bookinfo>
- <title>Linux Filesystems API</title>
-
- <legalnotice>
- <para>
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later
- version.
- </para>
-
- <para>
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
- </para>
-
- <para>
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
- </para>
-
- <para>
- For more details see the file COPYING in the source
- distribution of Linux.
- </para>
- </legalnotice>
- </bookinfo>
-
-<toc></toc>
-
- <chapter id="vfs">
- <title>The Linux VFS</title>
- <sect1 id="the_filesystem_types"><title>The Filesystem types</title>
-!Iinclude/linux/fs.h
- </sect1>
- <sect1 id="the_directory_cache"><title>The Directory Cache</title>
-!Efs/dcache.c
-!Iinclude/linux/dcache.h
- </sect1>
- <sect1 id="inode_handling"><title>Inode Handling</title>
-!Efs/inode.c
-!Efs/bad_inode.c
- </sect1>
- <sect1 id="registration_and_superblocks"><title>Registration and Superblocks</title>
-!Efs/super.c
- </sect1>
- <sect1 id="file_locks"><title>File Locks</title>
-!Efs/locks.c
-!Ifs/locks.c
- </sect1>
- <sect1 id="other_functions"><title>Other Functions</title>
-!Efs/mpage.c
-!Efs/namei.c
-!Efs/buffer.c
-!Eblock/bio.c
-!Efs/seq_file.c
-!Efs/filesystems.c
-!Efs/fs-writeback.c
-!Efs/block_dev.c
- </sect1>
- </chapter>
-
- <chapter id="proc">
- <title>The proc filesystem</title>
-
- <sect1 id="sysctl_interface"><title>sysctl interface</title>
-!Ekernel/sysctl.c
- </sect1>
-
- <sect1 id="proc_filesystem_interface"><title>proc filesystem interface</title>
-!Ifs/proc/base.c
- </sect1>
- </chapter>
-
- <chapter id="fs_events">
- <title>Events based on file descriptors</title>
-!Efs/eventfd.c
- </chapter>
-
- <chapter id="sysfs">
- <title>The Filesystem for Exporting Kernel Objects</title>
-!Efs/sysfs/file.c
-!Efs/sysfs/symlink.c
- </chapter>
-
- <chapter id="debugfs">
- <title>The debugfs filesystem</title>
-
- <sect1 id="debugfs_interface"><title>debugfs interface</title>
-!Efs/debugfs/inode.c
-!Efs/debugfs/file.c
- </sect1>
- </chapter>
-
- <chapter id="LinuxJDBAPI">
- <chapterinfo>
- <title>The Linux Journalling API</title>
-
- <authorgroup>
- <author>
- <firstname>Roger</firstname>
- <surname>Gammans</surname>
- <affiliation>
- <address>
- <email>rgammans@computer-surgery.co.uk</email>
- </address>
- </affiliation>
- </author>
- </authorgroup>
-
- <authorgroup>
- <author>
- <firstname>Stephen</firstname>
- <surname>Tweedie</surname>
- <affiliation>
- <address>
- <email>sct@redhat.com</email>
- </address>
- </affiliation>
- </author>
- </authorgroup>
-
- <copyright>
- <year>2002</year>
- <holder>Roger Gammans</holder>
- </copyright>
- </chapterinfo>
-
- <title>The Linux Journalling API</title>
-
- <sect1 id="journaling_overview">
- <title>Overview</title>
- <sect2 id="journaling_details">
- <title>Details</title>
-<para>
-The journalling layer is easy to use. You need to
-first of all create a journal_t data structure. There are
-two calls to do this dependent on how you decide to allocate the physical
-media on which the journal resides. The jbd2_journal_init_inode() call
-is for journals stored in filesystem inodes, or the jbd2_journal_init_dev()
-call can be used for journal stored on a raw device (in a continuous range
-of blocks). A journal_t is a typedef for a struct pointer, so when
-you are finally finished make sure you call jbd2_journal_destroy() on it
-to free up any used kernel memory.
-</para>
-
-<para>
-Once you have got your journal_t object you need to 'mount' or load the journal
-file. The journalling layer expects the space for the journal was already
-allocated and initialized properly by the userspace tools. When loading the
-journal you must call jbd2_journal_load() to process journal contents. If the
-client file system detects the journal contents does not need to be processed
-(or even need not have valid contents), it may call jbd2_journal_wipe() to
-clear the journal contents before calling jbd2_journal_load().
-</para>
-
-<para>
-Note that jbd2_journal_wipe(..,0) calls jbd2_journal_skip_recovery() for you if
-it detects any outstanding transactions in the journal and similarly
-jbd2_journal_load() will call jbd2_journal_recover() if necessary. I would
-advise reading ext4_load_journal() in fs/ext4/super.c for examples on this
-stage.
-</para>
-
-<para>
-Now you can go ahead and start modifying the underlying
-filesystem. Almost.
-</para>
-
-<para>
-
-You still need to actually journal your filesystem changes, this
-is done by wrapping them into transactions. Additionally you
-also need to wrap the modification of each of the buffers
-with calls to the journal layer, so it knows what the modifications
-you are actually making are. To do this use jbd2_journal_start() which
-returns a transaction handle.
-</para>
-
-<para>
-jbd2_journal_start()
-and its counterpart jbd2_journal_stop(), which indicates the end of a
-transaction are nestable calls, so you can reenter a transaction if necessary,
-but remember you must call jbd2_journal_stop() the same number of times as
-jbd2_journal_start() before the transaction is completed (or more accurately
-leaves the update phase). Ext4/VFS makes use of this feature to simplify
-handling of inode dirtying, quota support, etc.
-</para>
-
-<para>
-Inside each transaction you need to wrap the modifications to the
-individual buffers (blocks). Before you start to modify a buffer you
-need to call jbd2_journal_get_{create,write,undo}_access() as appropriate,
-this allows the journalling layer to copy the unmodified data if it
-needs to. After all the buffer may be part of a previously uncommitted
-transaction.
-At this point you are at last ready to modify a buffer, and once
-you are have done so you need to call jbd2_journal_dirty_{meta,}data().
-Or if you've asked for access to a buffer you now know is now longer
-required to be pushed back on the device you can call jbd2_journal_forget()
-in much the same way as you might have used bforget() in the past.
-</para>
-
-<para>
-A jbd2_journal_flush() may be called at any time to commit and checkpoint
-all your transactions.
-</para>
-
-<para>
-Then at umount time , in your put_super() you can then call jbd2_journal_destroy()
-to clean up your in-core journal object.
-</para>
-
-<para>
-Unfortunately there a couple of ways the journal layer can cause a deadlock.
-The first thing to note is that each task can only have
-a single outstanding transaction at any one time, remember nothing
-commits until the outermost jbd2_journal_stop(). This means
-you must complete the transaction at the end of each file/inode/address
-etc. operation you perform, so that the journalling system isn't re-entered
-on another journal. Since transactions can't be nested/batched
-across differing journals, and another filesystem other than
-yours (say ext4) may be modified in a later syscall.
-</para>
-
-<para>
-The second case to bear in mind is that jbd2_journal_start() can
-block if there isn't enough space in the journal for your transaction
-(based on the passed nblocks param) - when it blocks it merely(!) needs to
-wait for transactions to complete and be committed from other tasks,
-so essentially we are waiting for jbd2_journal_stop(). So to avoid
-deadlocks you must treat jbd2_journal_start/stop() as if they
-were semaphores and include them in your semaphore ordering rules to prevent
-deadlocks. Note that jbd2_journal_extend() has similar blocking behaviour to
-jbd2_journal_start() so you can deadlock here just as easily as on
-jbd2_journal_start().
-</para>
-
-<para>
-Try to reserve the right number of blocks the first time. ;-). This will
-be the maximum number of blocks you are going to touch in this transaction.
-I advise having a look at at least ext4_jbd.h to see the basis on which
-ext4 uses to make these decisions.
-</para>
-
-<para>
-Another wriggle to watch out for is your on-disk block allocation strategy.
-Why? Because, if you do a delete, you need to ensure you haven't reused any
-of the freed blocks until the transaction freeing these blocks commits. If you
-reused these blocks and crash happens, there is no way to restore the contents
-of the reallocated blocks at the end of the last fully committed transaction.
-
-One simple way of doing this is to mark blocks as free in internal in-memory
-block allocation structures only after the transaction freeing them commits.
-Ext4 uses journal commit callback for this purpose.
-</para>
-
-<para>
-With journal commit callbacks you can ask the journalling layer to call a
-callback function when the transaction is finally committed to disk, so that
-you can do some of your own management. You ask the journalling layer for
-calling the callback by simply setting journal->j_commit_callback function
-pointer and that function is called after each transaction commit. You can also
-use transaction->t_private_list for attaching entries to a transaction that
-need processing when the transaction commits.
-</para>
-
-<para>
-JBD2 also provides a way to block all transaction updates via
-jbd2_journal_{un,}lock_updates(). Ext4 uses this when it wants a window with a
-clean and stable fs for a moment. E.g.
-</para>
-
-<programlisting>
-
- jbd2_journal_lock_updates() //stop new stuff happening..
- jbd2_journal_flush() // checkpoint everything.
- ..do stuff on stable fs
- jbd2_journal_unlock_updates() // carry on with filesystem use.
-</programlisting>
-
-<para>
-The opportunities for abuse and DOS attacks with this should be obvious,
-if you allow unprivileged userspace to trigger codepaths containing these
-calls.
-</para>
-
- </sect2>
-
- <sect2 id="jbd_summary">
- <title>Summary</title>
-<para>
-Using the journal is a matter of wrapping the different context changes,
-being each mount, each modification (transaction) and each changed buffer
-to tell the journalling layer about them.
-</para>
-
- </sect2>
-
- </sect1>
-
- <sect1 id="data_types">
- <title>Data Types</title>
- <para>
- The journalling layer uses typedefs to 'hide' the concrete definitions
- of the structures used. As a client of the JBD2 layer you can
- just rely on the using the pointer as a magic cookie of some sort.
-
- Obviously the hiding is not enforced as this is 'C'.
- </para>
- <sect2 id="structures"><title>Structures</title>
-!Iinclude/linux/jbd2.h
- </sect2>
- </sect1>
-
- <sect1 id="functions">
- <title>Functions</title>
- <para>
- The functions here are split into two groups those that
- affect a journal as a whole, and those which are used to
- manage transactions
- </para>
- <sect2 id="journal_level"><title>Journal Level</title>
-!Efs/jbd2/journal.c
-!Ifs/jbd2/recovery.c
- </sect2>
- <sect2 id="transaction_level"><title>Transasction Level</title>
-!Efs/jbd2/transaction.c
- </sect2>
- </sect1>
- <sect1 id="see_also">
- <title>See also</title>
- <para>
- <citation>
- <ulink url="http://kernel.org/pub/linux/kernel/people/sct/ext3/journal-design.ps.gz">
- Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen Tweedie
- </ulink>
- </citation>
- </para>
- <para>
- <citation>
- <ulink url="http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html">
- Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen Tweedie
- </ulink>
- </citation>
- </para>
- </sect1>
-
- </chapter>
-
- <chapter id="splice">
- <title>splice API</title>
- <para>
- splice is a method for moving blocks of data around inside the
- kernel, without continually transferring them between the kernel
- and user space.
- </para>
-!Ffs/splice.c
- </chapter>
-
- <chapter id="pipes">
- <title>pipes API</title>
- <para>
- Pipe interfaces are all for in-kernel (builtin image) use.
- They are not exported for use by modules.
- </para>
-!Iinclude/linux/pipe_fs_i.h
-!Ffs/pipe.c
- </chapter>
-
-</book>
diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl
deleted file mode 100644
index da5c087462b1..000000000000
--- a/Documentation/DocBook/kernel-hacking.tmpl
+++ /dev/null
@@ -1,1312 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
- "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
-
-<book id="lk-hacking-guide">
- <bookinfo>
- <title>Unreliable Guide To Hacking The Linux Kernel</title>
-
- <authorgroup>
- <author>
- <firstname>Rusty</firstname>
- <surname>Russell</surname>
- <affiliation>
- <address>
- <email>rusty@rustcorp.com.au</email>
- </address>
- </affiliation>
- </author>
- </authorgroup>
-
- <copyright>
- <year>2005</year>
- <holder>Rusty Russell</holder>
- </copyright>
-
- <legalnotice>
- <para>
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later
- version.
- </para>
-
- <para>
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
- </para>
-
- <para>
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
- </para>
-
- <para>
- For more details see the file COPYING in the source
- distribution of Linux.
- </para>
- </legalnotice>
-
- <releaseinfo>
- This is the first release of this document as part of the kernel tarball.
- </releaseinfo>
-
- </bookinfo>
-
- <toc></toc>
-
- <chapter id="introduction">
- <title>Introduction</title>
- <para>
- Welcome, gentle reader, to Rusty's Remarkably Unreliable Guide to Linux
- Kernel Hacking. This document describes the common routines and
- general requirements for kernel code: its goal is to serve as a
- primer for Linux kernel development for experienced C
- programmers. I avoid implementation details: that's what the
- code is for, and I ignore whole tracts of useful routines.
- </para>
- <para>
- Before you read this, please understand that I never wanted to
- write this document, being grossly under-qualified, but I always
- wanted to read it, and this was the only way. I hope it will
- grow into a compendium of best practice, common starting points
- and random information.
- </para>
- </chapter>
-
- <chapter id="basic-players">
- <title>The Players</title>
-
- <para>
- At any time each of the CPUs in a system can be:
- </para>
-
- <itemizedlist>
- <listitem>
- <para>
- not associated with any process, serving a hardware interrupt;
- </para>
- </listitem>
-
- <listitem>
- <para>
- not associated with any process, serving a softirq or tasklet;
- </para>
- </listitem>
-
- <listitem>
- <para>
- running in kernel space, associated with a process (user context);
- </para>
- </listitem>
-
- <listitem>
- <para>
- running a process in user space.
- </para>
- </listitem>
- </itemizedlist>
-
- <para>
- There is an ordering between these. The bottom two can preempt
- each other, but above that is a strict hierarchy: each can only be
- preempted by the ones above it. For example, while a softirq is
- running on a CPU, no other softirq will preempt it, but a hardware
- interrupt can. However, any other CPUs in the system execute
- independently.
- </para>
-
- <para>
- We'll see a number of ways that the user context can block
- interrupts, to become truly non-preemptable.
- </para>
-
- <sect1 id="basics-usercontext">
- <title>User Context</title>
-
- <para>
- User context is when you are coming in from a system call or other
- trap: like userspace, you can be preempted by more important tasks
- and by interrupts. You can sleep, by calling
- <function>schedule()</function>.
- </para>
-
- <note>
- <para>
- You are always in user context on module load and unload,
- and on operations on the block device layer.
- </para>
- </note>
-
- <para>
- In user context, the <varname>current</varname> pointer (indicating
- the task we are currently executing) is valid, and
- <function>in_interrupt()</function>
- (<filename>include/linux/interrupt.h</filename>) is <returnvalue>false
- </returnvalue>.
- </para>
-
- <caution>
- <para>
- Beware that if you have preemption or softirqs disabled
- (see below), <function>in_interrupt()</function> will return a
- false positive.
- </para>
- </caution>
- </sect1>
-
- <sect1 id="basics-hardirqs">
- <title>Hardware Interrupts (Hard IRQs)</title>
-
- <para>
- Timer ticks, <hardware>network cards</hardware> and
- <hardware>keyboard</hardware> are examples of real
- hardware which produce interrupts at any time. The kernel runs
- interrupt handlers, which services the hardware. The kernel
- guarantees that this handler is never re-entered: if the same
- interrupt arrives, it is queued (or dropped). Because it
- disables interrupts, this handler has to be fast: frequently it
- simply acknowledges the interrupt, marks a 'software interrupt'
- for execution and exits.
- </para>
-
- <para>
- You can tell you are in a hardware interrupt, because
- <function>in_irq()</function> returns <returnvalue>true</returnvalue>.
- </para>
- <caution>
- <para>
- Beware that this will return a false positive if interrupts are disabled
- (see below).
- </para>
- </caution>
- </sect1>
-
- <sect1 id="basics-softirqs">
- <title>Software Interrupt Context: Softirqs and Tasklets</title>
-
- <para>
- Whenever a system call is about to return to userspace, or a
- hardware interrupt handler exits, any 'software interrupts'
- which are marked pending (usually by hardware interrupts) are
- run (<filename>kernel/softirq.c</filename>).
- </para>
-
- <para>
- Much of the real interrupt handling work is done here. Early in
- the transition to <acronym>SMP</acronym>, there were only 'bottom
- halves' (BHs), which didn't take advantage of multiple CPUs. Shortly
- after we switched from wind-up computers made of match-sticks and snot,
- we abandoned this limitation and switched to 'softirqs'.
- </para>
-
- <para>
- <filename class="headerfile">include/linux/interrupt.h</filename> lists the
- different softirqs. A very important softirq is the
- timer softirq (<filename
- class="headerfile">include/linux/timer.h</filename>): you can
- register to have it call functions for you in a given length of
- time.
- </para>
-
- <para>
- Softirqs are often a pain to deal with, since the same softirq
- will run simultaneously on more than one CPU. For this reason,
- tasklets (<filename
- class="headerfile">include/linux/interrupt.h</filename>) are more
- often used: they are dynamically-registrable (meaning you can have
- as many as you want), and they also guarantee that any tasklet
- will only run on one CPU at any time, although different tasklets
- can run simultaneously.
- </para>
- <caution>
- <para>
- The name 'tasklet' is misleading: they have nothing to do with 'tasks',
- and probably more to do with some bad vodka Alexey Kuznetsov had at the
- time.
- </para>
- </caution>
-
- <para>
- You can tell you are in a softirq (or tasklet)
- using the <function>in_softirq()</function> macro
- (<filename class="headerfile">include/linux/interrupt.h</filename>).
- </para>
- <caution>
- <para>
- Beware that this will return a false positive if a bh lock (see below)
- is held.
- </para>
- </caution>
- </sect1>
- </chapter>
-
- <chapter id="basic-rules">
- <title>Some Basic Rules</title>
-
- <variablelist>
- <varlistentry>
- <term>No memory protection</term>
- <listitem>
- <para>
- If you corrupt memory, whether in user context or
- interrupt context, the whole machine will crash. Are you
- sure you can't do what you want in userspace?
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term>No floating point or <acronym>MMX</acronym></term>
- <listitem>
- <para>
- The <acronym>FPU</acronym> context is not saved; even in user
- context the <acronym>FPU</acronym> state probably won't
- correspond with the current process: you would mess with some
- user process' <acronym>FPU</acronym> state. If you really want
- to do this, you would have to explicitly save/restore the full
- <acronym>FPU</acronym> state (and avoid context switches). It
- is generally a bad idea; use fixed point arithmetic first.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term>A rigid stack limit</term>
- <listitem>
- <para>
- Depending on configuration options the kernel stack is about 3K to 6K for most 32-bit architectures: it's
- about 14K on most 64-bit archs, and often shared with interrupts
- so you can't use it all. Avoid deep recursion and huge local
- arrays on the stack (allocate them dynamically instead).
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term>The Linux kernel is portable</term>
- <listitem>
- <para>
- Let's keep it that way. Your code should be 64-bit clean,
- and endian-independent. You should also minimize CPU
- specific stuff, e.g. inline assembly should be cleanly
- encapsulated and minimized to ease porting. Generally it
- should be restricted to the architecture-dependent part of
- the kernel tree.
- </para>
- </listitem>
- </varlistentry>
- </variablelist>
- </chapter>
-
- <chapter id="ioctls">
- <title>ioctls: Not writing a new system call</title>
-
- <para>
- A system call generally looks like this
- </para>
-
- <programlisting>
-asmlinkage long sys_mycall(int arg)
-{
- return 0;
-}
- </programlisting>
-
- <para>
- First, in most cases you don't want to create a new system call.
- You create a character device and implement an appropriate ioctl
- for it. This is much more flexible than system calls, doesn't have
- to be entered in every architecture's
- <filename class="headerfile">include/asm/unistd.h</filename> and
- <filename>arch/kernel/entry.S</filename> file, and is much more
- likely to be accepted by Linus.
- </para>
-
- <para>
- If all your routine does is read or write some parameter, consider
- implementing a <function>sysfs</function> interface instead.
- </para>
-
- <para>
- Inside the ioctl you're in user context to a process. When a
- error occurs you return a negated errno (see
- <filename class="headerfile">include/linux/errno.h</filename>),
- otherwise you return <returnvalue>0</returnvalue>.
- </para>
-
- <para>
- After you slept you should check if a signal occurred: the
- Unix/Linux way of handling signals is to temporarily exit the
- system call with the <constant>-ERESTARTSYS</constant> error. The
- system call entry code will switch back to user context, process
- the signal handler and then your system call will be restarted
- (unless the user disabled that). So you should be prepared to
- process the restart, e.g. if you're in the middle of manipulating
- some data structure.
- </para>
-
- <programlisting>
-if (signal_pending(current))
- return -ERESTARTSYS;
- </programlisting>
-
- <para>
- If you're doing longer computations: first think userspace. If you
- <emphasis>really</emphasis> want to do it in kernel you should
- regularly check if you need to give up the CPU (remember there is
- cooperative multitasking per CPU). Idiom:
- </para>
-
- <programlisting>
-cond_resched(); /* Will sleep */
- </programlisting>
-
- <para>
- A short note on interface design: the UNIX system call motto is
- "Provide mechanism not policy".
- </para>
- </chapter>
-
- <chapter id="deadlock-recipes">
- <title>Recipes for Deadlock</title>
-
- <para>
- You cannot call any routines which may sleep, unless:
- </para>
- <itemizedlist>
- <listitem>
- <para>
- You are in user context.
- </para>
- </listitem>
-
- <listitem>
- <para>
- You do not own any spinlocks.
- </para>
- </listitem>
-
- <listitem>
- <para>
- You have interrupts enabled (actually, Andi Kleen says
- that the scheduling code will enable them for you, but
- that's probably not what you wanted).
- </para>
- </listitem>
- </itemizedlist>
-
- <para>
- Note that some functions may sleep implicitly: common ones are
- the user space access functions (*_user) and memory allocation
- functions without <symbol>GFP_ATOMIC</symbol>.
- </para>
-
- <para>
- You should always compile your kernel
- <symbol>CONFIG_DEBUG_ATOMIC_SLEEP</symbol> on, and it will warn
- you if you break these rules. If you <emphasis>do</emphasis> break
- the rules, you will eventually lock up your box.
- </para>
-
- <para>
- Really.
- </para>
- </chapter>
-
- <chapter id="common-routines">
- <title>Common Routines</title>
-
- <sect1 id="routines-printk">
- <title>
- <function>printk()</function>
- <filename class="headerfile">include/linux/kernel.h</filename>
- </title>
-
- <para>
- <function>printk()</function> feeds kernel messages to the
- console, dmesg, and the syslog daemon. It is useful for debugging
- and reporting errors, and can be used inside interrupt context,
- but use with caution: a machine which has its console flooded with
- printk messages is unusable. It uses a format string mostly
- compatible with ANSI C printf, and C string concatenation to give
- it a first "priority" argument:
- </para>
-
- <programlisting>
-printk(KERN_INFO "i = %u\n", i);
- </programlisting>
-
- <para>
- See <filename class="headerfile">include/linux/kernel.h</filename>;
- for other KERN_ values; these are interpreted by syslog as the
- level. Special case: for printing an IP address use
- </para>
-
- <programlisting>
-__be32 ipaddress;
-printk(KERN_INFO "my ip: %pI4\n", &amp;ipaddress);
- </programlisting>
-
- <para>
- <function>printk()</function> internally uses a 1K buffer and does
- not catch overruns. Make sure that will be enough.
- </para>
-
- <note>
- <para>
- You will know when you are a real kernel hacker
- when you start typoing printf as printk in your user programs :)
- </para>
- </note>
-
- <!--- From the Lions book reader department -->
-
- <note>
- <para>
- Another sidenote: the original Unix Version 6 sources had a
- comment on top of its printf function: "Printf should not be
- used for chit-chat". You should follow that advice.
- </para>
- </note>
- </sect1>
-
- <sect1 id="routines-copy">
- <title>
- <function>copy_[to/from]_user()</function>
- /
- <function>get_user()</function>
- /
- <function>put_user()</function>
- <filename class="headerfile">include/linux/uaccess.h</filename>
- </title>
-
- <para>
- <emphasis>[SLEEPS]</emphasis>
- </para>
-
- <para>
- <function>put_user()</function> and <function>get_user()</function>
- are used to get and put single values (such as an int, char, or
- long) from and to userspace. A pointer into userspace should
- never be simply dereferenced: data should be copied using these
- routines. Both return <constant>-EFAULT</constant> or 0.
- </para>
- <para>
- <function>copy_to_user()</function> and
- <function>copy_from_user()</function> are more general: they copy
- an arbitrary amount of data to and from userspace.
- <caution>
- <para>
- Unlike <function>put_user()</function> and
- <function>get_user()</function>, they return the amount of
- uncopied data (ie. <returnvalue>0</returnvalue> still means
- success).
- </para>
- </caution>
- [Yes, this moronic interface makes me cringe. The flamewar comes up every year or so. --RR.]
- </para>
- <para>
- The functions may sleep implicitly. This should never be called
- outside user context (it makes no sense), with interrupts
- disabled, or a spinlock held.
- </para>
- </sect1>
-
- <sect1 id="routines-kmalloc">
- <title><function>kmalloc()</function>/<function>kfree()</function>
- <filename class="headerfile">include/linux/slab.h</filename></title>
-
- <para>
- <emphasis>[MAY SLEEP: SEE BELOW]</emphasis>
- </para>
-
- <para>
- These routines are used to dynamically request pointer-aligned
- chunks of memory, like malloc and free do in userspace, but
- <function>kmalloc()</function> takes an extra flag word.
- Important values:
- </para>
-
- <variablelist>
- <varlistentry>
- <term>
- <constant>
- GFP_KERNEL
- </constant>
- </term>
- <listitem>
- <para>
- May sleep and swap to free memory. Only allowed in user
- context, but is the most reliable way to allocate memory.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term>
- <constant>
- GFP_ATOMIC
- </constant>
- </term>
- <listitem>
- <para>
- Don't sleep. Less reliable than <constant>GFP_KERNEL</constant>,
- but may be called from interrupt context. You should
- <emphasis>really</emphasis> have a good out-of-memory
- error-handling strategy.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term>
- <constant>
- GFP_DMA
- </constant>
- </term>
- <listitem>
- <para>
- Allocate ISA DMA lower than 16MB. If you don't know what that
- is you don't need it. Very unreliable.
- </para>
- </listitem>
- </varlistentry>
- </variablelist>
-
- <para>
- If you see a <errorname>sleeping function called from invalid
- context</errorname> warning message, then maybe you called a
- sleeping allocation function from interrupt context without
- <constant>GFP_ATOMIC</constant>. You should really fix that.
- Run, don't walk.
- </para>
-
- <para>
- If you are allocating at least <constant>PAGE_SIZE</constant>
- (<filename class="headerfile">include/asm/page.h</filename>) bytes,
- consider using <function>__get_free_pages()</function>
-
- (<filename class="headerfile">include/linux/mm.h</filename>). It
- takes an order argument (0 for page sized, 1 for double page, 2
- for four pages etc.) and the same memory priority flag word as
- above.
- </para>
-
- <para>
- If you are allocating more than a page worth of bytes you can use
- <function>vmalloc()</function>. It'll allocate virtual memory in
- the kernel map. This block is not contiguous in physical memory,
- but the <acronym>MMU</acronym> makes it look like it is for you
- (so it'll only look contiguous to the CPUs, not to external device
- drivers). If you really need large physically contiguous memory
- for some weird device, you have a problem: it is poorly supported
- in Linux because after some time memory fragmentation in a running
- kernel makes it hard. The best way is to allocate the block early
- in the boot process via the <function>alloc_bootmem()</function>
- routine.
- </para>
-
- <para>
- Before inventing your own cache of often-used objects consider
- using a slab cache in
- <filename class="headerfile">include/linux/slab.h</filename>
- </para>
- </sect1>
-
- <sect1 id="routines-current">
- <title><function>current</function>
- <filename class="headerfile">include/asm/current.h</filename></title>
-
- <para>
- This global variable (really a macro) contains a pointer to
- the current task structure, so is only valid in user context.
- For example, when a process makes a system call, this will
- point to the task structure of the calling process. It is
- <emphasis>not NULL</emphasis> in interrupt context.
- </para>
- </sect1>
-
- <sect1 id="routines-udelay">
- <title><function>mdelay()</function>/<function>udelay()</function>
- <filename class="headerfile">include/asm/delay.h</filename>
- <filename class="headerfile">include/linux/delay.h</filename>
- </title>
-
- <para>
- The <function>udelay()</function> and <function>ndelay()</function> functions can be used for small pauses.
- Do not use large values with them as you risk
- overflow - the helper function <function>mdelay()</function> is useful
- here, or consider <function>msleep()</function>.
- </para>
- </sect1>
-
- <sect1 id="routines-endian">
- <title><function>cpu_to_be32()</function>/<function>be32_to_cpu()</function>/<function>cpu_to_le32()</function>/<function>le32_to_cpu()</function>
- <filename class="headerfile">include/asm/byteorder.h</filename>
- </title>
-
- <para>
- The <function>cpu_to_be32()</function> family (where the "32" can
- be replaced by 64 or 16, and the "be" can be replaced by "le") are
- the general way to do endian conversions in the kernel: they
- return the converted value. All variations supply the reverse as
- well: <function>be32_to_cpu()</function>, etc.
- </para>
-
- <para>
- There are two major variations of these functions: the pointer
- variation, such as <function>cpu_to_be32p()</function>, which take
- a pointer to the given type, and return the converted value. The
- other variation is the "in-situ" family, such as
- <function>cpu_to_be32s()</function>, which convert value referred
- to by the pointer, and return void.
- </para>
- </sect1>
-
- <sect1 id="routines-local-irqs">
- <title><function>local_irq_save()</function>/<function>local_irq_restore()</function>
- <filename class="headerfile">include/linux/irqflags.h</filename>
- </title>
-
- <para>
- These routines disable hard interrupts on the local CPU, and
- restore them. They are reentrant; saving the previous state in
- their one <varname>unsigned long flags</varname> argument. If you
- know that interrupts are enabled, you can simply use
- <function>local_irq_disable()</function> and
- <function>local_irq_enable()</function>.
- </para>
- </sect1>
-
- <sect1 id="routines-softirqs">
- <title><function>local_bh_disable()</function>/<function>local_bh_enable()</function>
- <filename class="headerfile">include/linux/interrupt.h</filename></title>
-
- <para>
- These routines disable soft interrupts on the local CPU, and
- restore them. They are reentrant; if soft interrupts were
- disabled before, they will still be disabled after this pair
- of functions has been called. They prevent softirqs and tasklets
- from running on the current CPU.
- </para>
- </sect1>
-
- <sect1 id="routines-processorids">
- <title><function>smp_processor_id</function>()
- <filename class="headerfile">include/asm/smp.h</filename></title>
-
- <para>
- <function>get_cpu()</function> disables preemption (so you won't
- suddenly get moved to another CPU) and returns the current
- processor number, between 0 and <symbol>NR_CPUS</symbol>. Note
- that the CPU numbers are not necessarily continuous. You return
- it again with <function>put_cpu()</function> when you are done.
- </para>
- <para>
- If you know you cannot be preempted by another task (ie. you are
- in interrupt context, or have preemption disabled) you can use
- smp_processor_id().
- </para>
- </sect1>
-
- <sect1 id="routines-init">
- <title><type>__init</type>/<type>__exit</type>/<type>__initdata</type>
- <filename class="headerfile">include/linux/init.h</filename></title>
-
- <para>
- After boot, the kernel frees up a special section; functions
- marked with <type>__init</type> and data structures marked with
- <type>__initdata</type> are dropped after boot is complete: similarly
- modules discard this memory after initialization. <type>__exit</type>
- is used to declare a function which is only required on exit: the
- function will be dropped if this file is not compiled as a module.
- See the header file for use. Note that it makes no sense for a function
- marked with <type>__init</type> to be exported to modules with
- <function>EXPORT_SYMBOL()</function> - this will break.
- </para>
-
- </sect1>
-
- <sect1 id="routines-init-again">
- <title><function>__initcall()</function>/<function>module_init()</function>
- <filename class="headerfile">include/linux/init.h</filename></title>
- <para>
- Many parts of the kernel are well served as a module
- (dynamically-loadable parts of the kernel). Using the
- <function>module_init()</function> and
- <function>module_exit()</function> macros it is easy to write code
- without #ifdefs which can operate both as a module or built into
- the kernel.
- </para>
-
- <para>
- The <function>module_init()</function> macro defines which
- function is to be called at module insertion time (if the file is
- compiled as a module), or at boot time: if the file is not
- compiled as a module the <function>module_init()</function> macro
- becomes equivalent to <function>__initcall()</function>, which
- through linker magic ensures that the function is called on boot.
- </para>
-
- <para>
- The function can return a negative error number to cause
- module loading to fail (unfortunately, this has no effect if
- the module is compiled into the kernel). This function is
- called in user context with interrupts enabled, so it can sleep.
- </para>
- </sect1>
-
- <sect1 id="routines-moduleexit">
- <title> <function>module_exit()</function>
- <filename class="headerfile">include/linux/init.h</filename> </title>
-
- <para>
- This macro defines the function to be called at module removal
- time (or never, in the case of the file compiled into the
- kernel). It will only be called if the module usage count has
- reached zero. This function can also sleep, but cannot fail:
- everything must be cleaned up by the time it returns.
- </para>
-
- <para>
- Note that this macro is optional: if it is not present, your
- module will not be removable (except for 'rmmod -f').
- </para>
- </sect1>
-
- <sect1 id="routines-module-use-counters">
- <title> <function>try_module_get()</function>/<function>module_put()</function>
- <filename class="headerfile">include/linux/module.h</filename></title>
-
- <para>
- These manipulate the module usage count, to protect against
- removal (a module also can't be removed if another module uses one
- of its exported symbols: see below). Before calling into module
- code, you should call <function>try_module_get()</function> on
- that module: if it fails, then the module is being removed and you
- should act as if it wasn't there. Otherwise, you can safely enter
- the module, and call <function>module_put()</function> when you're
- finished.
- </para>
-
- <para>
- Most registerable structures have an
- <structfield>owner</structfield> field, such as in the
- <structname>file_operations</structname> structure. Set this field
- to the macro <symbol>THIS_MODULE</symbol>.
- </para>
- </sect1>
-
- <!-- add info on new-style module refcounting here -->
- </chapter>
-
- <chapter id="queues">
- <title>Wait Queues
- <filename class="headerfile">include/linux/wait.h</filename>
- </title>
- <para>
- <emphasis>[SLEEPS]</emphasis>
- </para>
-
- <para>
- A wait queue is used to wait for someone to wake you up when a
- certain condition is true. They must be used carefully to ensure
- there is no race condition. You declare a
- <type>wait_queue_head_t</type>, and then processes which want to
- wait for that condition declare a <type>wait_queue_t</type>
- referring to themselves, and place that in the queue.
- </para>
-
- <sect1 id="queue-declaring">
- <title>Declaring</title>
-
- <para>
- You declare a <type>wait_queue_head_t</type> using the
- <function>DECLARE_WAIT_QUEUE_HEAD()</function> macro, or using the
- <function>init_waitqueue_head()</function> routine in your
- initialization code.
- </para>
- </sect1>
-
- <sect1 id="queue-waitqueue">
- <title>Queuing</title>
-
- <para>
- Placing yourself in the waitqueue is fairly complex, because you
- must put yourself in the queue before checking the condition.
- There is a macro to do this:
- <function>wait_event_interruptible()</function>
-
- <filename class="headerfile">include/linux/wait.h</filename> The
- first argument is the wait queue head, and the second is an
- expression which is evaluated; the macro returns
- <returnvalue>0</returnvalue> when this expression is true, or
- <returnvalue>-ERESTARTSYS</returnvalue> if a signal is received.
- The <function>wait_event()</function> version ignores signals.
- </para>
-
- </sect1>
-
- <sect1 id="queue-waking">
- <title>Waking Up Queued Tasks</title>
-
- <para>
- Call <function>wake_up()</function>
-
- <filename class="headerfile">include/linux/wait.h</filename>;,
- which will wake up every process in the queue. The exception is
- if one has <constant>TASK_EXCLUSIVE</constant> set, in which case
- the remainder of the queue will not be woken. There are other variants
- of this basic function available in the same header.
- </para>
- </sect1>
- </chapter>
-
- <chapter id="atomic-ops">
- <title>Atomic Operations</title>
-
- <para>
- Certain operations are guaranteed atomic on all platforms. The
- first class of operations work on <type>atomic_t</type>
-
- <filename class="headerfile">include/asm/atomic.h</filename>; this
- contains a signed integer (at least 32 bits long), and you must use
- these functions to manipulate or read atomic_t variables.
- <function>atomic_read()</function> and
- <function>atomic_set()</function> get and set the counter,
- <function>atomic_add()</function>,
- <function>atomic_sub()</function>,
- <function>atomic_inc()</function>,
- <function>atomic_dec()</function>, and
- <function>atomic_dec_and_test()</function> (returns
- <returnvalue>true</returnvalue> if it was decremented to zero).
- </para>
-
- <para>
- Yes. It returns <returnvalue>true</returnvalue> (i.e. != 0) if the
- atomic variable is zero.
- </para>
-
- <para>
- Note that these functions are slower than normal arithmetic, and
- so should not be used unnecessarily.
- </para>
-
- <para>
- The second class of atomic operations is atomic bit operations on an
- <type>unsigned long</type>, defined in
-
- <filename class="headerfile">include/linux/bitops.h</filename>. These
- operations generally take a pointer to the bit pattern, and a bit
- number: 0 is the least significant bit.
- <function>set_bit()</function>, <function>clear_bit()</function>
- and <function>change_bit()</function> set, clear, and flip the
- given bit. <function>test_and_set_bit()</function>,
- <function>test_and_clear_bit()</function> and
- <function>test_and_change_bit()</function> do the same thing,
- except return true if the bit was previously set; these are
- particularly useful for atomically setting flags.
- </para>
-
- <para>
- It is possible to call these operations with bit indices greater
- than BITS_PER_LONG. The resulting behavior is strange on big-endian
- platforms though so it is a good idea not to do this.
- </para>
- </chapter>
-
- <chapter id="symbols">
- <title>Symbols</title>
-
- <para>
- Within the kernel proper, the normal linking rules apply
- (ie. unless a symbol is declared to be file scope with the
- <type>static</type> keyword, it can be used anywhere in the
- kernel). However, for modules, a special exported symbol table is
- kept which limits the entry points to the kernel proper. Modules
- can also export symbols.
- </para>
-
- <sect1 id="sym-exportsymbols">
- <title><function>EXPORT_SYMBOL()</function>
- <filename class="headerfile">include/linux/export.h</filename></title>
-
- <para>
- This is the classic method of exporting a symbol: dynamically
- loaded modules will be able to use the symbol as normal.
- </para>
- </sect1>
-
- <sect1 id="sym-exportsymbols-gpl">
- <title><function>EXPORT_SYMBOL_GPL()</function>
- <filename class="headerfile">include/linux/export.h</filename></title>
-
- <para>
- Similar to <function>EXPORT_SYMBOL()</function> except that the
- symbols exported by <function>EXPORT_SYMBOL_GPL()</function> can
- only be seen by modules with a
- <function>MODULE_LICENSE()</function> that specifies a GPL
- compatible license. It implies that the function is considered
- an internal implementation issue, and not really an interface.
- Some maintainers and developers may however
- require EXPORT_SYMBOL_GPL() when adding any new APIs or functionality.
- </para>
- </sect1>
- </chapter>
-
- <chapter id="conventions">
- <title>Routines and Conventions</title>
-
- <sect1 id="conventions-doublelinkedlist">
- <title>Double-linked lists
- <filename class="headerfile">include/linux/list.h</filename></title>
-
- <para>
- There used to be three sets of linked-list routines in the kernel
- headers, but this one is the winner. If you don't have some
- particular pressing need for a single list, it's a good choice.
- </para>
-
- <para>
- In particular, <function>list_for_each_entry</function> is useful.
- </para>
- </sect1>
-
- <sect1 id="convention-returns">
- <title>Return Conventions</title>
-
- <para>
- For code called in user context, it's very common to defy C
- convention, and return <returnvalue>0</returnvalue> for success,
- and a negative error number
- (eg. <returnvalue>-EFAULT</returnvalue>) for failure. This can be
- unintuitive at first, but it's fairly widespread in the kernel.
- </para>
-
- <para>
- Using <function>ERR_PTR()</function>
-
- <filename class="headerfile">include/linux/err.h</filename>; to
- encode a negative error number into a pointer, and
- <function>IS_ERR()</function> and <function>PTR_ERR()</function>
- to get it back out again: avoids a separate pointer parameter for
- the error number. Icky, but in a good way.
- </para>
- </sect1>
-
- <sect1 id="conventions-borkedcompile">
- <title>Breaking Compilation</title>
-
- <para>
- Linus and the other developers sometimes change function or
- structure names in development kernels; this is not done just to
- keep everyone on their toes: it reflects a fundamental change
- (eg. can no longer be called with interrupts on, or does extra
- checks, or doesn't do checks which were caught before). Usually
- this is accompanied by a fairly complete note to the linux-kernel
- mailing list; search the archive. Simply doing a global replace
- on the file usually makes things <emphasis>worse</emphasis>.
- </para>
- </sect1>
-
- <sect1 id="conventions-initialising">
- <title>Initializing structure members</title>
-
- <para>
- The preferred method of initializing structures is to use
- designated initialisers, as defined by ISO C99, eg:
- </para>
- <programlisting>
-static struct block_device_operations opt_fops = {
- .open = opt_open,
- .release = opt_release,
- .ioctl = opt_ioctl,
- .check_media_change = opt_media_change,
-};
- </programlisting>
- <para>
- This makes it easy to grep for, and makes it clear which
- structure fields are set. You should do this because it looks
- cool.
- </para>
- </sect1>
-
- <sect1 id="conventions-gnu-extns">
- <title>GNU Extensions</title>
-
- <para>
- GNU Extensions are explicitly allowed in the Linux kernel.
- Note that some of the more complex ones are not very well
- supported, due to lack of general use, but the following are
- considered standard (see the GCC info page section "C
- Extensions" for more details - Yes, really the info page, the
- man page is only a short summary of the stuff in info).
- </para>
- <itemizedlist>
- <listitem>
- <para>
- Inline functions
- </para>
- </listitem>
- <listitem>
- <para>
- Statement expressions (ie. the ({ and }) constructs).
- </para>
- </listitem>
- <listitem>
- <para>
- Declaring attributes of a function / variable / type
- (__attribute__)
- </para>
- </listitem>
- <listitem>
- <para>
- typeof
- </para>
- </listitem>
- <listitem>
- <para>
- Zero length arrays
- </para>
- </listitem>
- <listitem>
- <para>
- Macro varargs
- </para>
- </listitem>
- <listitem>
- <para>
- Arithmetic on void pointers
- </para>
- </listitem>
- <listitem>
- <para>
- Non-Constant initializers
- </para>
- </listitem>
- <listitem>
- <para>
- Assembler Instructions (not outside arch/ and include/asm/)
- </para>
- </listitem>
- <listitem>
- <para>
- Function names as strings (__func__).
- </para>
- </listitem>
- <listitem>
- <para>
- __builtin_constant_p()
- </para>
- </listitem>
- </itemizedlist>
-
- <para>
- Be wary when using long long in the kernel, the code gcc generates for
- it is horrible and worse: division and multiplication does not work
- on i386 because the GCC runtime functions for it are missing from
- the kernel environment.
- </para>
-
- <!-- FIXME: add a note about ANSI aliasing cleanness -->
- </sect1>
-
- <sect1 id="conventions-cplusplus">
- <title>C++</title>
-
- <para>
- Using C++ in the kernel is usually a bad idea, because the
- kernel does not provide the necessary runtime environment
- and the include files are not tested for it. It is still
- possible, but not recommended. If you really want to do
- this, forget about exceptions at least.
- </para>
- </sect1>
-
- <sect1 id="conventions-ifdef">
- <title>&num;if</title>
-
- <para>
- It is generally considered cleaner to use macros in header files
- (or at the top of .c files) to abstract away functions rather than
- using `#if' pre-processor statements throughout the source code.
- </para>
- </sect1>
- </chapter>
-
- <chapter id="submitting">
- <title>Putting Your Stuff in the Kernel</title>
-
- <para>
- In order to get your stuff into shape for official inclusion, or
- even to make a neat patch, there's administrative work to be
- done:
- </para>
- <itemizedlist>
- <listitem>
- <para>
- Figure out whose pond you've been pissing in. Look at the top of
- the source files, inside the <filename>MAINTAINERS</filename>
- file, and last of all in the <filename>CREDITS</filename> file.
- You should coordinate with this person to make sure you're not
- duplicating effort, or trying something that's already been
- rejected.
- </para>
-
- <para>
- Make sure you put your name and EMail address at the top of
- any files you create or mangle significantly. This is the
- first place people will look when they find a bug, or when
- <emphasis>they</emphasis> want to make a change.
- </para>
- </listitem>
-
- <listitem>
- <para>
- Usually you want a configuration option for your kernel hack.
- Edit <filename>Kconfig</filename> in the appropriate directory.
- The Config language is simple to use by cut and paste, and there's
- complete documentation in
- <filename>Documentation/kbuild/kconfig-language.txt</filename>.
- </para>
-
- <para>
- In your description of the option, make sure you address both the
- expert user and the user who knows nothing about your feature. Mention
- incompatibilities and issues here. <emphasis> Definitely
- </emphasis> end your description with <quote> if in doubt, say N
- </quote> (or, occasionally, `Y'); this is for people who have no
- idea what you are talking about.
- </para>
- </listitem>
-
- <listitem>
- <para>
- Edit the <filename>Makefile</filename>: the CONFIG variables are
- exported here so you can usually just add a "obj-$(CONFIG_xxx) +=
- xxx.o" line. The syntax is documented in
- <filename>Documentation/kbuild/makefiles.txt</filename>.
- </para>
- </listitem>
-
- <listitem>
- <para>
- Put yourself in <filename>CREDITS</filename> if you've done
- something noteworthy, usually beyond a single file (your name
- should be at the top of the source files anyway).
- <filename>MAINTAINERS</filename> means you want to be consulted
- when changes are made to a subsystem, and hear about bugs; it
- implies a more-than-passing commitment to some part of the code.
- </para>
- </listitem>
-
- <listitem>
- <para>
- Finally, don't forget to read <filename>Documentation/process/submitting-patches.rst</filename>
- and possibly <filename>Documentation/process/submitting-drivers.rst</filename>.
- </para>
- </listitem>
- </itemizedlist>
- </chapter>
-
- <chapter id="cantrips">
- <title>Kernel Cantrips</title>
-
- <para>
- Some favorites from browsing the source. Feel free to add to this
- list.
- </para>
-
- <para>
- <filename>arch/x86/include/asm/delay.h:</filename>
- </para>
- <programlisting>
-#define ndelay(n) (__builtin_constant_p(n) ? \
- ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
- __ndelay(n))
- </programlisting>
-
- <para>
- <filename>include/linux/fs.h</filename>:
- </para>
- <programlisting>
-/*
- * Kernel pointers have redundant information, so we can use a
- * scheme where we can return either an error code or a dentry
- * pointer with the same return value.
- *
- * This should be a per-architecture thing, to allow different
- * error and pointer decisions.
- */
- #define ERR_PTR(err) ((void *)((long)(err)))
- #define PTR_ERR(ptr) ((long)(ptr))
- #define IS_ERR(ptr) ((unsigned long)(ptr) > (unsigned long)(-1000))
-</programlisting>
-
- <para>
- <filename>arch/x86/include/asm/uaccess_32.h:</filename>
- </para>
-
- <programlisting>
-#define copy_to_user(to,from,n) \
- (__builtin_constant_p(n) ? \
- __constant_copy_to_user((to),(from),(n)) : \
- __generic_copy_to_user((to),(from),(n)))
- </programlisting>
-
- <para>
- <filename>arch/sparc/kernel/head.S:</filename>
- </para>
-
- <programlisting>
-/*
- * Sun people can't spell worth damn. "compatability" indeed.
- * At least we *know* we can't spell, and use a spell-checker.
- */
-
-/* Uh, actually Linus it is I who cannot spell. Too much murky
- * Sparc assembly will do this to ya.
- */
-C_LABEL(cputypvar):
- .asciz "compatibility"
-
-/* Tested on SS-5, SS-10. Probably someone at Sun applied a spell-checker. */
- .align 4
-C_LABEL(cputypvar_sun4m):
- .asciz "compatible"
- </programlisting>
-
- <para>
- <filename>arch/sparc/lib/checksum.S:</filename>
- </para>
-
- <programlisting>
- /* Sun, you just can't beat me, you just can't. Stop trying,
- * give up. I'm serious, I am going to kick the living shit
- * out of you, game over, lights out.
- */
- </programlisting>
- </chapter>
-
- <chapter id="credits">
- <title>Thanks</title>
-
- <para>
- Thanks to Andi Kleen for the idea, answering my questions, fixing
- my mistakes, filling content, etc. Philipp Rumpf for more spelling
- and clarity fixes, and some excellent non-obvious points. Werner
- Almesberger for giving me a great summary of
- <function>disable_irq()</function>, and Jes Sorensen and Andrea
- Arcangeli added caveats. Michael Elizabeth Chastain for checking
- and adding to the Configure section. <!-- Rusty insisted on this
- bit; I didn't do it! --> Telsa Gwynne for teaching me DocBook.
- </para>
- </chapter>
-</book>
-
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
deleted file mode 100644
index 7c9cc4846cb6..000000000000
--- a/Documentation/DocBook/kernel-locking.tmpl
+++ /dev/null
@@ -1,2151 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
- "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
-
-<book id="LKLockingGuide">
- <bookinfo>
- <title>Unreliable Guide To Locking</title>
-
- <authorgroup>
- <author>
- <firstname>Rusty</firstname>
- <surname>Russell</surname>
- <affiliation>
- <address>
- <email>rusty@rustcorp.com.au</email>
- </address>
- </affiliation>
- </author>
- </authorgroup>
-
- <copyright>
- <year>2003</year>
- <holder>Rusty Russell</holder>
- </copyright>
-
- <legalnotice>
- <para>
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later
- version.
- </para>
-
- <para>
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
- </para>
-
- <para>
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
- </para>
-
- <para>
- For more details see the file COPYING in the source
- distribution of Linux.
- </para>
- </legalnotice>
- </bookinfo>
-
- <toc></toc>
- <chapter id="intro">
- <title>Introduction</title>
- <para>
- Welcome, to Rusty's Remarkably Unreliable Guide to Kernel
- Locking issues. This document describes the locking systems in
- the Linux Kernel in 2.6.
- </para>
- <para>
- With the wide availability of HyperThreading, and <firstterm
- linkend="gloss-preemption">preemption </firstterm> in the Linux
- Kernel, everyone hacking on the kernel needs to know the
- fundamentals of concurrency and locking for
- <firstterm linkend="gloss-smp"><acronym>SMP</acronym></firstterm>.
- </para>
- </chapter>
-
- <chapter id="races">
- <title>The Problem With Concurrency</title>
- <para>
- (Skip this if you know what a Race Condition is).
- </para>
- <para>
- In a normal program, you can increment a counter like so:
- </para>
- <programlisting>
- very_important_count++;
- </programlisting>
-
- <para>
- This is what they would expect to happen:
- </para>
-
- <table>
- <title>Expected Results</title>
-
- <tgroup cols="2" align="left">
-
- <thead>
- <row>
- <entry>Instance 1</entry>
- <entry>Instance 2</entry>
- </row>
- </thead>
-
- <tbody>
- <row>
- <entry>read very_important_count (5)</entry>
- <entry></entry>
- </row>
- <row>
- <entry>add 1 (6)</entry>
- <entry></entry>
- </row>
- <row>
- <entry>write very_important_count (6)</entry>
- <entry></entry>
- </row>
- <row>
- <entry></entry>
- <entry>read very_important_count (6)</entry>
- </row>
- <row>
- <entry></entry>
- <entry>add 1 (7)</entry>
- </row>
- <row>
- <entry></entry>
- <entry>write very_important_count (7)</entry>
- </row>
- </tbody>
-
- </tgroup>
- </table>
-
- <para>
- This is what might happen:
- </para>
-
- <table>
- <title>Possible Results</title>
-
- <tgroup cols="2" align="left">
- <thead>
- <row>
- <entry>Instance 1</entry>
- <entry>Instance 2</entry>
- </row>
- </thead>
-
- <tbody>
- <row>
- <entry>read very_important_count (5)</entry>
- <entry></entry>
- </row>
- <row>
- <entry></entry>
- <entry>read very_important_count (5)</entry>
- </row>
- <row>
- <entry>add 1 (6)</entry>
- <entry></entry>
- </row>
- <row>
- <entry></entry>
- <entry>add 1 (6)</entry>
- </row>
- <row>
- <entry>write very_important_count (6)</entry>
- <entry></entry>
- </row>
- <row>
- <entry></entry>
- <entry>write very_important_count (6)</entry>
- </row>
- </tbody>
- </tgroup>
- </table>
-
- <sect1 id="race-condition">
- <title>Race Conditions and Critical Regions</title>
- <para>
- This overlap, where the result depends on the
- relative timing of multiple tasks, is called a <firstterm>race condition</firstterm>.
- The piece of code containing the concurrency issue is called a
- <firstterm>critical region</firstterm>. And especially since Linux starting running
- on SMP machines, they became one of the major issues in kernel
- design and implementation.
- </para>
- <para>
- Preemption can have the same effect, even if there is only one
- CPU: by preempting one task during the critical region, we have
- exactly the same race condition. In this case the thread which
- preempts might run the critical region itself.
- </para>
- <para>
- The solution is to recognize when these simultaneous accesses
- occur, and use locks to make sure that only one instance can
- enter the critical region at any time. There are many
- friendly primitives in the Linux kernel to help you do this.
- And then there are the unfriendly primitives, but I'll pretend
- they don't exist.
- </para>
- </sect1>
- </chapter>
-
- <chapter id="locks">
- <title>Locking in the Linux Kernel</title>
-
- <para>
- If I could give you one piece of advice: never sleep with anyone
- crazier than yourself. But if I had to give you advice on
- locking: <emphasis>keep it simple</emphasis>.
- </para>
-
- <para>
- Be reluctant to introduce new locks.
- </para>
-
- <para>
- Strangely enough, this last one is the exact reverse of my advice when
- you <emphasis>have</emphasis> slept with someone crazier than yourself.
- And you should think about getting a big dog.
- </para>
-
- <sect1 id="lock-intro">
- <title>Two Main Types of Kernel Locks: Spinlocks and Mutexes</title>
-
- <para>
- There are two main types of kernel locks. The fundamental type
- is the spinlock
- (<filename class="headerfile">include/asm/spinlock.h</filename>),
- which is a very simple single-holder lock: if you can't get the
- spinlock, you keep trying (spinning) until you can. Spinlocks are
- very small and fast, and can be used anywhere.
- </para>
- <para>
- The second type is a mutex
- (<filename class="headerfile">include/linux/mutex.h</filename>): it
- is like a spinlock, but you may block holding a mutex.
- If you can't lock a mutex, your task will suspend itself, and be woken
- up when the mutex is released. This means the CPU can do something
- else while you are waiting. There are many cases when you simply
- can't sleep (see <xref linkend="sleeping-things"/>), and so have to
- use a spinlock instead.
- </para>
- <para>
- Neither type of lock is recursive: see
- <xref linkend="deadlock"/>.
- </para>
- </sect1>
-
- <sect1 id="uniprocessor">
- <title>Locks and Uniprocessor Kernels</title>
-
- <para>
- For kernels compiled without <symbol>CONFIG_SMP</symbol>, and
- without <symbol>CONFIG_PREEMPT</symbol> spinlocks do not exist at
- all. This is an excellent design decision: when no-one else can
- run at the same time, there is no reason to have a lock.
- </para>
-
- <para>
- If the kernel is compiled without <symbol>CONFIG_SMP</symbol>,
- but <symbol>CONFIG_PREEMPT</symbol> is set, then spinlocks
- simply disable preemption, which is sufficient to prevent any
- races. For most purposes, we can think of preemption as
- equivalent to SMP, and not worry about it separately.
- </para>
-
- <para>
- You should always test your locking code with <symbol>CONFIG_SMP</symbol>
- and <symbol>CONFIG_PREEMPT</symbol> enabled, even if you don't have an SMP test box, because it
- will still catch some kinds of locking bugs.
- </para>
-
- <para>
- Mutexes still exist, because they are required for
- synchronization between <firstterm linkend="gloss-usercontext">user
- contexts</firstterm>, as we will see below.
- </para>
- </sect1>
-
- <sect1 id="usercontextlocking">
- <title>Locking Only In User Context</title>
-
- <para>
- If you have a data structure which is only ever accessed from
- user context, then you can use a simple mutex
- (<filename>include/linux/mutex.h</filename>) to protect it. This
- is the most trivial case: you initialize the mutex. Then you can
- call <function>mutex_lock_interruptible()</function> to grab the mutex,
- and <function>mutex_unlock()</function> to release it. There is also a
- <function>mutex_lock()</function>, which should be avoided, because it
- will not return if a signal is received.
- </para>
-
- <para>
- Example: <filename>net/netfilter/nf_sockopt.c</filename> allows
- registration of new <function>setsockopt()</function> and
- <function>getsockopt()</function> calls, with
- <function>nf_register_sockopt()</function>. Registration and
- de-registration are only done on module load and unload (and boot
- time, where there is no concurrency), and the list of registrations
- is only consulted for an unknown <function>setsockopt()</function>
- or <function>getsockopt()</function> system call. The
- <varname>nf_sockopt_mutex</varname> is perfect to protect this,
- especially since the setsockopt and getsockopt calls may well
- sleep.
- </para>
- </sect1>
-
- <sect1 id="lock-user-bh">
- <title>Locking Between User Context and Softirqs</title>
-
- <para>
- If a <firstterm linkend="gloss-softirq">softirq</firstterm> shares
- data with user context, you have two problems. Firstly, the current
- user context can be interrupted by a softirq, and secondly, the
- critical region could be entered from another CPU. This is where
- <function>spin_lock_bh()</function>
- (<filename class="headerfile">include/linux/spinlock.h</filename>) is
- used. It disables softirqs on that CPU, then grabs the lock.
- <function>spin_unlock_bh()</function> does the reverse. (The
- '_bh' suffix is a historical reference to "Bottom Halves", the
- old name for software interrupts. It should really be
- called spin_lock_softirq()' in a perfect world).
- </para>
-
- <para>
- Note that you can also use <function>spin_lock_irq()</function>
- or <function>spin_lock_irqsave()</function> here, which stop
- hardware interrupts as well: see <xref linkend="hardirq-context"/>.
- </para>
-
- <para>
- This works perfectly for <firstterm linkend="gloss-up"><acronym>UP
- </acronym></firstterm> as well: the spin lock vanishes, and this macro
- simply becomes <function>local_bh_disable()</function>
- (<filename class="headerfile">include/linux/interrupt.h</filename>), which
- protects you from the softirq being run.
- </para>
- </sect1>
-
- <sect1 id="lock-user-tasklet">
- <title>Locking Between User Context and Tasklets</title>
-
- <para>
- This is exactly the same as above, because <firstterm
- linkend="gloss-tasklet">tasklets</firstterm> are actually run
- from a softirq.
- </para>
- </sect1>
-
- <sect1 id="lock-user-timers">
- <title>Locking Between User Context and Timers</title>
-
- <para>
- This, too, is exactly the same as above, because <firstterm
- linkend="gloss-timers">timers</firstterm> are actually run from
- a softirq. From a locking point of view, tasklets and timers
- are identical.
- </para>
- </sect1>
-
- <sect1 id="lock-tasklets">
- <title>Locking Between Tasklets/Timers</title>
-
- <para>
- Sometimes a tasklet or timer might want to share data with
- another tasklet or timer.
- </para>
-
- <sect2 id="lock-tasklets-same">
- <title>The Same Tasklet/Timer</title>
- <para>
- Since a tasklet is never run on two CPUs at once, you don't
- need to worry about your tasklet being reentrant (running
- twice at once), even on SMP.
- </para>
- </sect2>
-
- <sect2 id="lock-tasklets-different">
- <title>Different Tasklets/Timers</title>
- <para>
- If another tasklet/timer wants
- to share data with your tasklet or timer , you will both need to use
- <function>spin_lock()</function> and
- <function>spin_unlock()</function> calls.
- <function>spin_lock_bh()</function> is
- unnecessary here, as you are already in a tasklet, and
- none will be run on the same CPU.
- </para>
- </sect2>
- </sect1>
-
- <sect1 id="lock-softirqs">
- <title>Locking Between Softirqs</title>
-
- <para>
- Often a softirq might
- want to share data with itself or a tasklet/timer.
- </para>
-
- <sect2 id="lock-softirqs-same">
- <title>The Same Softirq</title>
-
- <para>
- The same softirq can run on the other CPUs: you can use a
- per-CPU array (see <xref linkend="per-cpu"/>) for better
- performance. If you're going so far as to use a softirq,
- you probably care about scalable performance enough
- to justify the extra complexity.
- </para>
-
- <para>
- You'll need to use <function>spin_lock()</function> and
- <function>spin_unlock()</function> for shared data.
- </para>
- </sect2>
-
- <sect2 id="lock-softirqs-different">
- <title>Different Softirqs</title>
-
- <para>
- You'll need to use <function>spin_lock()</function> and
- <function>spin_unlock()</function> for shared data, whether it
- be a timer, tasklet, different softirq or the same or another
- softirq: any of them could be running on a different CPU.
- </para>
- </sect2>
- </sect1>
- </chapter>
-
- <chapter id="hardirq-context">
- <title>Hard IRQ Context</title>
-
- <para>
- Hardware interrupts usually communicate with a
- tasklet or softirq. Frequently this involves putting work in a
- queue, which the softirq will take out.
- </para>
-
- <sect1 id="hardirq-softirq">
- <title>Locking Between Hard IRQ and Softirqs/Tasklets</title>
-
- <para>
- If a hardware irq handler shares data with a softirq, you have
- two concerns. Firstly, the softirq processing can be
- interrupted by a hardware interrupt, and secondly, the
- critical region could be entered by a hardware interrupt on
- another CPU. This is where <function>spin_lock_irq()</function> is
- used. It is defined to disable interrupts on that cpu, then grab
- the lock. <function>spin_unlock_irq()</function> does the reverse.
- </para>
-
- <para>
- The irq handler does not to use
- <function>spin_lock_irq()</function>, because the softirq cannot
- run while the irq handler is running: it can use
- <function>spin_lock()</function>, which is slightly faster. The
- only exception would be if a different hardware irq handler uses
- the same lock: <function>spin_lock_irq()</function> will stop
- that from interrupting us.
- </para>
-
- <para>
- This works perfectly for UP as well: the spin lock vanishes,
- and this macro simply becomes <function>local_irq_disable()</function>
- (<filename class="headerfile">include/asm/smp.h</filename>), which
- protects you from the softirq/tasklet/BH being run.
- </para>
-
- <para>
- <function>spin_lock_irqsave()</function>
- (<filename>include/linux/spinlock.h</filename>) is a variant
- which saves whether interrupts were on or off in a flags word,
- which is passed to <function>spin_unlock_irqrestore()</function>. This
- means that the same code can be used inside an hard irq handler (where
- interrupts are already off) and in softirqs (where the irq
- disabling is required).
- </para>
-
- <para>
- Note that softirqs (and hence tasklets and timers) are run on
- return from hardware interrupts, so
- <function>spin_lock_irq()</function> also stops these. In that
- sense, <function>spin_lock_irqsave()</function> is the most
- general and powerful locking function.
- </para>
-
- </sect1>
- <sect1 id="hardirq-hardirq">
- <title>Locking Between Two Hard IRQ Handlers</title>
- <para>
- It is rare to have to share data between two IRQ handlers, but
- if you do, <function>spin_lock_irqsave()</function> should be
- used: it is architecture-specific whether all interrupts are
- disabled inside irq handlers themselves.
- </para>
- </sect1>
-
- </chapter>
-
- <chapter id="cheatsheet">
- <title>Cheat Sheet For Locking</title>
- <para>
- Pete Zaitcev gives the following summary:
- </para>
- <itemizedlist>
- <listitem>
- <para>
- If you are in a process context (any syscall) and want to
- lock other process out, use a mutex. You can take a mutex
- and sleep (<function>copy_from_user*(</function> or
- <function>kmalloc(x,GFP_KERNEL)</function>).
- </para>
- </listitem>
- <listitem>
- <para>
- Otherwise (== data can be touched in an interrupt), use
- <function>spin_lock_irqsave()</function> and
- <function>spin_unlock_irqrestore()</function>.
- </para>
- </listitem>
- <listitem>
- <para>
- Avoid holding spinlock for more than 5 lines of code and
- across any function call (except accessors like
- <function>readb</function>).
- </para>
- </listitem>
- </itemizedlist>
-
- <sect1 id="minimum-lock-reqirements">
- <title>Table of Minimum Requirements</title>
-
- <para> The following table lists the <emphasis>minimum</emphasis>
- locking requirements between various contexts. In some cases,
- the same context can only be running on one CPU at a time, so
- no locking is required for that context (eg. a particular
- thread can only run on one CPU at a time, but if it needs
- shares data with another thread, locking is required).
- </para>
- <para>
- Remember the advice above: you can always use
- <function>spin_lock_irqsave()</function>, which is a superset
- of all other spinlock primitives.
- </para>
-
- <table>
-<title>Table of Locking Requirements</title>
-<tgroup cols="11">
-<tbody>
-
-<row>
-<entry></entry>
-<entry>IRQ Handler A</entry>
-<entry>IRQ Handler B</entry>
-<entry>Softirq A</entry>
-<entry>Softirq B</entry>
-<entry>Tasklet A</entry>
-<entry>Tasklet B</entry>
-<entry>Timer A</entry>
-<entry>Timer B</entry>
-<entry>User Context A</entry>
-<entry>User Context B</entry>
-</row>
-
-<row>
-<entry>IRQ Handler A</entry>
-<entry>None</entry>
-</row>
-
-<row>
-<entry>IRQ Handler B</entry>
-<entry>SLIS</entry>
-<entry>None</entry>
-</row>
-
-<row>
-<entry>Softirq A</entry>
-<entry>SLI</entry>
-<entry>SLI</entry>
-<entry>SL</entry>
-</row>
-
-<row>
-<entry>Softirq B</entry>
-<entry>SLI</entry>
-<entry>SLI</entry>
-<entry>SL</entry>
-<entry>SL</entry>
-</row>
-
-<row>
-<entry>Tasklet A</entry>
-<entry>SLI</entry>
-<entry>SLI</entry>
-<entry>SL</entry>
-<entry>SL</entry>
-<entry>None</entry>
-</row>
-
-<row>
-<entry>Tasklet B</entry>
-<entry>SLI</entry>
-<entry>SLI</entry>
-<entry>SL</entry>
-<entry>SL</entry>
-<entry>SL</entry>
-<entry>None</entry>
-</row>
-
-<row>
-<entry>Timer A</entry>
-<entry>SLI</entry>
-<entry>SLI</entry>
-<entry>SL</entry>
-<entry>SL</entry>
-<entry>SL</entry>
-<entry>SL</entry>
-<entry>None</entry>
-</row>
-
-<row>
-<entry>Timer B</entry>
-<entry>SLI</entry>
-<entry>SLI</entry>
-<entry>SL</entry>
-<entry>SL</entry>
-<entry>SL</entry>
-<entry>SL</entry>
-<entry>SL</entry>
-<entry>None</entry>
-</row>
-
-<row>
-<entry>User Context A</entry>
-<entry>SLI</entry>
-<entry>SLI</entry>
-<entry>SLBH</entry>
-<entry>SLBH</entry>
-<entry>SLBH</entry>
-<entry>SLBH</entry>
-<entry>SLBH</entry>
-<entry>SLBH</entry>
-<entry>None</entry>
-</row>
-
-<row>
-<entry>User Context B</entry>
-<entry>SLI</entry>
-<entry>SLI</entry>
-<entry>SLBH</entry>
-<entry>SLBH</entry>
-<entry>SLBH</entry>
-<entry>SLBH</entry>
-<entry>SLBH</entry>
-<entry>SLBH</entry>
-<entry>MLI</entry>
-<entry>None</entry>
-</row>
-
-</tbody>
-</tgroup>
-</table>
-
- <table>
-<title>Legend for Locking Requirements Table</title>
-<tgroup cols="2">
-<tbody>
-
-<row>
-<entry>SLIS</entry>
-<entry>spin_lock_irqsave</entry>
-</row>
-<row>
-<entry>SLI</entry>
-<entry>spin_lock_irq</entry>
-</row>
-<row>
-<entry>SL</entry>
-<entry>spin_lock</entry>
-</row>
-<row>
-<entry>SLBH</entry>
-<entry>spin_lock_bh</entry>
-</row>
-<row>
-<entry>MLI</entry>
-<entry>mutex_lock_interruptible</entry>
-</row>
-
-</tbody>
-</tgroup>
-</table>
-
-</sect1>
-</chapter>
-
-<chapter id="trylock-functions">
- <title>The trylock Functions</title>
- <para>
- There are functions that try to acquire a lock only once and immediately
- return a value telling about success or failure to acquire the lock.
- They can be used if you need no access to the data protected with the lock
- when some other thread is holding the lock. You should acquire the lock
- later if you then need access to the data protected with the lock.
- </para>
-
- <para>
- <function>spin_trylock()</function> does not spin but returns non-zero if
- it acquires the spinlock on the first try or 0 if not. This function can
- be used in all contexts like <function>spin_lock</function>: you must have
- disabled the contexts that might interrupt you and acquire the spin lock.
- </para>
-
- <para>
- <function>mutex_trylock()</function> does not suspend your task
- but returns non-zero if it could lock the mutex on the first try
- or 0 if not. This function cannot be safely used in hardware or software
- interrupt contexts despite not sleeping.
- </para>
-</chapter>
-
- <chapter id="Examples">
- <title>Common Examples</title>
- <para>
-Let's step through a simple example: a cache of number to name
-mappings. The cache keeps a count of how often each of the objects is
-used, and when it gets full, throws out the least used one.
-
- </para>
-
- <sect1 id="examples-usercontext">
- <title>All In User Context</title>
- <para>
-For our first example, we assume that all operations are in user
-context (ie. from system calls), so we can sleep. This means we can
-use a mutex to protect the cache and all the objects within
-it. Here's the code:
- </para>
-
- <programlisting>
-#include &lt;linux/list.h&gt;
-#include &lt;linux/slab.h&gt;
-#include &lt;linux/string.h&gt;
-#include &lt;linux/mutex.h&gt;
-#include &lt;asm/errno.h&gt;
-
-struct object
-{
- struct list_head list;
- int id;
- char name[32];
- int popularity;
-};
-
-/* Protects the cache, cache_num, and the objects within it */
-static DEFINE_MUTEX(cache_lock);
-static LIST_HEAD(cache);
-static unsigned int cache_num = 0;
-#define MAX_CACHE_SIZE 10
-
-/* Must be holding cache_lock */
-static struct object *__cache_find(int id)
-{
- struct object *i;
-
- list_for_each_entry(i, &amp;cache, list)
- if (i-&gt;id == id) {
- i-&gt;popularity++;
- return i;
- }
- return NULL;
-}
-
-/* Must be holding cache_lock */
-static void __cache_delete(struct object *obj)
-{
- BUG_ON(!obj);
- list_del(&amp;obj-&gt;list);
- kfree(obj);
- cache_num--;
-}
-
-/* Must be holding cache_lock */
-static void __cache_add(struct object *obj)
-{
- list_add(&amp;obj-&gt;list, &amp;cache);
- if (++cache_num > MAX_CACHE_SIZE) {
- struct object *i, *outcast = NULL;
- list_for_each_entry(i, &amp;cache, list) {
- if (!outcast || i-&gt;popularity &lt; outcast-&gt;popularity)
- outcast = i;
- }
- __cache_delete(outcast);
- }
-}
-
-int cache_add(int id, const char *name)
-{
- struct object *obj;
-
- if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
- return -ENOMEM;
-
- strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
- obj-&gt;id = id;
- obj-&gt;popularity = 0;
-
- mutex_lock(&amp;cache_lock);
- __cache_add(obj);
- mutex_unlock(&amp;cache_lock);
- return 0;
-}
-
-void cache_delete(int id)
-{
- mutex_lock(&amp;cache_lock);
- __cache_delete(__cache_find(id));
- mutex_unlock(&amp;cache_lock);
-}
-
-int cache_find(int id, char *name)
-{
- struct object *obj;
- int ret = -ENOENT;
-
- mutex_lock(&amp;cache_lock);
- obj = __cache_find(id);
- if (obj) {
- ret = 0;
- strcpy(name, obj-&gt;name);
- }
- mutex_unlock(&amp;cache_lock);
- return ret;
-}
-</programlisting>
-
- <para>
-Note that we always make sure we have the cache_lock when we add,
-delete, or look up the cache: both the cache infrastructure itself and
-the contents of the objects are protected by the lock. In this case
-it's easy, since we copy the data for the user, and never let them
-access the objects directly.
- </para>
- <para>
-There is a slight (and common) optimization here: in
-<function>cache_add</function> we set up the fields of the object
-before grabbing the lock. This is safe, as no-one else can access it
-until we put it in cache.
- </para>
- </sect1>
-
- <sect1 id="examples-interrupt">
- <title>Accessing From Interrupt Context</title>
- <para>
-Now consider the case where <function>cache_find</function> can be
-called from interrupt context: either a hardware interrupt or a
-softirq. An example would be a timer which deletes object from the
-cache.
- </para>
- <para>
-The change is shown below, in standard patch format: the
-<symbol>-</symbol> are lines which are taken away, and the
-<symbol>+</symbol> are lines which are added.
- </para>
-<programlisting>
---- cache.c.usercontext 2003-12-09 13:58:54.000000000 +1100
-+++ cache.c.interrupt 2003-12-09 14:07:49.000000000 +1100
-@@ -12,7 +12,7 @@
- int popularity;
- };
-
--static DEFINE_MUTEX(cache_lock);
-+static DEFINE_SPINLOCK(cache_lock);
- static LIST_HEAD(cache);
- static unsigned int cache_num = 0;
- #define MAX_CACHE_SIZE 10
-@@ -55,6 +55,7 @@
- int cache_add(int id, const char *name)
- {
- struct object *obj;
-+ unsigned long flags;
-
- if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
- return -ENOMEM;
-@@ -63,30 +64,33 @@
- obj-&gt;id = id;
- obj-&gt;popularity = 0;
-
-- mutex_lock(&amp;cache_lock);
-+ spin_lock_irqsave(&amp;cache_lock, flags);
- __cache_add(obj);
-- mutex_unlock(&amp;cache_lock);
-+ spin_unlock_irqrestore(&amp;cache_lock, flags);
- return 0;
- }
-
- void cache_delete(int id)
- {
-- mutex_lock(&amp;cache_lock);
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&amp;cache_lock, flags);
- __cache_delete(__cache_find(id));
-- mutex_unlock(&amp;cache_lock);
-+ spin_unlock_irqrestore(&amp;cache_lock, flags);
- }
-
- int cache_find(int id, char *name)
- {
- struct object *obj;
- int ret = -ENOENT;
-+ unsigned long flags;
-
-- mutex_lock(&amp;cache_lock);
-+ spin_lock_irqsave(&amp;cache_lock, flags);
- obj = __cache_find(id);
- if (obj) {
- ret = 0;
- strcpy(name, obj-&gt;name);
- }
-- mutex_unlock(&amp;cache_lock);
-+ spin_unlock_irqrestore(&amp;cache_lock, flags);
- return ret;
- }
-</programlisting>
-
- <para>
-Note that the <function>spin_lock_irqsave</function> will turn off
-interrupts if they are on, otherwise does nothing (if we are already
-in an interrupt handler), hence these functions are safe to call from
-any context.
- </para>
- <para>
-Unfortunately, <function>cache_add</function> calls
-<function>kmalloc</function> with the <symbol>GFP_KERNEL</symbol>
-flag, which is only legal in user context. I have assumed that
-<function>cache_add</function> is still only called in user context,
-otherwise this should become a parameter to
-<function>cache_add</function>.
- </para>
- </sect1>
- <sect1 id="examples-refcnt">
- <title>Exposing Objects Outside This File</title>
- <para>
-If our objects contained more information, it might not be sufficient
-to copy the information in and out: other parts of the code might want
-to keep pointers to these objects, for example, rather than looking up
-the id every time. This produces two problems.
- </para>
- <para>
-The first problem is that we use the <symbol>cache_lock</symbol> to
-protect objects: we'd need to make this non-static so the rest of the
-code can use it. This makes locking trickier, as it is no longer all
-in one place.
- </para>
- <para>
-The second problem is the lifetime problem: if another structure keeps
-a pointer to an object, it presumably expects that pointer to remain
-valid. Unfortunately, this is only guaranteed while you hold the
-lock, otherwise someone might call <function>cache_delete</function>
-and even worse, add another object, re-using the same address.
- </para>
- <para>
-As there is only one lock, you can't hold it forever: no-one else would
-get any work done.
- </para>
- <para>
-The solution to this problem is to use a reference count: everyone who
-has a pointer to the object increases it when they first get the
-object, and drops the reference count when they're finished with it.
-Whoever drops it to zero knows it is unused, and can actually delete it.
- </para>
- <para>
-Here is the code:
- </para>
-
-<programlisting>
---- cache.c.interrupt 2003-12-09 14:25:43.000000000 +1100
-+++ cache.c.refcnt 2003-12-09 14:33:05.000000000 +1100
-@@ -7,6 +7,7 @@
- struct object
- {
- struct list_head list;
-+ unsigned int refcnt;
- int id;
- char name[32];
- int popularity;
-@@ -17,6 +18,35 @@
- static unsigned int cache_num = 0;
- #define MAX_CACHE_SIZE 10
-
-+static void __object_put(struct object *obj)
-+{
-+ if (--obj-&gt;refcnt == 0)
-+ kfree(obj);
-+}
-+
-+static void __object_get(struct object *obj)
-+{
-+ obj-&gt;refcnt++;
-+}
-+
-+void object_put(struct object *obj)
-+{
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&amp;cache_lock, flags);
-+ __object_put(obj);
-+ spin_unlock_irqrestore(&amp;cache_lock, flags);
-+}
-+
-+void object_get(struct object *obj)
-+{
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&amp;cache_lock, flags);
-+ __object_get(obj);
-+ spin_unlock_irqrestore(&amp;cache_lock, flags);
-+}
-+
- /* Must be holding cache_lock */
- static struct object *__cache_find(int id)
- {
-@@ -35,6 +65,7 @@
- {
- BUG_ON(!obj);
- list_del(&amp;obj-&gt;list);
-+ __object_put(obj);
- cache_num--;
- }
-
-@@ -63,6 +94,7 @@
- strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
- obj-&gt;id = id;
- obj-&gt;popularity = 0;
-+ obj-&gt;refcnt = 1; /* The cache holds a reference */
-
- spin_lock_irqsave(&amp;cache_lock, flags);
- __cache_add(obj);
-@@ -79,18 +111,15 @@
- spin_unlock_irqrestore(&amp;cache_lock, flags);
- }
-
--int cache_find(int id, char *name)
-+struct object *cache_find(int id)
- {
- struct object *obj;
-- int ret = -ENOENT;
- unsigned long flags;
-
- spin_lock_irqsave(&amp;cache_lock, flags);
- obj = __cache_find(id);
-- if (obj) {
-- ret = 0;
-- strcpy(name, obj-&gt;name);
-- }
-+ if (obj)
-+ __object_get(obj);
- spin_unlock_irqrestore(&amp;cache_lock, flags);
-- return ret;
-+ return obj;
- }
-</programlisting>
-
-<para>
-We encapsulate the reference counting in the standard 'get' and 'put'
-functions. Now we can return the object itself from
-<function>cache_find</function> which has the advantage that the user
-can now sleep holding the object (eg. to
-<function>copy_to_user</function> to name to userspace).
-</para>
-<para>
-The other point to note is that I said a reference should be held for
-every pointer to the object: thus the reference count is 1 when first
-inserted into the cache. In some versions the framework does not hold
-a reference count, but they are more complicated.
-</para>
-
- <sect2 id="examples-refcnt-atomic">
- <title>Using Atomic Operations For The Reference Count</title>
-<para>
-In practice, <type>atomic_t</type> would usually be used for
-<structfield>refcnt</structfield>. There are a number of atomic
-operations defined in
-
-<filename class="headerfile">include/asm/atomic.h</filename>: these are
-guaranteed to be seen atomically from all CPUs in the system, so no
-lock is required. In this case, it is simpler than using spinlocks,
-although for anything non-trivial using spinlocks is clearer. The
-<function>atomic_inc</function> and
-<function>atomic_dec_and_test</function> are used instead of the
-standard increment and decrement operators, and the lock is no longer
-used to protect the reference count itself.
-</para>
-
-<programlisting>
---- cache.c.refcnt 2003-12-09 15:00:35.000000000 +1100
-+++ cache.c.refcnt-atomic 2003-12-11 15:49:42.000000000 +1100
-@@ -7,7 +7,7 @@
- struct object
- {
- struct list_head list;
-- unsigned int refcnt;
-+ atomic_t refcnt;
- int id;
- char name[32];
- int popularity;
-@@ -18,33 +18,15 @@
- static unsigned int cache_num = 0;
- #define MAX_CACHE_SIZE 10
-
--static void __object_put(struct object *obj)
--{
-- if (--obj-&gt;refcnt == 0)
-- kfree(obj);
--}
--
--static void __object_get(struct object *obj)
--{
-- obj-&gt;refcnt++;
--}
--
- void object_put(struct object *obj)
- {
-- unsigned long flags;
--
-- spin_lock_irqsave(&amp;cache_lock, flags);
-- __object_put(obj);
-- spin_unlock_irqrestore(&amp;cache_lock, flags);
-+ if (atomic_dec_and_test(&amp;obj-&gt;refcnt))
-+ kfree(obj);
- }
-
- void object_get(struct object *obj)
- {
-- unsigned long flags;
--
-- spin_lock_irqsave(&amp;cache_lock, flags);
-- __object_get(obj);
-- spin_unlock_irqrestore(&amp;cache_lock, flags);
-+ atomic_inc(&amp;obj-&gt;refcnt);
- }
-
- /* Must be holding cache_lock */
-@@ -65,7 +47,7 @@
- {
- BUG_ON(!obj);
- list_del(&amp;obj-&gt;list);
-- __object_put(obj);
-+ object_put(obj);
- cache_num--;
- }
-
-@@ -94,7 +76,7 @@
- strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
- obj-&gt;id = id;
- obj-&gt;popularity = 0;
-- obj-&gt;refcnt = 1; /* The cache holds a reference */
-+ atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
-
- spin_lock_irqsave(&amp;cache_lock, flags);
- __cache_add(obj);
-@@ -119,7 +101,7 @@
- spin_lock_irqsave(&amp;cache_lock, flags);
- obj = __cache_find(id);
- if (obj)
-- __object_get(obj);
-+ object_get(obj);
- spin_unlock_irqrestore(&amp;cache_lock, flags);
- return obj;
- }
-</programlisting>
-</sect2>
-</sect1>
-
- <sect1 id="examples-lock-per-obj">
- <title>Protecting The Objects Themselves</title>
- <para>
-In these examples, we assumed that the objects (except the reference
-counts) never changed once they are created. If we wanted to allow
-the name to change, there are three possibilities:
- </para>
- <itemizedlist>
- <listitem>
- <para>
-You can make <symbol>cache_lock</symbol> non-static, and tell people
-to grab that lock before changing the name in any object.
- </para>
- </listitem>
- <listitem>
- <para>
-You can provide a <function>cache_obj_rename</function> which grabs
-this lock and changes the name for the caller, and tell everyone to
-use that function.
- </para>
- </listitem>
- <listitem>
- <para>
-You can make the <symbol>cache_lock</symbol> protect only the cache
-itself, and use another lock to protect the name.
- </para>
- </listitem>
- </itemizedlist>
-
- <para>
-Theoretically, you can make the locks as fine-grained as one lock for
-every field, for every object. In practice, the most common variants
-are:
-</para>
- <itemizedlist>
- <listitem>
- <para>
-One lock which protects the infrastructure (the <symbol>cache</symbol>
-list in this example) and all the objects. This is what we have done
-so far.
- </para>
- </listitem>
- <listitem>
- <para>
-One lock which protects the infrastructure (including the list
-pointers inside the objects), and one lock inside the object which
-protects the rest of that object.
- </para>
- </listitem>
- <listitem>
- <para>
-Multiple locks to protect the infrastructure (eg. one lock per hash
-chain), possibly with a separate per-object lock.
- </para>
- </listitem>
- </itemizedlist>
-
-<para>
-Here is the "lock-per-object" implementation:
-</para>
-<programlisting>
---- cache.c.refcnt-atomic 2003-12-11 15:50:54.000000000 +1100
-+++ cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100
-@@ -6,11 +6,17 @@
-
- struct object
- {
-+ /* These two protected by cache_lock. */
- struct list_head list;
-+ int popularity;
-+
- atomic_t refcnt;
-+
-+ /* Doesn't change once created. */
- int id;
-+
-+ spinlock_t lock; /* Protects the name */
- char name[32];
-- int popularity;
- };
-
- static DEFINE_SPINLOCK(cache_lock);
-@@ -77,6 +84,7 @@
- obj-&gt;id = id;
- obj-&gt;popularity = 0;
- atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
-+ spin_lock_init(&amp;obj-&gt;lock);
-
- spin_lock_irqsave(&amp;cache_lock, flags);
- __cache_add(obj);
-</programlisting>
-
-<para>
-Note that I decide that the <structfield>popularity</structfield>
-count should be protected by the <symbol>cache_lock</symbol> rather
-than the per-object lock: this is because it (like the
-<structname>struct list_head</structname> inside the object) is
-logically part of the infrastructure. This way, I don't need to grab
-the lock of every object in <function>__cache_add</function> when
-seeking the least popular.
-</para>
-
-<para>
-I also decided that the <structfield>id</structfield> member is
-unchangeable, so I don't need to grab each object lock in
-<function>__cache_find()</function> to examine the
-<structfield>id</structfield>: the object lock is only used by a
-caller who wants to read or write the <structfield>name</structfield>
-field.
-</para>
-
-<para>
-Note also that I added a comment describing what data was protected by
-which locks. This is extremely important, as it describes the runtime
-behavior of the code, and can be hard to gain from just reading. And
-as Alan Cox says, <quote>Lock data, not code</quote>.
-</para>
-</sect1>
-</chapter>
-
- <chapter id="common-problems">
- <title>Common Problems</title>
- <sect1 id="deadlock">
- <title>Deadlock: Simple and Advanced</title>
-
- <para>
- There is a coding bug where a piece of code tries to grab a
- spinlock twice: it will spin forever, waiting for the lock to
- be released (spinlocks, rwlocks and mutexes are not
- recursive in Linux). This is trivial to diagnose: not a
- stay-up-five-nights-talk-to-fluffy-code-bunnies kind of
- problem.
- </para>
-
- <para>
- For a slightly more complex case, imagine you have a region
- shared by a softirq and user context. If you use a
- <function>spin_lock()</function> call to protect it, it is
- possible that the user context will be interrupted by the softirq
- while it holds the lock, and the softirq will then spin
- forever trying to get the same lock.
- </para>
-
- <para>
- Both of these are called deadlock, and as shown above, it can
- occur even with a single CPU (although not on UP compiles,
- since spinlocks vanish on kernel compiles with
- <symbol>CONFIG_SMP</symbol>=n. You'll still get data corruption
- in the second example).
- </para>
-
- <para>
- This complete lockup is easy to diagnose: on SMP boxes the
- watchdog timer or compiling with <symbol>DEBUG_SPINLOCK</symbol> set
- (<filename>include/linux/spinlock.h</filename>) will show this up
- immediately when it happens.
- </para>
-
- <para>
- A more complex problem is the so-called 'deadly embrace',
- involving two or more locks. Say you have a hash table: each
- entry in the table is a spinlock, and a chain of hashed
- objects. Inside a softirq handler, you sometimes want to
- alter an object from one place in the hash to another: you
- grab the spinlock of the old hash chain and the spinlock of
- the new hash chain, and delete the object from the old one,
- and insert it in the new one.
- </para>
-
- <para>
- There are two problems here. First, if your code ever
- tries to move the object to the same chain, it will deadlock
- with itself as it tries to lock it twice. Secondly, if the
- same softirq on another CPU is trying to move another object
- in the reverse direction, the following could happen:
- </para>
-
- <table>
- <title>Consequences</title>
-
- <tgroup cols="2" align="left">
-
- <thead>
- <row>
- <entry>CPU 1</entry>
- <entry>CPU 2</entry>
- </row>
- </thead>
-
- <tbody>
- <row>
- <entry>Grab lock A -&gt; OK</entry>
- <entry>Grab lock B -&gt; OK</entry>
- </row>
- <row>
- <entry>Grab lock B -&gt; spin</entry>
- <entry>Grab lock A -&gt; spin</entry>
- </row>
- </tbody>
- </tgroup>
- </table>
-
- <para>
- The two CPUs will spin forever, waiting for the other to give up
- their lock. It will look, smell, and feel like a crash.
- </para>
- </sect1>
-
- <sect1 id="techs-deadlock-prevent">
- <title>Preventing Deadlock</title>
-
- <para>
- Textbooks will tell you that if you always lock in the same
- order, you will never get this kind of deadlock. Practice
- will tell you that this approach doesn't scale: when I
- create a new lock, I don't understand enough of the kernel
- to figure out where in the 5000 lock hierarchy it will fit.
- </para>
-
- <para>
- The best locks are encapsulated: they never get exposed in
- headers, and are never held around calls to non-trivial
- functions outside the same file. You can read through this
- code and see that it will never deadlock, because it never
- tries to grab another lock while it has that one. People
- using your code don't even need to know you are using a
- lock.
- </para>
-
- <para>
- A classic problem here is when you provide callbacks or
- hooks: if you call these with the lock held, you risk simple
- deadlock, or a deadly embrace (who knows what the callback
- will do?). Remember, the other programmers are out to get
- you, so don't do this.
- </para>
-
- <sect2 id="techs-deadlock-overprevent">
- <title>Overzealous Prevention Of Deadlocks</title>
-
- <para>
- Deadlocks are problematic, but not as bad as data
- corruption. Code which grabs a read lock, searches a list,
- fails to find what it wants, drops the read lock, grabs a
- write lock and inserts the object has a race condition.
- </para>
-
- <para>
- If you don't see why, please stay the fuck away from my code.
- </para>
- </sect2>
- </sect1>
-
- <sect1 id="racing-timers">
- <title>Racing Timers: A Kernel Pastime</title>
-
- <para>
- Timers can produce their own special problems with races.
- Consider a collection of objects (list, hash, etc) where each
- object has a timer which is due to destroy it.
- </para>
-
- <para>
- If you want to destroy the entire collection (say on module
- removal), you might do the following:
- </para>
-
- <programlisting>
- /* THIS CODE BAD BAD BAD BAD: IF IT WAS ANY WORSE IT WOULD USE
- HUNGARIAN NOTATION */
- spin_lock_bh(&amp;list_lock);
-
- while (list) {
- struct foo *next = list-&gt;next;
- del_timer(&amp;list-&gt;timer);
- kfree(list);
- list = next;
- }
-
- spin_unlock_bh(&amp;list_lock);
- </programlisting>
-
- <para>
- Sooner or later, this will crash on SMP, because a timer can
- have just gone off before the <function>spin_lock_bh()</function>,
- and it will only get the lock after we
- <function>spin_unlock_bh()</function>, and then try to free
- the element (which has already been freed!).
- </para>
-
- <para>
- This can be avoided by checking the result of
- <function>del_timer()</function>: if it returns
- <returnvalue>1</returnvalue>, the timer has been deleted.
- If <returnvalue>0</returnvalue>, it means (in this
- case) that it is currently running, so we can do:
- </para>
-
- <programlisting>
- retry:
- spin_lock_bh(&amp;list_lock);
-
- while (list) {
- struct foo *next = list-&gt;next;
- if (!del_timer(&amp;list-&gt;timer)) {
- /* Give timer a chance to delete this */
- spin_unlock_bh(&amp;list_lock);
- goto retry;
- }
- kfree(list);
- list = next;
- }
-
- spin_unlock_bh(&amp;list_lock);
- </programlisting>
-
- <para>
- Another common problem is deleting timers which restart
- themselves (by calling <function>add_timer()</function> at the end
- of their timer function). Because this is a fairly common case
- which is prone to races, you should use <function>del_timer_sync()</function>
- (<filename class="headerfile">include/linux/timer.h</filename>)
- to handle this case. It returns the number of times the timer
- had to be deleted before we finally stopped it from adding itself back
- in.
- </para>
- </sect1>
-
- </chapter>
-
- <chapter id="Efficiency">
- <title>Locking Speed</title>
-
- <para>
-There are three main things to worry about when considering speed of
-some code which does locking. First is concurrency: how many things
-are going to be waiting while someone else is holding a lock. Second
-is the time taken to actually acquire and release an uncontended lock.
-Third is using fewer, or smarter locks. I'm assuming that the lock is
-used fairly often: otherwise, you wouldn't be concerned about
-efficiency.
-</para>
- <para>
-Concurrency depends on how long the lock is usually held: you should
-hold the lock for as long as needed, but no longer. In the cache
-example, we always create the object without the lock held, and then
-grab the lock only when we are ready to insert it in the list.
-</para>
- <para>
-Acquisition times depend on how much damage the lock operations do to
-the pipeline (pipeline stalls) and how likely it is that this CPU was
-the last one to grab the lock (ie. is the lock cache-hot for this
-CPU): on a machine with more CPUs, this likelihood drops fast.
-Consider a 700MHz Intel Pentium III: an instruction takes about 0.7ns,
-an atomic increment takes about 58ns, a lock which is cache-hot on
-this CPU takes 160ns, and a cacheline transfer from another CPU takes
-an additional 170 to 360ns. (These figures from Paul McKenney's
-<ulink url="http://www.linuxjournal.com/article.php?sid=6993"> Linux
-Journal RCU article</ulink>).
-</para>
- <para>
-These two aims conflict: holding a lock for a short time might be done
-by splitting locks into parts (such as in our final per-object-lock
-example), but this increases the number of lock acquisitions, and the
-results are often slower than having a single lock. This is another
-reason to advocate locking simplicity.
-</para>
- <para>
-The third concern is addressed below: there are some methods to reduce
-the amount of locking which needs to be done.
-</para>
-
- <sect1 id="efficiency-rwlocks">
- <title>Read/Write Lock Variants</title>
-
- <para>
- Both spinlocks and mutexes have read/write variants:
- <type>rwlock_t</type> and <structname>struct rw_semaphore</structname>.
- These divide users into two classes: the readers and the writers. If
- you are only reading the data, you can get a read lock, but to write to
- the data you need the write lock. Many people can hold a read lock,
- but a writer must be sole holder.
- </para>
-
- <para>
- If your code divides neatly along reader/writer lines (as our
- cache code does), and the lock is held by readers for
- significant lengths of time, using these locks can help. They
- are slightly slower than the normal locks though, so in practice
- <type>rwlock_t</type> is not usually worthwhile.
- </para>
- </sect1>
-
- <sect1 id="efficiency-read-copy-update">
- <title>Avoiding Locks: Read Copy Update</title>
-
- <para>
- There is a special method of read/write locking called Read Copy
- Update. Using RCU, the readers can avoid taking a lock
- altogether: as we expect our cache to be read more often than
- updated (otherwise the cache is a waste of time), it is a
- candidate for this optimization.
- </para>
-
- <para>
- How do we get rid of read locks? Getting rid of read locks
- means that writers may be changing the list underneath the
- readers. That is actually quite simple: we can read a linked
- list while an element is being added if the writer adds the
- element very carefully. For example, adding
- <symbol>new</symbol> to a single linked list called
- <symbol>list</symbol>:
- </para>
-
- <programlisting>
- new-&gt;next = list-&gt;next;
- wmb();
- list-&gt;next = new;
- </programlisting>
-
- <para>
- The <function>wmb()</function> is a write memory barrier. It
- ensures that the first operation (setting the new element's
- <symbol>next</symbol> pointer) is complete and will be seen by
- all CPUs, before the second operation is (putting the new
- element into the list). This is important, since modern
- compilers and modern CPUs can both reorder instructions unless
- told otherwise: we want a reader to either not see the new
- element at all, or see the new element with the
- <symbol>next</symbol> pointer correctly pointing at the rest of
- the list.
- </para>
- <para>
- Fortunately, there is a function to do this for standard
- <structname>struct list_head</structname> lists:
- <function>list_add_rcu()</function>
- (<filename>include/linux/list.h</filename>).
- </para>
- <para>
- Removing an element from the list is even simpler: we replace
- the pointer to the old element with a pointer to its successor,
- and readers will either see it, or skip over it.
- </para>
- <programlisting>
- list-&gt;next = old-&gt;next;
- </programlisting>
- <para>
- There is <function>list_del_rcu()</function>
- (<filename>include/linux/list.h</filename>) which does this (the
- normal version poisons the old object, which we don't want).
- </para>
- <para>
- The reader must also be careful: some CPUs can look through the
- <symbol>next</symbol> pointer to start reading the contents of
- the next element early, but don't realize that the pre-fetched
- contents is wrong when the <symbol>next</symbol> pointer changes
- underneath them. Once again, there is a
- <function>list_for_each_entry_rcu()</function>
- (<filename>include/linux/list.h</filename>) to help you. Of
- course, writers can just use
- <function>list_for_each_entry()</function>, since there cannot
- be two simultaneous writers.
- </para>
- <para>
- Our final dilemma is this: when can we actually destroy the
- removed element? Remember, a reader might be stepping through
- this element in the list right now: if we free this element and
- the <symbol>next</symbol> pointer changes, the reader will jump
- off into garbage and crash. We need to wait until we know that
- all the readers who were traversing the list when we deleted the
- element are finished. We use <function>call_rcu()</function> to
- register a callback which will actually destroy the object once
- all pre-existing readers are finished. Alternatively,
- <function>synchronize_rcu()</function> may be used to block until
- all pre-existing are finished.
- </para>
- <para>
- But how does Read Copy Update know when the readers are
- finished? The method is this: firstly, the readers always
- traverse the list inside
- <function>rcu_read_lock()</function>/<function>rcu_read_unlock()</function>
- pairs: these simply disable preemption so the reader won't go to
- sleep while reading the list.
- </para>
- <para>
- RCU then waits until every other CPU has slept at least once:
- since readers cannot sleep, we know that any readers which were
- traversing the list during the deletion are finished, and the
- callback is triggered. The real Read Copy Update code is a
- little more optimized than this, but this is the fundamental
- idea.
- </para>
-
-<programlisting>
---- cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100
-+++ cache.c.rcupdate 2003-12-11 17:55:14.000000000 +1100
-@@ -1,15 +1,18 @@
- #include &lt;linux/list.h&gt;
- #include &lt;linux/slab.h&gt;
- #include &lt;linux/string.h&gt;
-+#include &lt;linux/rcupdate.h&gt;
- #include &lt;linux/mutex.h&gt;
- #include &lt;asm/errno.h&gt;
-
- struct object
- {
-- /* These two protected by cache_lock. */
-+ /* This is protected by RCU */
- struct list_head list;
- int popularity;
-
-+ struct rcu_head rcu;
-+
- atomic_t refcnt;
-
- /* Doesn't change once created. */
-@@ -40,7 +43,7 @@
- {
- struct object *i;
-
-- list_for_each_entry(i, &amp;cache, list) {
-+ list_for_each_entry_rcu(i, &amp;cache, list) {
- if (i-&gt;id == id) {
- i-&gt;popularity++;
- return i;
-@@ -49,19 +52,25 @@
- return NULL;
- }
-
-+/* Final discard done once we know no readers are looking. */
-+static void cache_delete_rcu(void *arg)
-+{
-+ object_put(arg);
-+}
-+
- /* Must be holding cache_lock */
- static void __cache_delete(struct object *obj)
- {
- BUG_ON(!obj);
-- list_del(&amp;obj-&gt;list);
-- object_put(obj);
-+ list_del_rcu(&amp;obj-&gt;list);
- cache_num--;
-+ call_rcu(&amp;obj-&gt;rcu, cache_delete_rcu);
- }
-
- /* Must be holding cache_lock */
- static void __cache_add(struct object *obj)
- {
-- list_add(&amp;obj-&gt;list, &amp;cache);
-+ list_add_rcu(&amp;obj-&gt;list, &amp;cache);
- if (++cache_num > MAX_CACHE_SIZE) {
- struct object *i, *outcast = NULL;
- list_for_each_entry(i, &amp;cache, list) {
-@@ -104,12 +114,11 @@
- struct object *cache_find(int id)
- {
- struct object *obj;
-- unsigned long flags;
-
-- spin_lock_irqsave(&amp;cache_lock, flags);
-+ rcu_read_lock();
- obj = __cache_find(id);
- if (obj)
- object_get(obj);
-- spin_unlock_irqrestore(&amp;cache_lock, flags);
-+ rcu_read_unlock();
- return obj;
- }
-</programlisting>
-
-<para>
-Note that the reader will alter the
-<structfield>popularity</structfield> member in
-<function>__cache_find()</function>, and now it doesn't hold a lock.
-One solution would be to make it an <type>atomic_t</type>, but for
-this usage, we don't really care about races: an approximate result is
-good enough, so I didn't change it.
-</para>
-
-<para>
-The result is that <function>cache_find()</function> requires no
-synchronization with any other functions, so is almost as fast on SMP
-as it would be on UP.
-</para>
-
-<para>
-There is a further optimization possible here: remember our original
-cache code, where there were no reference counts and the caller simply
-held the lock whenever using the object? This is still possible: if
-you hold the lock, no one can delete the object, so you don't need to
-get and put the reference count.
-</para>
-
-<para>
-Now, because the 'read lock' in RCU is simply disabling preemption, a
-caller which always has preemption disabled between calling
-<function>cache_find()</function> and
-<function>object_put()</function> does not need to actually get and
-put the reference count: we could expose
-<function>__cache_find()</function> by making it non-static, and
-such callers could simply call that.
-</para>
-<para>
-The benefit here is that the reference count is not written to: the
-object is not altered in any way, which is much faster on SMP
-machines due to caching.
-</para>
- </sect1>
-
- <sect1 id="per-cpu">
- <title>Per-CPU Data</title>
-
- <para>
- Another technique for avoiding locking which is used fairly
- widely is to duplicate information for each CPU. For example,
- if you wanted to keep a count of a common condition, you could
- use a spin lock and a single counter. Nice and simple.
- </para>
-
- <para>
- If that was too slow (it's usually not, but if you've got a
- really big machine to test on and can show that it is), you
- could instead use a counter for each CPU, then none of them need
- an exclusive lock. See <function>DEFINE_PER_CPU()</function>,
- <function>get_cpu_var()</function> and
- <function>put_cpu_var()</function>
- (<filename class="headerfile">include/linux/percpu.h</filename>).
- </para>
-
- <para>
- Of particular use for simple per-cpu counters is the
- <type>local_t</type> type, and the
- <function>cpu_local_inc()</function> and related functions,
- which are more efficient than simple code on some architectures
- (<filename class="headerfile">include/asm/local.h</filename>).
- </para>
-
- <para>
- Note that there is no simple, reliable way of getting an exact
- value of such a counter, without introducing more locks. This
- is not a problem for some uses.
- </para>
- </sect1>
-
- <sect1 id="mostly-hardirq">
- <title>Data Which Mostly Used By An IRQ Handler</title>
-
- <para>
- If data is always accessed from within the same IRQ handler, you
- don't need a lock at all: the kernel already guarantees that the
- irq handler will not run simultaneously on multiple CPUs.
- </para>
- <para>
- Manfred Spraul points out that you can still do this, even if
- the data is very occasionally accessed in user context or
- softirqs/tasklets. The irq handler doesn't use a lock, and
- all other accesses are done as so:
- </para>
-
-<programlisting>
- spin_lock(&amp;lock);
- disable_irq(irq);
- ...
- enable_irq(irq);
- spin_unlock(&amp;lock);
-</programlisting>
- <para>
- The <function>disable_irq()</function> prevents the irq handler
- from running (and waits for it to finish if it's currently
- running on other CPUs). The spinlock prevents any other
- accesses happening at the same time. Naturally, this is slower
- than just a <function>spin_lock_irq()</function> call, so it
- only makes sense if this type of access happens extremely
- rarely.
- </para>
- </sect1>
- </chapter>
-
- <chapter id="sleeping-things">
- <title>What Functions Are Safe To Call From Interrupts?</title>
-
- <para>
- Many functions in the kernel sleep (ie. call schedule())
- directly or indirectly: you can never call them while holding a
- spinlock, or with preemption disabled. This also means you need
- to be in user context: calling them from an interrupt is illegal.
- </para>
-
- <sect1 id="sleeping">
- <title>Some Functions Which Sleep</title>
-
- <para>
- The most common ones are listed below, but you usually have to
- read the code to find out if other calls are safe. If everyone
- else who calls it can sleep, you probably need to be able to
- sleep, too. In particular, registration and deregistration
- functions usually expect to be called from user context, and can
- sleep.
- </para>
-
- <itemizedlist>
- <listitem>
- <para>
- Accesses to
- <firstterm linkend="gloss-userspace">userspace</firstterm>:
- </para>
- <itemizedlist>
- <listitem>
- <para>
- <function>copy_from_user()</function>
- </para>
- </listitem>
- <listitem>
- <para>
- <function>copy_to_user()</function>
- </para>
- </listitem>
- <listitem>
- <para>
- <function>get_user()</function>
- </para>
- </listitem>
- <listitem>
- <para>
- <function>put_user()</function>
- </para>
- </listitem>
- </itemizedlist>
- </listitem>
-
- <listitem>
- <para>
- <function>kmalloc(GFP_KERNEL)</function>
- </para>
- </listitem>
-
- <listitem>
- <para>
- <function>mutex_lock_interruptible()</function> and
- <function>mutex_lock()</function>
- </para>
- <para>
- There is a <function>mutex_trylock()</function> which does not
- sleep. Still, it must not be used inside interrupt context since
- its implementation is not safe for that.
- <function>mutex_unlock()</function> will also never sleep.
- It cannot be used in interrupt context either since a mutex
- must be released by the same task that acquired it.
- </para>
- </listitem>
- </itemizedlist>
- </sect1>
-
- <sect1 id="dont-sleep">
- <title>Some Functions Which Don't Sleep</title>
-
- <para>
- Some functions are safe to call from any context, or holding
- almost any lock.
- </para>
-
- <itemizedlist>
- <listitem>
- <para>
- <function>printk()</function>
- </para>
- </listitem>
- <listitem>
- <para>
- <function>kfree()</function>
- </para>
- </listitem>
- <listitem>
- <para>
- <function>add_timer()</function> and <function>del_timer()</function>
- </para>
- </listitem>
- </itemizedlist>
- </sect1>
- </chapter>
-
- <chapter id="apiref-mutex">
- <title>Mutex API reference</title>
-!Iinclude/linux/mutex.h
-!Ekernel/locking/mutex.c
- </chapter>
-
- <chapter id="apiref-futex">
- <title>Futex API reference</title>
-!Ikernel/futex.c
- </chapter>
-
- <chapter id="references">
- <title>Further reading</title>
-
- <itemizedlist>
- <listitem>
- <para>
- <filename>Documentation/locking/spinlocks.txt</filename>:
- Linus Torvalds' spinlocking tutorial in the kernel sources.
- </para>
- </listitem>
-
- <listitem>
- <para>
- Unix Systems for Modern Architectures: Symmetric
- Multiprocessing and Caching for Kernel Programmers:
- </para>
-
- <para>
- Curt Schimmel's very good introduction to kernel level
- locking (not written for Linux, but nearly everything
- applies). The book is expensive, but really worth every
- penny to understand SMP locking. [ISBN: 0201633388]
- </para>
- </listitem>
- </itemizedlist>
- </chapter>
-
- <chapter id="thanks">
- <title>Thanks</title>
-
- <para>
- Thanks to Telsa Gwynne for DocBooking, neatening and adding
- style.
- </para>
-
- <para>
- Thanks to Martin Pool, Philipp Rumpf, Stephen Rothwell, Paul
- Mackerras, Ruedi Aschwanden, Alan Cox, Manfred Spraul, Tim
- Waugh, Pete Zaitcev, James Morris, Robert Love, Paul McKenney,
- John Ashby for proofreading, correcting, flaming, commenting.
- </para>
-
- <para>
- Thanks to the cabal for having no influence on this document.
- </para>
- </chapter>
-
- <glossary id="glossary">
- <title>Glossary</title>
-
- <glossentry id="gloss-preemption">
- <glossterm>preemption</glossterm>
- <glossdef>
- <para>
- Prior to 2.5, or when <symbol>CONFIG_PREEMPT</symbol> is
- unset, processes in user context inside the kernel would not
- preempt each other (ie. you had that CPU until you gave it up,
- except for interrupts). With the addition of
- <symbol>CONFIG_PREEMPT</symbol> in 2.5.4, this changed: when
- in user context, higher priority tasks can "cut in": spinlocks
- were changed to disable preemption, even on UP.
- </para>
- </glossdef>
- </glossentry>
-
- <glossentry id="gloss-bh">
- <glossterm>bh</glossterm>
- <glossdef>
- <para>
- Bottom Half: for historical reasons, functions with
- '_bh' in them often now refer to any software interrupt, e.g.
- <function>spin_lock_bh()</function> blocks any software interrupt
- on the current CPU. Bottom halves are deprecated, and will
- eventually be replaced by tasklets. Only one bottom half will be
- running at any time.
- </para>
- </glossdef>
- </glossentry>
-
- <glossentry id="gloss-hwinterrupt">
- <glossterm>Hardware Interrupt / Hardware IRQ</glossterm>
- <glossdef>
- <para>
- Hardware interrupt request. <function>in_irq()</function> returns
- <returnvalue>true</returnvalue> in a hardware interrupt handler.
- </para>
- </glossdef>
- </glossentry>
-
- <glossentry id="gloss-interruptcontext">
- <glossterm>Interrupt Context</glossterm>
- <glossdef>
- <para>
- Not user context: processing a hardware irq or software irq.
- Indicated by the <function>in_interrupt()</function> macro
- returning <returnvalue>true</returnvalue>.
- </para>
- </glossdef>
- </glossentry>
-
- <glossentry id="gloss-smp">
- <glossterm><acronym>SMP</acronym></glossterm>
- <glossdef>
- <para>
- Symmetric Multi-Processor: kernels compiled for multiple-CPU
- machines. (CONFIG_SMP=y).
- </para>
- </glossdef>
- </glossentry>
-
- <glossentry id="gloss-softirq">
- <glossterm>Software Interrupt / softirq</glossterm>
- <glossdef>
- <para>
- Software interrupt handler. <function>in_irq()</function> returns
- <returnvalue>false</returnvalue>; <function>in_softirq()</function>
- returns <returnvalue>true</returnvalue>. Tasklets and softirqs
- both fall into the category of 'software interrupts'.
- </para>
- <para>
- Strictly speaking a softirq is one of up to 32 enumerated software
- interrupts which can run on multiple CPUs at once.
- Sometimes used to refer to tasklets as
- well (ie. all software interrupts).
- </para>
- </glossdef>
- </glossentry>
-
- <glossentry id="gloss-tasklet">
- <glossterm>tasklet</glossterm>
- <glossdef>
- <para>
- A dynamically-registrable software interrupt,
- which is guaranteed to only run on one CPU at a time.
- </para>
- </glossdef>
- </glossentry>
-
- <glossentry id="gloss-timers">
- <glossterm>timer</glossterm>
- <glossdef>
- <para>
- A dynamically-registrable software interrupt, which is run at
- (or close to) a given time. When running, it is just like a
- tasklet (in fact, they are called from the TIMER_SOFTIRQ).
- </para>
- </glossdef>
- </glossentry>
-
- <glossentry id="gloss-up">
- <glossterm><acronym>UP</acronym></glossterm>
- <glossdef>
- <para>
- Uni-Processor: Non-SMP. (CONFIG_SMP=n).
- </para>
- </glossdef>
- </glossentry>
-
- <glossentry id="gloss-usercontext">
- <glossterm>User Context</glossterm>
- <glossdef>
- <para>
- The kernel executing on behalf of a particular process (ie. a
- system call or trap) or kernel thread. You can tell which
- process with the <symbol>current</symbol> macro.) Not to
- be confused with userspace. Can be interrupted by software or
- hardware interrupts.
- </para>
- </glossdef>
- </glossentry>
-
- <glossentry id="gloss-userspace">
- <glossterm>Userspace</glossterm>
- <glossdef>
- <para>
- A process executing its own code outside the kernel.
- </para>
- </glossdef>
- </glossentry>
-
- </glossary>
-</book>
-
diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl
deleted file mode 100644
index 856ac20bf367..000000000000
--- a/Documentation/DocBook/kgdb.tmpl
+++ /dev/null
@@ -1,918 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
- "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
-
-<book id="kgdbOnLinux">
- <bookinfo>
- <title>Using kgdb, kdb and the kernel debugger internals</title>
-
- <authorgroup>
- <author>
- <firstname>Jason</firstname>
- <surname>Wessel</surname>
- <affiliation>
- <address>
- <email>jason.wessel@windriver.com</email>
- </address>
- </affiliation>
- </author>
- </authorgroup>
- <copyright>
- <year>2008,2010</year>
- <holder>Wind River Systems, Inc.</holder>
- </copyright>
- <copyright>
- <year>2004-2005</year>
- <holder>MontaVista Software, Inc.</holder>
- </copyright>
- <copyright>
- <year>2004</year>
- <holder>Amit S. Kale</holder>
- </copyright>
-
- <legalnotice>
- <para>
- This file is licensed under the terms of the GNU General Public License
- version 2. This program is licensed "as is" without any warranty of any
- kind, whether express or implied.
- </para>
-
- </legalnotice>
- </bookinfo>
-
-<toc></toc>
- <chapter id="Introduction">
- <title>Introduction</title>
- <para>
- The kernel has two different debugger front ends (kdb and kgdb)
- which interface to the debug core. It is possible to use either
- of the debugger front ends and dynamically transition between them
- if you configure the kernel properly at compile and runtime.
- </para>
- <para>
- Kdb is simplistic shell-style interface which you can use on a
- system console with a keyboard or serial console. You can use it
- to inspect memory, registers, process lists, dmesg, and even set
- breakpoints to stop in a certain location. Kdb is not a source
- level debugger, although you can set breakpoints and execute some
- basic kernel run control. Kdb is mainly aimed at doing some
- analysis to aid in development or diagnosing kernel problems. You
- can access some symbols by name in kernel built-ins or in kernel
- modules if the code was built
- with <symbol>CONFIG_KALLSYMS</symbol>.
- </para>
- <para>
- Kgdb is intended to be used as a source level debugger for the
- Linux kernel. It is used along with gdb to debug a Linux kernel.
- The expectation is that gdb can be used to "break in" to the
- kernel to inspect memory, variables and look through call stack
- information similar to the way an application developer would use
- gdb to debug an application. It is possible to place breakpoints
- in kernel code and perform some limited execution stepping.
- </para>
- <para>
- Two machines are required for using kgdb. One of these machines is
- a development machine and the other is the target machine. The
- kernel to be debugged runs on the target machine. The development
- machine runs an instance of gdb against the vmlinux file which
- contains the symbols (not a boot image such as bzImage, zImage,
- uImage...). In gdb the developer specifies the connection
- parameters and connects to kgdb. The type of connection a
- developer makes with gdb depends on the availability of kgdb I/O
- modules compiled as built-ins or loadable kernel modules in the test
- machine's kernel.
- </para>
- </chapter>
- <chapter id="CompilingAKernel">
- <title>Compiling a kernel</title>
- <para>
- <itemizedlist>
- <listitem><para>In order to enable compilation of kdb, you must first enable kgdb.</para></listitem>
- <listitem><para>The kgdb test compile options are described in the kgdb test suite chapter.</para></listitem>
- </itemizedlist>
- </para>
- <sect1 id="CompileKGDB">
- <title>Kernel config options for kgdb</title>
- <para>
- To enable <symbol>CONFIG_KGDB</symbol> you should look under
- "Kernel hacking" / "Kernel debugging" and select "KGDB: kernel debugger".
- </para>
- <para>
- While it is not a hard requirement that you have symbols in your
- vmlinux file, gdb tends not to be very useful without the symbolic
- data, so you will want to turn
- on <symbol>CONFIG_DEBUG_INFO</symbol> which is called "Compile the
- kernel with debug info" in the config menu.
- </para>
- <para>
- It is advised, but not required, that you turn on the
- <symbol>CONFIG_FRAME_POINTER</symbol> kernel option which is called "Compile the
- kernel with frame pointers" in the config menu. This option
- inserts code to into the compiled executable which saves the frame
- information in registers or on the stack at different points which
- allows a debugger such as gdb to more accurately construct
- stack back traces while debugging the kernel.
- </para>
- <para>
- If the architecture that you are using supports the kernel option
- CONFIG_STRICT_KERNEL_RWX, you should consider turning it off. This
- option will prevent the use of software breakpoints because it
- marks certain regions of the kernel's memory space as read-only.
- If kgdb supports it for the architecture you are using, you can
- use hardware breakpoints if you desire to run with the
- CONFIG_STRICT_KERNEL_RWX option turned on, else you need to turn off
- this option.
- </para>
- <para>
- Next you should choose one of more I/O drivers to interconnect
- debugging host and debugged target. Early boot debugging requires
- a KGDB I/O driver that supports early debugging and the driver
- must be built into the kernel directly. Kgdb I/O driver
- configuration takes place via kernel or module parameters which
- you can learn more about in the in the section that describes the
- parameter "kgdboc".
- </para>
- <para>Here is an example set of .config symbols to enable or
- disable for kgdb:
- <itemizedlist>
- <listitem><para># CONFIG_STRICT_KERNEL_RWX is not set</para></listitem>
- <listitem><para>CONFIG_FRAME_POINTER=y</para></listitem>
- <listitem><para>CONFIG_KGDB=y</para></listitem>
- <listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem>
- </itemizedlist>
- </para>
- </sect1>
- <sect1 id="CompileKDB">
- <title>Kernel config options for kdb</title>
- <para>Kdb is quite a bit more complex than the simple gdbstub
- sitting on top of the kernel's debug core. Kdb must implement a
- shell, and also adds some helper functions in other parts of the
- kernel, responsible for printing out interesting data such as what
- you would see if you ran "lsmod", or "ps". In order to build kdb
- into the kernel you follow the same steps as you would for kgdb.
- </para>
- <para>The main config option for kdb
- is <symbol>CONFIG_KGDB_KDB</symbol> which is called "KGDB_KDB:
- include kdb frontend for kgdb" in the config menu. In theory you
- would have already also selected an I/O driver such as the
- CONFIG_KGDB_SERIAL_CONSOLE interface if you plan on using kdb on a
- serial port, when you were configuring kgdb.
- </para>
- <para>If you want to use a PS/2-style keyboard with kdb, you would
- select CONFIG_KDB_KEYBOARD which is called "KGDB_KDB: keyboard as
- input device" in the config menu. The CONFIG_KDB_KEYBOARD option
- is not used for anything in the gdb interface to kgdb. The
- CONFIG_KDB_KEYBOARD option only works with kdb.
- </para>
- <para>Here is an example set of .config symbols to enable/disable kdb:
- <itemizedlist>
- <listitem><para># CONFIG_STRICT_KERNEL_RWX is not set</para></listitem>
- <listitem><para>CONFIG_FRAME_POINTER=y</para></listitem>
- <listitem><para>CONFIG_KGDB=y</para></listitem>
- <listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem>
- <listitem><para>CONFIG_KGDB_KDB=y</para></listitem>
- <listitem><para>CONFIG_KDB_KEYBOARD=y</para></listitem>
- </itemizedlist>
- </para>
- </sect1>
- </chapter>
- <chapter id="kgdbKernelArgs">
- <title>Kernel Debugger Boot Arguments</title>
- <para>This section describes the various runtime kernel
- parameters that affect the configuration of the kernel debugger.
- The following chapter covers using kdb and kgdb as well as
- providing some examples of the configuration parameters.</para>
- <sect1 id="kgdboc">
- <title>Kernel parameter: kgdboc</title>
- <para>The kgdboc driver was originally an abbreviation meant to
- stand for "kgdb over console". Today it is the primary mechanism
- to configure how to communicate from gdb to kgdb as well as the
- devices you want to use to interact with the kdb shell.
- </para>
- <para>For kgdb/gdb, kgdboc is designed to work with a single serial
- port. It is intended to cover the circumstance where you want to
- use a serial console as your primary console as well as using it to
- perform kernel debugging. It is also possible to use kgdb on a
- serial port which is not designated as a system console. Kgdboc
- may be configured as a kernel built-in or a kernel loadable module.
- You can only make use of <constant>kgdbwait</constant> and early
- debugging if you build kgdboc into the kernel as a built-in.
- </para>
- <para>Optionally you can elect to activate kms (Kernel Mode
- Setting) integration. When you use kms with kgdboc and you have a
- video driver that has atomic mode setting hooks, it is possible to
- enter the debugger on the graphics console. When the kernel
- execution is resumed, the previous graphics mode will be restored.
- This integration can serve as a useful tool to aid in diagnosing
- crashes or doing analysis of memory with kdb while allowing the
- full graphics console applications to run.
- </para>
- <sect2 id="kgdbocArgs">
- <title>kgdboc arguments</title>
- <para>Usage: <constant>kgdboc=[kms][[,]kbd][[,]serial_device][,baud]</constant></para>
- <para>The order listed above must be observed if you use any of the
- optional configurations together.
- </para>
- <para>Abbreviations:
- <itemizedlist>
- <listitem><para>kms = Kernel Mode Setting</para></listitem>
- <listitem><para>kbd = Keyboard</para></listitem>
- </itemizedlist>
- </para>
- <para>You can configure kgdboc to use the keyboard, and/or a serial
- device depending on if you are using kdb and/or kgdb, in one of the
- following scenarios. The order listed above must be observed if
- you use any of the optional configurations together. Using kms +
- only gdb is generally not a useful combination.</para>
- <sect3 id="kgdbocArgs1">
- <title>Using loadable module or built-in</title>
- <para>
- <orderedlist>
- <listitem><para>As a kernel built-in:</para>
- <para>Use the kernel boot argument: <constant>kgdboc=&lt;tty-device&gt;,[baud]</constant></para></listitem>
- <listitem>
- <para>As a kernel loadable module:</para>
- <para>Use the command: <constant>modprobe kgdboc kgdboc=&lt;tty-device&gt;,[baud]</constant></para>
- <para>Here are two examples of how you might format the kgdboc
- string. The first is for an x86 target using the first serial port.
- The second example is for the ARM Versatile AB using the second
- serial port.
- <orderedlist>
- <listitem><para><constant>kgdboc=ttyS0,115200</constant></para></listitem>
- <listitem><para><constant>kgdboc=ttyAMA1,115200</constant></para></listitem>
- </orderedlist>
- </para>
- </listitem>
- </orderedlist></para>
- </sect3>
- <sect3 id="kgdbocArgs2">
- <title>Configure kgdboc at runtime with sysfs</title>
- <para>At run time you can enable or disable kgdboc by echoing a
- parameters into the sysfs. Here are two examples:</para>
- <orderedlist>
- <listitem><para>Enable kgdboc on ttyS0</para>
- <para><constant>echo ttyS0 &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
- <listitem><para>Disable kgdboc</para>
- <para><constant>echo "" &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
- </orderedlist>
- <para>NOTE: You do not need to specify the baud if you are
- configuring the console on tty which is already configured or
- open.</para>
- </sect3>
- <sect3 id="kgdbocArgs3">
- <title>More examples</title>
- <para>You can configure kgdboc to use the keyboard, and/or a serial device
- depending on if you are using kdb and/or kgdb, in one of the
- following scenarios.
- <orderedlist>
- <listitem><para>kdb and kgdb over only a serial port</para>
- <para><constant>kgdboc=&lt;serial_device&gt;[,baud]</constant></para>
- <para>Example: <constant>kgdboc=ttyS0,115200</constant></para>
- </listitem>
- <listitem><para>kdb and kgdb with keyboard and a serial port</para>
- <para><constant>kgdboc=kbd,&lt;serial_device&gt;[,baud]</constant></para>
- <para>Example: <constant>kgdboc=kbd,ttyS0,115200</constant></para>
- </listitem>
- <listitem><para>kdb with a keyboard</para>
- <para><constant>kgdboc=kbd</constant></para>
- </listitem>
- <listitem><para>kdb with kernel mode setting</para>
- <para><constant>kgdboc=kms,kbd</constant></para>
- </listitem>
- <listitem><para>kdb with kernel mode setting and kgdb over a serial port</para>
- <para><constant>kgdboc=kms,kbd,ttyS0,115200</constant></para>
- </listitem>
- </orderedlist>
- </para>
- <para>NOTE: Kgdboc does not support interrupting the target via the
- gdb remote protocol. You must manually send a sysrq-g unless you
- have a proxy that splits console output to a terminal program.
- A console proxy has a separate TCP port for the debugger and a separate
- TCP port for the "human" console. The proxy can take care of sending
- the sysrq-g for you.
- </para>
- <para>When using kgdboc with no debugger proxy, you can end up
- connecting the debugger at one of two entry points. If an
- exception occurs after you have loaded kgdboc, a message should
- print on the console stating it is waiting for the debugger. In
- this case you disconnect your terminal program and then connect the
- debugger in its place. If you want to interrupt the target system
- and forcibly enter a debug session you have to issue a Sysrq
- sequence and then type the letter <constant>g</constant>. Then
- you disconnect the terminal session and connect gdb. Your options
- if you don't like this are to hack gdb to send the sysrq-g for you
- as well as on the initial connect, or to use a debugger proxy that
- allows an unmodified gdb to do the debugging.
- </para>
- </sect3>
- </sect2>
- </sect1>
- <sect1 id="kgdbwait">
- <title>Kernel parameter: kgdbwait</title>
- <para>
- The Kernel command line option <constant>kgdbwait</constant> makes
- kgdb wait for a debugger connection during booting of a kernel. You
- can only use this option if you compiled a kgdb I/O driver into the
- kernel and you specified the I/O driver configuration as a kernel
- command line option. The kgdbwait parameter should always follow the
- configuration parameter for the kgdb I/O driver in the kernel
- command line else the I/O driver will not be configured prior to
- asking the kernel to use it to wait.
- </para>
- <para>
- The kernel will stop and wait as early as the I/O driver and
- architecture allows when you use this option. If you build the
- kgdb I/O driver as a loadable kernel module kgdbwait will not do
- anything.
- </para>
- </sect1>
- <sect1 id="kgdbcon">
- <title>Kernel parameter: kgdbcon</title>
- <para> The kgdbcon feature allows you to see printk() messages
- inside gdb while gdb is connected to the kernel. Kdb does not make
- use of the kgdbcon feature.
- </para>
- <para>Kgdb supports using the gdb serial protocol to send console
- messages to the debugger when the debugger is connected and running.
- There are two ways to activate this feature.
- <orderedlist>
- <listitem><para>Activate with the kernel command line option:</para>
- <para><constant>kgdbcon</constant></para>
- </listitem>
- <listitem><para>Use sysfs before configuring an I/O driver</para>
- <para>
- <constant>echo 1 &gt; /sys/module/kgdb/parameters/kgdb_use_con</constant>
- </para>
- <para>
- NOTE: If you do this after you configure the kgdb I/O driver, the
- setting will not take effect until the next point the I/O is
- reconfigured.
- </para>
- </listitem>
- </orderedlist>
- </para>
- <para>IMPORTANT NOTE: You cannot use kgdboc + kgdbcon on a tty that is an
- active system console. An example of incorrect usage is <constant>console=ttyS0,115200 kgdboc=ttyS0 kgdbcon</constant>
- </para>
- <para>It is possible to use this option with kgdboc on a tty that is not a system console.
- </para>
- </sect1>
- <sect1 id="kgdbreboot">
- <title>Run time parameter: kgdbreboot</title>
- <para> The kgdbreboot feature allows you to change how the debugger
- deals with the reboot notification. You have 3 choices for the
- behavior. The default behavior is always set to 0.</para>
- <orderedlist>
- <listitem><para>echo -1 > /sys/module/debug_core/parameters/kgdbreboot</para>
- <para>Ignore the reboot notification entirely.</para>
- </listitem>
- <listitem><para>echo 0 > /sys/module/debug_core/parameters/kgdbreboot</para>
- <para>Send the detach message to any attached debugger client.</para>
- </listitem>
- <listitem><para>echo 1 > /sys/module/debug_core/parameters/kgdbreboot</para>
- <para>Enter the debugger on reboot notify.</para>
- </listitem>
- </orderedlist>
- </sect1>
- </chapter>
- <chapter id="usingKDB">
- <title>Using kdb</title>
- <para>
- </para>
- <sect1 id="quickKDBserial">
- <title>Quick start for kdb on a serial port</title>
- <para>This is a quick example of how to use kdb.</para>
- <para><orderedlist>
- <listitem><para>Configure kgdboc at boot using kernel parameters:
- <itemizedlist>
- <listitem><para><constant>console=ttyS0,115200 kgdboc=ttyS0,115200</constant></para></listitem>
- </itemizedlist></para>
- <para>OR</para>
- <para>Configure kgdboc after the kernel has booted; assuming you are using a serial port console:
- <itemizedlist>
- <listitem><para><constant>echo ttyS0 &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
- </itemizedlist>
- </para>
- </listitem>
- <listitem><para>Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.</para>
- <itemizedlist>
- <listitem><para>When logged in as root or with a super user session you can run:</para>
- <para><constant>echo g &gt; /proc/sysrq-trigger</constant></para></listitem>
- <listitem><para>Example using minicom 2.2</para>
- <para>Press: <constant>Control-a</constant></para>
- <para>Press: <constant>f</constant></para>
- <para>Press: <constant>g</constant></para>
- </listitem>
- <listitem><para>When you have telneted to a terminal server that supports sending a remote break</para>
- <para>Press: <constant>Control-]</constant></para>
- <para>Type in:<constant>send break</constant></para>
- <para>Press: <constant>Enter</constant></para>
- <para>Press: <constant>g</constant></para>
- </listitem>
- </itemizedlist>
- </listitem>
- <listitem><para>From the kdb prompt you can run the "help" command to see a complete list of the commands that are available.</para>
- <para>Some useful commands in kdb include:
- <itemizedlist>
- <listitem><para>lsmod -- Shows where kernel modules are loaded</para></listitem>
- <listitem><para>ps -- Displays only the active processes</para></listitem>
- <listitem><para>ps A -- Shows all the processes</para></listitem>
- <listitem><para>summary -- Shows kernel version info and memory usage</para></listitem>
- <listitem><para>bt -- Get a backtrace of the current process using dump_stack()</para></listitem>
- <listitem><para>dmesg -- View the kernel syslog buffer</para></listitem>
- <listitem><para>go -- Continue the system</para></listitem>
- </itemizedlist>
- </para>
- </listitem>
- <listitem>
- <para>When you are done using kdb you need to consider rebooting the
- system or using the "go" command to resuming normal kernel
- execution. If you have paused the kernel for a lengthy period of
- time, applications that rely on timely networking or anything to do
- with real wall clock time could be adversely affected, so you
- should take this into consideration when using the kernel
- debugger.</para>
- </listitem>
- </orderedlist></para>
- </sect1>
- <sect1 id="quickKDBkeyboard">
- <title>Quick start for kdb using a keyboard connected console</title>
- <para>This is a quick example of how to use kdb with a keyboard.</para>
- <para><orderedlist>
- <listitem><para>Configure kgdboc at boot using kernel parameters:
- <itemizedlist>
- <listitem><para><constant>kgdboc=kbd</constant></para></listitem>
- </itemizedlist></para>
- <para>OR</para>
- <para>Configure kgdboc after the kernel has booted:
- <itemizedlist>
- <listitem><para><constant>echo kbd &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
- </itemizedlist>
- </para>
- </listitem>
- <listitem><para>Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.</para>
- <itemizedlist>
- <listitem><para>When logged in as root or with a super user session you can run:</para>
- <para><constant>echo g &gt; /proc/sysrq-trigger</constant></para></listitem>
- <listitem><para>Example using a laptop keyboard</para>
- <para>Press and hold down: <constant>Alt</constant></para>
- <para>Press and hold down: <constant>Fn</constant></para>
- <para>Press and release the key with the label: <constant>SysRq</constant></para>
- <para>Release: <constant>Fn</constant></para>
- <para>Press and release: <constant>g</constant></para>
- <para>Release: <constant>Alt</constant></para>
- </listitem>
- <listitem><para>Example using a PS/2 101-key keyboard</para>
- <para>Press and hold down: <constant>Alt</constant></para>
- <para>Press and release the key with the label: <constant>SysRq</constant></para>
- <para>Press and release: <constant>g</constant></para>
- <para>Release: <constant>Alt</constant></para>
- </listitem>
- </itemizedlist>
- </listitem>
- <listitem>
- <para>Now type in a kdb command such as "help", "dmesg", "bt" or "go" to continue kernel execution.</para>
- </listitem>
- </orderedlist></para>
- </sect1>
- </chapter>
- <chapter id="EnableKGDB">
- <title>Using kgdb / gdb</title>
- <para>In order to use kgdb you must activate it by passing
- configuration information to one of the kgdb I/O drivers. If you
- do not pass any configuration information kgdb will not do anything
- at all. Kgdb will only actively hook up to the kernel trap hooks
- if a kgdb I/O driver is loaded and configured. If you unconfigure
- a kgdb I/O driver, kgdb will unregister all the kernel hook points.
- </para>
- <para> All kgdb I/O drivers can be reconfigured at run time, if
- <symbol>CONFIG_SYSFS</symbol> and <symbol>CONFIG_MODULES</symbol>
- are enabled, by echo'ing a new config string to
- <constant>/sys/module/&lt;driver&gt;/parameter/&lt;option&gt;</constant>.
- The driver can be unconfigured by passing an empty string. You cannot
- change the configuration while the debugger is attached. Make sure
- to detach the debugger with the <constant>detach</constant> command
- prior to trying to unconfigure a kgdb I/O driver.
- </para>
- <sect1 id="ConnectingGDB">
- <title>Connecting with gdb to a serial port</title>
- <orderedlist>
- <listitem><para>Configure kgdboc</para>
- <para>Configure kgdboc at boot using kernel parameters:
- <itemizedlist>
- <listitem><para><constant>kgdboc=ttyS0,115200</constant></para></listitem>
- </itemizedlist></para>
- <para>OR</para>
- <para>Configure kgdboc after the kernel has booted:
- <itemizedlist>
- <listitem><para><constant>echo ttyS0 &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
- </itemizedlist></para>
- </listitem>
- <listitem>
- <para>Stop kernel execution (break into the debugger)</para>
- <para>In order to connect to gdb via kgdboc, the kernel must
- first be stopped. There are several ways to stop the kernel which
- include using kgdbwait as a boot argument, via a sysrq-g, or running
- the kernel until it takes an exception where it waits for the
- debugger to attach.
- <itemizedlist>
- <listitem><para>When logged in as root or with a super user session you can run:</para>
- <para><constant>echo g &gt; /proc/sysrq-trigger</constant></para></listitem>
- <listitem><para>Example using minicom 2.2</para>
- <para>Press: <constant>Control-a</constant></para>
- <para>Press: <constant>f</constant></para>
- <para>Press: <constant>g</constant></para>
- </listitem>
- <listitem><para>When you have telneted to a terminal server that supports sending a remote break</para>
- <para>Press: <constant>Control-]</constant></para>
- <para>Type in:<constant>send break</constant></para>
- <para>Press: <constant>Enter</constant></para>
- <para>Press: <constant>g</constant></para>
- </listitem>
- </itemizedlist>
- </para>
- </listitem>
- <listitem>
- <para>Connect from gdb</para>
- <para>
- Example (using a directly connected port):
- </para>
- <programlisting>
- % gdb ./vmlinux
- (gdb) set remotebaud 115200
- (gdb) target remote /dev/ttyS0
- </programlisting>
- <para>
- Example (kgdb to a terminal server on TCP port 2012):
- </para>
- <programlisting>
- % gdb ./vmlinux
- (gdb) target remote 192.168.2.2:2012
- </programlisting>
- <para>
- Once connected, you can debug a kernel the way you would debug an
- application program.
- </para>
- <para>
- If you are having problems connecting or something is going
- seriously wrong while debugging, it will most often be the case
- that you want to enable gdb to be verbose about its target
- communications. You do this prior to issuing the <constant>target
- remote</constant> command by typing in: <constant>set debug remote 1</constant>
- </para>
- </listitem>
- </orderedlist>
- <para>Remember if you continue in gdb, and need to "break in" again,
- you need to issue an other sysrq-g. It is easy to create a simple
- entry point by putting a breakpoint at <constant>sys_sync</constant>
- and then you can run "sync" from a shell or script to break into the
- debugger.</para>
- </sect1>
- </chapter>
- <chapter id="switchKdbKgdb">
- <title>kgdb and kdb interoperability</title>
- <para>It is possible to transition between kdb and kgdb dynamically.
- The debug core will remember which you used the last time and
- automatically start in the same mode.</para>
- <sect1>
- <title>Switching between kdb and kgdb</title>
- <sect2>
- <title>Switching from kgdb to kdb</title>
- <para>
- There are two ways to switch from kgdb to kdb: you can use gdb to
- issue a maintenance packet, or you can blindly type the command $3#33.
- Whenever the kernel debugger stops in kgdb mode it will print the
- message <constant>KGDB or $3#33 for KDB</constant>. It is important
- to note that you have to type the sequence correctly in one pass.
- You cannot type a backspace or delete because kgdb will interpret
- that as part of the debug stream.
- <orderedlist>
- <listitem><para>Change from kgdb to kdb by blindly typing:</para>
- <para><constant>$3#33</constant></para></listitem>
- <listitem><para>Change from kgdb to kdb with gdb</para>
- <para><constant>maintenance packet 3</constant></para>
- <para>NOTE: Now you must kill gdb. Typically you press control-z and
- issue the command: kill -9 %</para></listitem>
- </orderedlist>
- </para>
- </sect2>
- <sect2>
- <title>Change from kdb to kgdb</title>
- <para>There are two ways you can change from kdb to kgdb. You can
- manually enter kgdb mode by issuing the kgdb command from the kdb
- shell prompt, or you can connect gdb while the kdb shell prompt is
- active. The kdb shell looks for the typical first commands that gdb
- would issue with the gdb remote protocol and if it sees one of those
- commands it automatically changes into kgdb mode.</para>
- <orderedlist>
- <listitem><para>From kdb issue the command:</para>
- <para><constant>kgdb</constant></para>
- <para>Now disconnect your terminal program and connect gdb in its place</para></listitem>
- <listitem><para>At the kdb prompt, disconnect the terminal program and connect gdb in its place.</para></listitem>
- </orderedlist>
- </sect2>
- </sect1>
- <sect1>
- <title>Running kdb commands from gdb</title>
- <para>It is possible to run a limited set of kdb commands from gdb,
- using the gdb monitor command. You don't want to execute any of the
- run control or breakpoint operations, because it can disrupt the
- state of the kernel debugger. You should be using gdb for
- breakpoints and run control operations if you have gdb connected.
- The more useful commands to run are things like lsmod, dmesg, ps or
- possibly some of the memory information commands. To see all the kdb
- commands you can run <constant>monitor help</constant>.</para>
- <para>Example:
- <informalexample><programlisting>
-(gdb) monitor ps
-1 idle process (state I) and
-27 sleeping system daemon (state M) processes suppressed,
-use 'ps A' to see all.
-Task Addr Pid Parent [*] cpu State Thread Command
-
-0xc78291d0 1 0 0 0 S 0xc7829404 init
-0xc7954150 942 1 0 0 S 0xc7954384 dropbear
-0xc78789c0 944 1 0 0 S 0xc7878bf4 sh
-(gdb)
- </programlisting></informalexample>
- </para>
- </sect1>
- </chapter>
- <chapter id="KGDBTestSuite">
- <title>kgdb Test Suite</title>
- <para>
- When kgdb is enabled in the kernel config you can also elect to
- enable the config parameter KGDB_TESTS. Turning this on will
- enable a special kgdb I/O module which is designed to test the
- kgdb internal functions.
- </para>
- <para>
- The kgdb tests are mainly intended for developers to test the kgdb
- internals as well as a tool for developing a new kgdb architecture
- specific implementation. These tests are not really for end users
- of the Linux kernel. The primary source of documentation would be
- to look in the drivers/misc/kgdbts.c file.
- </para>
- <para>
- The kgdb test suite can also be configured at compile time to run
- the core set of tests by setting the kernel config parameter
- KGDB_TESTS_ON_BOOT. This particular option is aimed at automated
- regression testing and does not require modifying the kernel boot
- config arguments. If this is turned on, the kgdb test suite can
- be disabled by specifying "kgdbts=" as a kernel boot argument.
- </para>
- </chapter>
- <chapter id="CommonBackEndReq">
- <title>Kernel Debugger Internals</title>
- <sect1 id="kgdbArchitecture">
- <title>Architecture Specifics</title>
- <para>
- The kernel debugger is organized into a number of components:
- <orderedlist>
- <listitem><para>The debug core</para>
- <para>
- The debug core is found in kernel/debugger/debug_core.c. It contains:
- <itemizedlist>
- <listitem><para>A generic OS exception handler which includes
- sync'ing the processors into a stopped state on an multi-CPU
- system.</para></listitem>
- <listitem><para>The API to talk to the kgdb I/O drivers</para></listitem>
- <listitem><para>The API to make calls to the arch-specific kgdb implementation</para></listitem>
- <listitem><para>The logic to perform safe memory reads and writes to memory while using the debugger</para></listitem>
- <listitem><para>A full implementation for software breakpoints unless overridden by the arch</para></listitem>
- <listitem><para>The API to invoke either the kdb or kgdb frontend to the debug core.</para></listitem>
- <listitem><para>The structures and callback API for atomic kernel mode setting.</para>
- <para>NOTE: kgdboc is where the kms callbacks are invoked.</para></listitem>
- </itemizedlist>
- </para>
- </listitem>
- <listitem><para>kgdb arch-specific implementation</para>
- <para>
- This implementation is generally found in arch/*/kernel/kgdb.c.
- As an example, arch/x86/kernel/kgdb.c contains the specifics to
- implement HW breakpoint as well as the initialization to
- dynamically register and unregister for the trap handlers on
- this architecture. The arch-specific portion implements:
- <itemizedlist>
- <listitem><para>contains an arch-specific trap catcher which
- invokes kgdb_handle_exception() to start kgdb about doing its
- work</para></listitem>
- <listitem><para>translation to and from gdb specific packet format to pt_regs</para></listitem>
- <listitem><para>Registration and unregistration of architecture specific trap hooks</para></listitem>
- <listitem><para>Any special exception handling and cleanup</para></listitem>
- <listitem><para>NMI exception handling and cleanup</para></listitem>
- <listitem><para>(optional) HW breakpoints</para></listitem>
- </itemizedlist>
- </para>
- </listitem>
- <listitem><para>gdbstub frontend (aka kgdb)</para>
- <para>The gdbstub is located in kernel/debug/gdbstub.c. It contains:</para>
- <itemizedlist>
- <listitem><para>All the logic to implement the gdb serial protocol</para></listitem>
- </itemizedlist>
- </listitem>
- <listitem><para>kdb frontend</para>
- <para>The kdb debugger shell is broken down into a number of
- components. The kdb core is located in kernel/debug/kdb. There
- are a number of helper functions in some of the other kernel
- components to make it possible for kdb to examine and report
- information about the kernel without taking locks that could
- cause a kernel deadlock. The kdb core contains implements the following functionality.</para>
- <itemizedlist>
- <listitem><para>A simple shell</para></listitem>
- <listitem><para>The kdb core command set</para></listitem>
- <listitem><para>A registration API to register additional kdb shell commands.</para>
- <itemizedlist>
- <listitem><para>A good example of a self-contained kdb module
- is the "ftdump" command for dumping the ftrace buffer. See:
- kernel/trace/trace_kdb.c</para></listitem>
- <listitem><para>For an example of how to dynamically register
- a new kdb command you can build the kdb_hello.ko kernel module
- from samples/kdb/kdb_hello.c. To build this example you can
- set CONFIG_SAMPLES=y and CONFIG_SAMPLE_KDB=m in your kernel
- config. Later run "modprobe kdb_hello" and the next time you
- enter the kdb shell, you can run the "hello"
- command.</para></listitem>
- </itemizedlist></listitem>
- <listitem><para>The implementation for kdb_printf() which
- emits messages directly to I/O drivers, bypassing the kernel
- log.</para></listitem>
- <listitem><para>SW / HW breakpoint management for the kdb shell</para></listitem>
- </itemizedlist>
- </listitem>
- <listitem><para>kgdb I/O driver</para>
- <para>
- Each kgdb I/O driver has to provide an implementation for the following:
- <itemizedlist>
- <listitem><para>configuration via built-in or module</para></listitem>
- <listitem><para>dynamic configuration and kgdb hook registration calls</para></listitem>
- <listitem><para>read and write character interface</para></listitem>
- <listitem><para>A cleanup handler for unconfiguring from the kgdb core</para></listitem>
- <listitem><para>(optional) Early debug methodology</para></listitem>
- </itemizedlist>
- Any given kgdb I/O driver has to operate very closely with the
- hardware and must do it in such a way that does not enable
- interrupts or change other parts of the system context without
- completely restoring them. The kgdb core will repeatedly "poll"
- a kgdb I/O driver for characters when it needs input. The I/O
- driver is expected to return immediately if there is no data
- available. Doing so allows for the future possibility to touch
- watchdog hardware in such a way as to have a target system not
- reset when these are enabled.
- </para>
- </listitem>
- </orderedlist>
- </para>
- <para>
- If you are intent on adding kgdb architecture specific support
- for a new architecture, the architecture should define
- <constant>HAVE_ARCH_KGDB</constant> in the architecture specific
- Kconfig file. This will enable kgdb for the architecture, and
- at that point you must create an architecture specific kgdb
- implementation.
- </para>
- <para>
- There are a few flags which must be set on every architecture in
- their &lt;asm/kgdb.h&gt; file. These are:
- <itemizedlist>
- <listitem>
- <para>
- NUMREGBYTES: The size in bytes of all of the registers, so
- that we can ensure they will all fit into a packet.
- </para>
- </listitem>
- <listitem>
- <para>
- BUFMAX: The size in bytes of the buffer GDB will read into.
- This must be larger than NUMREGBYTES.
- </para>
- </listitem>
- <listitem>
- <para>
- CACHE_FLUSH_IS_SAFE: Set to 1 if it is always safe to call
- flush_cache_range or flush_icache_range. On some architectures,
- these functions may not be safe to call on SMP since we keep other
- CPUs in a holding pattern.
- </para>
- </listitem>
- </itemizedlist>
- </para>
- <para>
- There are also the following functions for the common backend,
- found in kernel/kgdb.c, that must be supplied by the
- architecture-specific backend unless marked as (optional), in
- which case a default function maybe used if the architecture
- does not need to provide a specific implementation.
- </para>
-!Iinclude/linux/kgdb.h
- </sect1>
- <sect1 id="kgdbocDesign">
- <title>kgdboc internals</title>
- <sect2>
- <title>kgdboc and uarts</title>
- <para>
- The kgdboc driver is actually a very thin driver that relies on the
- underlying low level to the hardware driver having "polling hooks"
- to which the tty driver is attached. In the initial
- implementation of kgdboc the serial_core was changed to expose a
- low level UART hook for doing polled mode reading and writing of a
- single character while in an atomic context. When kgdb makes an I/O
- request to the debugger, kgdboc invokes a callback in the serial
- core which in turn uses the callback in the UART driver.</para>
- <para>
- When using kgdboc with a UART, the UART driver must implement two callbacks in the <constant>struct uart_ops</constant>. Example from drivers/8250.c:<programlisting>
-#ifdef CONFIG_CONSOLE_POLL
- .poll_get_char = serial8250_get_poll_char,
- .poll_put_char = serial8250_put_poll_char,
-#endif
- </programlisting>
- Any implementation specifics around creating a polling driver use the
- <constant>#ifdef CONFIG_CONSOLE_POLL</constant>, as shown above.
- Keep in mind that polling hooks have to be implemented in such a way
- that they can be called from an atomic context and have to restore
- the state of the UART chip on return such that the system can return
- to normal when the debugger detaches. You need to be very careful
- with any kind of lock you consider, because failing here is most likely
- going to mean pressing the reset button.
- </para>
- </sect2>
- <sect2 id="kgdbocKbd">
- <title>kgdboc and keyboards</title>
- <para>The kgdboc driver contains logic to configure communications
- with an attached keyboard. The keyboard infrastructure is only
- compiled into the kernel when CONFIG_KDB_KEYBOARD=y is set in the
- kernel configuration.</para>
- <para>The core polled keyboard driver driver for PS/2 type keyboards
- is in drivers/char/kdb_keyboard.c. This driver is hooked into the
- debug core when kgdboc populates the callback in the array
- called <constant>kdb_poll_funcs[]</constant>. The
- kdb_get_kbd_char() is the top-level function which polls hardware
- for single character input.
- </para>
- </sect2>
- <sect2 id="kgdbocKms">
- <title>kgdboc and kms</title>
- <para>The kgdboc driver contains logic to request the graphics
- display to switch to a text context when you are using
- "kgdboc=kms,kbd", provided that you have a video driver which has a
- frame buffer console and atomic kernel mode setting support.</para>
- <para>
- Every time the kernel
- debugger is entered it calls kgdboc_pre_exp_handler() which in turn
- calls con_debug_enter() in the virtual console layer. On resuming kernel
- execution, the kernel debugger calls kgdboc_post_exp_handler() which
- in turn calls con_debug_leave().</para>
- <para>Any video driver that wants to be compatible with the kernel
- debugger and the atomic kms callbacks must implement the
- mode_set_base_atomic, fb_debug_enter and fb_debug_leave operations.
- For the fb_debug_enter and fb_debug_leave the option exists to use
- the generic drm fb helper functions or implement something custom for
- the hardware. The following example shows the initialization of the
- .mode_set_base_atomic operation in
- drivers/gpu/drm/i915/intel_display.c:
- <informalexample>
- <programlisting>
-static const struct drm_crtc_helper_funcs intel_helper_funcs = {
-[...]
- .mode_set_base_atomic = intel_pipe_set_base_atomic,
-[...]
-};
- </programlisting>
- </informalexample>
- </para>
- <para>Here is an example of how the i915 driver initializes the fb_debug_enter and fb_debug_leave functions to use the generic drm helpers in
- drivers/gpu/drm/i915/intel_fb.c:
- <informalexample>
- <programlisting>
-static struct fb_ops intelfb_ops = {
-[...]
- .fb_debug_enter = drm_fb_helper_debug_enter,
- .fb_debug_leave = drm_fb_helper_debug_leave,
-[...]
-};
- </programlisting>
- </informalexample>
- </para>
- </sect2>
- </sect1>
- </chapter>
- <chapter id="credits">
- <title>Credits</title>
- <para>
- The following people have contributed to this document:
- <orderedlist>
- <listitem><para>Amit Kale<email>amitkale@linsyssoft.com</email></para></listitem>
- <listitem><para>Tom Rini<email>trini@kernel.crashing.org</email></para></listitem>
- </orderedlist>
- In March 2008 this document was completely rewritten by:
- <itemizedlist>
- <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem>
- </itemizedlist>
- In Jan 2010 this document was updated to include kdb.
- <itemizedlist>
- <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem>
- </itemizedlist>
- </para>
- </chapter>
-</book>
-
diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl
deleted file mode 100644
index 0320910b866d..000000000000
--- a/Documentation/DocBook/libata.tmpl
+++ /dev/null
@@ -1,1625 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
- "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
-
-<book id="libataDevGuide">
- <bookinfo>
- <title>libATA Developer's Guide</title>
-
- <authorgroup>
- <author>
- <firstname>Jeff</firstname>
- <surname>Garzik</surname>
- </author>
- </authorgroup>
-
- <copyright>
- <year>2003-2006</year>
- <holder>Jeff Garzik</holder>
- </copyright>
-
- <legalnotice>
- <para>
- The contents of this file are subject to the Open
- Software License version 1.1 that can be found at
- <ulink url="http://fedoraproject.org/wiki/Licensing:OSL1.1">http://fedoraproject.org/wiki/Licensing:OSL1.1</ulink>
- and is included herein by reference.
- </para>
-
- <para>
- Alternatively, the contents of this file may be used under the terms
- of the GNU General Public License version 2 (the "GPL") as distributed
- in the kernel source COPYING file, in which case the provisions of
- the GPL are applicable instead of the above. If you wish to allow
- the use of your version of this file only under the terms of the
- GPL and not to allow others to use your version of this file under
- the OSL, indicate your decision by deleting the provisions above and
- replace them with the notice and other provisions required by the GPL.
- If you do not delete the provisions above, a recipient may use your
- version of this file under either the OSL or the GPL.
- </para>
-
- </legalnotice>
- </bookinfo>
-
-<toc></toc>
-
- <chapter id="libataIntroduction">
- <title>Introduction</title>
- <para>
- libATA is a library used inside the Linux kernel to support ATA host
- controllers and devices. libATA provides an ATA driver API, class
- transports for ATA and ATAPI devices, and SCSI&lt;-&gt;ATA translation
- for ATA devices according to the T10 SAT specification.
- </para>
- <para>
- This Guide documents the libATA driver API, library functions, library
- internals, and a couple sample ATA low-level drivers.
- </para>
- </chapter>
-
- <chapter id="libataDriverApi">
- <title>libata Driver API</title>
- <para>
- struct ata_port_operations is defined for every low-level libata
- hardware driver, and it controls how the low-level driver
- interfaces with the ATA and SCSI layers.
- </para>
- <para>
- FIS-based drivers will hook into the system with ->qc_prep() and
- ->qc_issue() high-level hooks. Hardware which behaves in a manner
- similar to PCI IDE hardware may utilize several generic helpers,
- defining at a bare minimum the bus I/O addresses of the ATA shadow
- register blocks.
- </para>
- <sect1>
- <title>struct ata_port_operations</title>
-
- <sect2><title>Disable ATA port</title>
- <programlisting>
-void (*port_disable) (struct ata_port *);
- </programlisting>
-
- <para>
- Called from ata_bus_probe() error path, as well as when
- unregistering from the SCSI module (rmmod, hot unplug).
- This function should do whatever needs to be done to take the
- port out of use. In most cases, ata_port_disable() can be used
- as this hook.
- </para>
- <para>
- Called from ata_bus_probe() on a failed probe.
- Called from ata_scsi_release().
- </para>
-
- </sect2>
-
- <sect2><title>Post-IDENTIFY device configuration</title>
- <programlisting>
-void (*dev_config) (struct ata_port *, struct ata_device *);
- </programlisting>
-
- <para>
- Called after IDENTIFY [PACKET] DEVICE is issued to each device
- found. Typically used to apply device-specific fixups prior to
- issue of SET FEATURES - XFER MODE, and prior to operation.
- </para>
- <para>
- This entry may be specified as NULL in ata_port_operations.
- </para>
-
- </sect2>
-
- <sect2><title>Set PIO/DMA mode</title>
- <programlisting>
-void (*set_piomode) (struct ata_port *, struct ata_device *);
-void (*set_dmamode) (struct ata_port *, struct ata_device *);
-void (*post_set_mode) (struct ata_port *);
-unsigned int (*mode_filter) (struct ata_port *, struct ata_device *, unsigned int);
- </programlisting>
-
- <para>
- Hooks called prior to the issue of SET FEATURES - XFER MODE
- command. The optional ->mode_filter() hook is called when libata
- has built a mask of the possible modes. This is passed to the
- ->mode_filter() function which should return a mask of valid modes
- after filtering those unsuitable due to hardware limits. It is not
- valid to use this interface to add modes.
- </para>
- <para>
- dev->pio_mode and dev->dma_mode are guaranteed to be valid when
- ->set_piomode() and when ->set_dmamode() is called. The timings for
- any other drive sharing the cable will also be valid at this point.
- That is the library records the decisions for the modes of each
- drive on a channel before it attempts to set any of them.
- </para>
- <para>
- ->post_set_mode() is
- called unconditionally, after the SET FEATURES - XFER MODE
- command completes successfully.
- </para>
-
- <para>
- ->set_piomode() is always called (if present), but
- ->set_dma_mode() is only called if DMA is possible.
- </para>
-
- </sect2>
-
- <sect2><title>Taskfile read/write</title>
- <programlisting>
-void (*sff_tf_load) (struct ata_port *ap, struct ata_taskfile *tf);
-void (*sff_tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
- </programlisting>
-
- <para>
- ->tf_load() is called to load the given taskfile into hardware
- registers / DMA buffers. ->tf_read() is called to read the
- hardware registers / DMA buffers, to obtain the current set of
- taskfile register values.
- Most drivers for taskfile-based hardware (PIO or MMIO) use
- ata_sff_tf_load() and ata_sff_tf_read() for these hooks.
- </para>
-
- </sect2>
-
- <sect2><title>PIO data read/write</title>
- <programlisting>
-void (*sff_data_xfer) (struct ata_device *, unsigned char *, unsigned int, int);
- </programlisting>
-
- <para>
-All bmdma-style drivers must implement this hook. This is the low-level
-operation that actually copies the data bytes during a PIO data
-transfer.
-Typically the driver will choose one of ata_sff_data_xfer_noirq(),
-ata_sff_data_xfer(), or ata_sff_data_xfer32().
- </para>
-
- </sect2>
-
- <sect2><title>ATA command execute</title>
- <programlisting>
-void (*sff_exec_command)(struct ata_port *ap, struct ata_taskfile *tf);
- </programlisting>
-
- <para>
- causes an ATA command, previously loaded with
- ->tf_load(), to be initiated in hardware.
- Most drivers for taskfile-based hardware use ata_sff_exec_command()
- for this hook.
- </para>
-
- </sect2>
-
- <sect2><title>Per-cmd ATAPI DMA capabilities filter</title>
- <programlisting>
-int (*check_atapi_dma) (struct ata_queued_cmd *qc);
- </programlisting>
-
- <para>
-Allow low-level driver to filter ATA PACKET commands, returning a status
-indicating whether or not it is OK to use DMA for the supplied PACKET
-command.
- </para>
- <para>
- This hook may be specified as NULL, in which case libata will
- assume that atapi dma can be supported.
- </para>
-
- </sect2>
-
- <sect2><title>Read specific ATA shadow registers</title>
- <programlisting>
-u8 (*sff_check_status)(struct ata_port *ap);
-u8 (*sff_check_altstatus)(struct ata_port *ap);
- </programlisting>
-
- <para>
- Reads the Status/AltStatus ATA shadow register from
- hardware. On some hardware, reading the Status register has
- the side effect of clearing the interrupt condition.
- Most drivers for taskfile-based hardware use
- ata_sff_check_status() for this hook.
- </para>
-
- </sect2>
-
- <sect2><title>Write specific ATA shadow register</title>
- <programlisting>
-void (*sff_set_devctl)(struct ata_port *ap, u8 ctl);
- </programlisting>
-
- <para>
- Write the device control ATA shadow register to the hardware.
- Most drivers don't need to define this.
- </para>
-
- </sect2>
-
- <sect2><title>Select ATA device on bus</title>
- <programlisting>
-void (*sff_dev_select)(struct ata_port *ap, unsigned int device);
- </programlisting>
-
- <para>
- Issues the low-level hardware command(s) that causes one of N
- hardware devices to be considered 'selected' (active and
- available for use) on the ATA bus. This generally has no
- meaning on FIS-based devices.
- </para>
- <para>
- Most drivers for taskfile-based hardware use
- ata_sff_dev_select() for this hook.
- </para>
-
- </sect2>
-
- <sect2><title>Private tuning method</title>
- <programlisting>
-void (*set_mode) (struct ata_port *ap);
- </programlisting>
-
- <para>
- By default libata performs drive and controller tuning in
- accordance with the ATA timing rules and also applies blacklists
- and cable limits. Some controllers need special handling and have
- custom tuning rules, typically raid controllers that use ATA
- commands but do not actually do drive timing.
- </para>
-
- <warning>
- <para>
- This hook should not be used to replace the standard controller
- tuning logic when a controller has quirks. Replacing the default
- tuning logic in that case would bypass handling for drive and
- bridge quirks that may be important to data reliability. If a
- controller needs to filter the mode selection it should use the
- mode_filter hook instead.
- </para>
- </warning>
-
- </sect2>
-
- <sect2><title>Control PCI IDE BMDMA engine</title>
- <programlisting>
-void (*bmdma_setup) (struct ata_queued_cmd *qc);
-void (*bmdma_start) (struct ata_queued_cmd *qc);
-void (*bmdma_stop) (struct ata_port *ap);
-u8 (*bmdma_status) (struct ata_port *ap);
- </programlisting>
-
- <para>
-When setting up an IDE BMDMA transaction, these hooks arm
-(->bmdma_setup), fire (->bmdma_start), and halt (->bmdma_stop)
-the hardware's DMA engine. ->bmdma_status is used to read the standard
-PCI IDE DMA Status register.
- </para>
-
- <para>
-These hooks are typically either no-ops, or simply not implemented, in
-FIS-based drivers.
- </para>
- <para>
-Most legacy IDE drivers use ata_bmdma_setup() for the bmdma_setup()
-hook. ata_bmdma_setup() will write the pointer to the PRD table to
-the IDE PRD Table Address register, enable DMA in the DMA Command
-register, and call exec_command() to begin the transfer.
- </para>
- <para>
-Most legacy IDE drivers use ata_bmdma_start() for the bmdma_start()
-hook. ata_bmdma_start() will write the ATA_DMA_START flag to the DMA
-Command register.
- </para>
- <para>
-Many legacy IDE drivers use ata_bmdma_stop() for the bmdma_stop()
-hook. ata_bmdma_stop() clears the ATA_DMA_START flag in the DMA
-command register.
- </para>
- <para>
-Many legacy IDE drivers use ata_bmdma_status() as the bmdma_status() hook.
- </para>
-
- </sect2>
-
- <sect2><title>High-level taskfile hooks</title>
- <programlisting>
-void (*qc_prep) (struct ata_queued_cmd *qc);
-int (*qc_issue) (struct ata_queued_cmd *qc);
- </programlisting>
-
- <para>
- Higher-level hooks, these two hooks can potentially supercede
- several of the above taskfile/DMA engine hooks. ->qc_prep is
- called after the buffers have been DMA-mapped, and is typically
- used to populate the hardware's DMA scatter-gather table.
- Most drivers use the standard ata_qc_prep() helper function, but
- more advanced drivers roll their own.
- </para>
- <para>
- ->qc_issue is used to make a command active, once the hardware
- and S/G tables have been prepared. IDE BMDMA drivers use the
- helper function ata_qc_issue_prot() for taskfile protocol-based
- dispatch. More advanced drivers implement their own ->qc_issue.
- </para>
- <para>
- ata_qc_issue_prot() calls ->tf_load(), ->bmdma_setup(), and
- ->bmdma_start() as necessary to initiate a transfer.
- </para>
-
- </sect2>
-
- <sect2><title>Exception and probe handling (EH)</title>
- <programlisting>
-void (*eng_timeout) (struct ata_port *ap);
-void (*phy_reset) (struct ata_port *ap);
- </programlisting>
-
- <para>
-Deprecated. Use ->error_handler() instead.
- </para>
-
- <programlisting>
-void (*freeze) (struct ata_port *ap);
-void (*thaw) (struct ata_port *ap);
- </programlisting>
-
- <para>
-ata_port_freeze() is called when HSM violations or some other
-condition disrupts normal operation of the port. A frozen port
-is not allowed to perform any operation until the port is
-thawed, which usually follows a successful reset.
- </para>
-
- <para>
-The optional ->freeze() callback can be used for freezing the port
-hardware-wise (e.g. mask interrupt and stop DMA engine). If a
-port cannot be frozen hardware-wise, the interrupt handler
-must ack and clear interrupts unconditionally while the port
-is frozen.
- </para>
- <para>
-The optional ->thaw() callback is called to perform the opposite of ->freeze():
-prepare the port for normal operation once again. Unmask interrupts,
-start DMA engine, etc.
- </para>
-
- <programlisting>
-void (*error_handler) (struct ata_port *ap);
- </programlisting>
-
- <para>
-->error_handler() is a driver's hook into probe, hotplug, and recovery
-and other exceptional conditions. The primary responsibility of an
-implementation is to call ata_do_eh() or ata_bmdma_drive_eh() with a set
-of EH hooks as arguments:
- </para>
-
- <para>
-'prereset' hook (may be NULL) is called during an EH reset, before any other actions
-are taken.
- </para>
-
- <para>
-'postreset' hook (may be NULL) is called after the EH reset is performed. Based on
-existing conditions, severity of the problem, and hardware capabilities,
- </para>
-
- <para>
-Either 'softreset' (may be NULL) or 'hardreset' (may be NULL) will be
-called to perform the low-level EH reset.
- </para>
-
- <programlisting>
-void (*post_internal_cmd) (struct ata_queued_cmd *qc);
- </programlisting>
-
- <para>
-Perform any hardware-specific actions necessary to finish processing
-after executing a probe-time or EH-time command via ata_exec_internal().
- </para>
-
- </sect2>
-
- <sect2><title>Hardware interrupt handling</title>
- <programlisting>
-irqreturn_t (*irq_handler)(int, void *, struct pt_regs *);
-void (*irq_clear) (struct ata_port *);
- </programlisting>
-
- <para>
- ->irq_handler is the interrupt handling routine registered with
- the system, by libata. ->irq_clear is called during probe just
- before the interrupt handler is registered, to be sure hardware
- is quiet.
- </para>
- <para>
- The second argument, dev_instance, should be cast to a pointer
- to struct ata_host_set.
- </para>
- <para>
- Most legacy IDE drivers use ata_sff_interrupt() for the
- irq_handler hook, which scans all ports in the host_set,
- determines which queued command was active (if any), and calls
- ata_sff_host_intr(ap,qc).
- </para>
- <para>
- Most legacy IDE drivers use ata_sff_irq_clear() for the
- irq_clear() hook, which simply clears the interrupt and error
- flags in the DMA status register.
- </para>
-
- </sect2>
-
- <sect2><title>SATA phy read/write</title>
- <programlisting>
-int (*scr_read) (struct ata_port *ap, unsigned int sc_reg,
- u32 *val);
-int (*scr_write) (struct ata_port *ap, unsigned int sc_reg,
- u32 val);
- </programlisting>
-
- <para>
- Read and write standard SATA phy registers. Currently only used
- if ->phy_reset hook called the sata_phy_reset() helper function.
- sc_reg is one of SCR_STATUS, SCR_CONTROL, SCR_ERROR, or SCR_ACTIVE.
- </para>
-
- </sect2>
-
- <sect2><title>Init and shutdown</title>
- <programlisting>
-int (*port_start) (struct ata_port *ap);
-void (*port_stop) (struct ata_port *ap);
-void (*host_stop) (struct ata_host_set *host_set);
- </programlisting>
-
- <para>
- ->port_start() is called just after the data structures for each
- port are initialized. Typically this is used to alloc per-port
- DMA buffers / tables / rings, enable DMA engines, and similar
- tasks. Some drivers also use this entry point as a chance to
- allocate driver-private memory for ap->private_data.
- </para>
- <para>
- Many drivers use ata_port_start() as this hook or call
- it from their own port_start() hooks. ata_port_start()
- allocates space for a legacy IDE PRD table and returns.
- </para>
- <para>
- ->port_stop() is called after ->host_stop(). Its sole function
- is to release DMA/memory resources, now that they are no longer
- actively being used. Many drivers also free driver-private
- data from port at this time.
- </para>
- <para>
- ->host_stop() is called after all ->port_stop() calls
-have completed. The hook must finalize hardware shutdown, release DMA
-and other resources, etc.
- This hook may be specified as NULL, in which case it is not called.
- </para>
-
- </sect2>
-
- </sect1>
- </chapter>
-
- <chapter id="libataEH">
- <title>Error handling</title>
-
- <para>
- This chapter describes how errors are handled under libata.
- Readers are advised to read SCSI EH
- (Documentation/scsi/scsi_eh.txt) and ATA exceptions doc first.
- </para>
-
- <sect1><title>Origins of commands</title>
- <para>
- In libata, a command is represented with struct ata_queued_cmd
- or qc. qc's are preallocated during port initialization and
- repetitively used for command executions. Currently only one
- qc is allocated per port but yet-to-be-merged NCQ branch
- allocates one for each tag and maps each qc to NCQ tag 1-to-1.
- </para>
- <para>
- libata commands can originate from two sources - libata itself
- and SCSI midlayer. libata internal commands are used for
- initialization and error handling. All normal blk requests
- and commands for SCSI emulation are passed as SCSI commands
- through queuecommand callback of SCSI host template.
- </para>
- </sect1>
-
- <sect1><title>How commands are issued</title>
-
- <variablelist>
-
- <varlistentry><term>Internal commands</term>
- <listitem>
- <para>
- First, qc is allocated and initialized using
- ata_qc_new_init(). Although ata_qc_new_init() doesn't
- implement any wait or retry mechanism when qc is not
- available, internal commands are currently issued only during
- initialization and error recovery, so no other command is
- active and allocation is guaranteed to succeed.
- </para>
- <para>
- Once allocated qc's taskfile is initialized for the command to
- be executed. qc currently has two mechanisms to notify
- completion. One is via qc->complete_fn() callback and the
- other is completion qc->waiting. qc->complete_fn() callback
- is the asynchronous path used by normal SCSI translated
- commands and qc->waiting is the synchronous (issuer sleeps in
- process context) path used by internal commands.
- </para>
- <para>
- Once initialization is complete, host_set lock is acquired
- and the qc is issued.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry><term>SCSI commands</term>
- <listitem>
- <para>
- All libata drivers use ata_scsi_queuecmd() as
- hostt->queuecommand callback. scmds can either be simulated
- or translated. No qc is involved in processing a simulated
- scmd. The result is computed right away and the scmd is
- completed.
- </para>
- <para>
- For a translated scmd, ata_qc_new_init() is invoked to
- allocate a qc and the scmd is translated into the qc. SCSI
- midlayer's completion notification function pointer is stored
- into qc->scsidone.
- </para>
- <para>
- qc->complete_fn() callback is used for completion
- notification. ATA commands use ata_scsi_qc_complete() while
- ATAPI commands use atapi_qc_complete(). Both functions end up
- calling qc->scsidone to notify upper layer when the qc is
- finished. After translation is completed, the qc is issued
- with ata_qc_issue().
- </para>
- <para>
- Note that SCSI midlayer invokes hostt->queuecommand while
- holding host_set lock, so all above occur while holding
- host_set lock.
- </para>
- </listitem>
- </varlistentry>
-
- </variablelist>
- </sect1>
-
- <sect1><title>How commands are processed</title>
- <para>
- Depending on which protocol and which controller are used,
- commands are processed differently. For the purpose of
- discussion, a controller which uses taskfile interface and all
- standard callbacks is assumed.
- </para>
- <para>
- Currently 6 ATA command protocols are used. They can be
- sorted into the following four categories according to how
- they are processed.
- </para>
-
- <variablelist>
- <varlistentry><term>ATA NO DATA or DMA</term>
- <listitem>
- <para>
- ATA_PROT_NODATA and ATA_PROT_DMA fall into this category.
- These types of commands don't require any software
- intervention once issued. Device will raise interrupt on
- completion.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry><term>ATA PIO</term>
- <listitem>
- <para>
- ATA_PROT_PIO is in this category. libata currently
- implements PIO with polling. ATA_NIEN bit is set to turn
- off interrupt and pio_task on ata_wq performs polling and
- IO.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry><term>ATAPI NODATA or DMA</term>
- <listitem>
- <para>
- ATA_PROT_ATAPI_NODATA and ATA_PROT_ATAPI_DMA are in this
- category. packet_task is used to poll BSY bit after
- issuing PACKET command. Once BSY is turned off by the
- device, packet_task transfers CDB and hands off processing
- to interrupt handler.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry><term>ATAPI PIO</term>
- <listitem>
- <para>
- ATA_PROT_ATAPI is in this category. ATA_NIEN bit is set
- and, as in ATAPI NODATA or DMA, packet_task submits cdb.
- However, after submitting cdb, further processing (data
- transfer) is handed off to pio_task.
- </para>
- </listitem>
- </varlistentry>
- </variablelist>
- </sect1>
-
- <sect1><title>How commands are completed</title>
- <para>
- Once issued, all qc's are either completed with
- ata_qc_complete() or time out. For commands which are handled
- by interrupts, ata_host_intr() invokes ata_qc_complete(), and,
- for PIO tasks, pio_task invokes ata_qc_complete(). In error
- cases, packet_task may also complete commands.
- </para>
- <para>
- ata_qc_complete() does the following.
- </para>
-
- <orderedlist>
-
- <listitem>
- <para>
- DMA memory is unmapped.
- </para>
- </listitem>
-
- <listitem>
- <para>
- ATA_QCFLAG_ACTIVE is cleared from qc->flags.
- </para>
- </listitem>
-
- <listitem>
- <para>
- qc->complete_fn() callback is invoked. If the return value of
- the callback is not zero. Completion is short circuited and
- ata_qc_complete() returns.
- </para>
- </listitem>
-
- <listitem>
- <para>
- __ata_qc_complete() is called, which does
- <orderedlist>
-
- <listitem>
- <para>
- qc->flags is cleared to zero.
- </para>
- </listitem>
-
- <listitem>
- <para>
- ap->active_tag and qc->tag are poisoned.
- </para>
- </listitem>
-
- <listitem>
- <para>
- qc->waiting is cleared &amp; completed (in that order).
- </para>
- </listitem>
-
- <listitem>
- <para>
- qc is deallocated by clearing appropriate bit in ap->qactive.
- </para>
- </listitem>
-
- </orderedlist>
- </para>
- </listitem>
-
- </orderedlist>
-
- <para>
- So, it basically notifies upper layer and deallocates qc. One
- exception is short-circuit path in #3 which is used by
- atapi_qc_complete().
- </para>
- <para>
- For all non-ATAPI commands, whether it fails or not, almost
- the same code path is taken and very little error handling
- takes place. A qc is completed with success status if it
- succeeded, with failed status otherwise.
- </para>
- <para>
- However, failed ATAPI commands require more handling as
- REQUEST SENSE is needed to acquire sense data. If an ATAPI
- command fails, ata_qc_complete() is invoked with error status,
- which in turn invokes atapi_qc_complete() via
- qc->complete_fn() callback.
- </para>
- <para>
- This makes atapi_qc_complete() set scmd->result to
- SAM_STAT_CHECK_CONDITION, complete the scmd and return 1. As
- the sense data is empty but scmd->result is CHECK CONDITION,
- SCSI midlayer will invoke EH for the scmd, and returning 1
- makes ata_qc_complete() to return without deallocating the qc.
- This leads us to ata_scsi_error() with partially completed qc.
- </para>
-
- </sect1>
-
- <sect1><title>ata_scsi_error()</title>
- <para>
- ata_scsi_error() is the current transportt->eh_strategy_handler()
- for libata. As discussed above, this will be entered in two
- cases - timeout and ATAPI error completion. This function
- calls low level libata driver's eng_timeout() callback, the
- standard callback for which is ata_eng_timeout(). It checks
- if a qc is active and calls ata_qc_timeout() on the qc if so.
- Actual error handling occurs in ata_qc_timeout().
- </para>
- <para>
- If EH is invoked for timeout, ata_qc_timeout() stops BMDMA and
- completes the qc. Note that as we're currently in EH, we
- cannot call scsi_done. As described in SCSI EH doc, a
- recovered scmd should be either retried with
- scsi_queue_insert() or finished with scsi_finish_command().
- Here, we override qc->scsidone with scsi_finish_command() and
- calls ata_qc_complete().
- </para>
- <para>
- If EH is invoked due to a failed ATAPI qc, the qc here is
- completed but not deallocated. The purpose of this
- half-completion is to use the qc as place holder to make EH
- code reach this place. This is a bit hackish, but it works.
- </para>
- <para>
- Once control reaches here, the qc is deallocated by invoking
- __ata_qc_complete() explicitly. Then, internal qc for REQUEST
- SENSE is issued. Once sense data is acquired, scmd is
- finished by directly invoking scsi_finish_command() on the
- scmd. Note that as we already have completed and deallocated
- the qc which was associated with the scmd, we don't need
- to/cannot call ata_qc_complete() again.
- </para>
-
- </sect1>
-
- <sect1><title>Problems with the current EH</title>
-
- <itemizedlist>
-
- <listitem>
- <para>
- Error representation is too crude. Currently any and all
- error conditions are represented with ATA STATUS and ERROR
- registers. Errors which aren't ATA device errors are treated
- as ATA device errors by setting ATA_ERR bit. Better error
- descriptor which can properly represent ATA and other
- errors/exceptions is needed.
- </para>
- </listitem>
-
- <listitem>
- <para>
- When handling timeouts, no action is taken to make device
- forget about the timed out command and ready for new commands.
- </para>
- </listitem>
-
- <listitem>
- <para>
- EH handling via ata_scsi_error() is not properly protected
- from usual command processing. On EH entrance, the device is
- not in quiescent state. Timed out commands may succeed or
- fail any time. pio_task and atapi_task may still be running.
- </para>
- </listitem>
-
- <listitem>
- <para>
- Too weak error recovery. Devices / controllers causing HSM
- mismatch errors and other errors quite often require reset to
- return to known state. Also, advanced error handling is
- necessary to support features like NCQ and hotplug.
- </para>
- </listitem>
-
- <listitem>
- <para>
- ATA errors are directly handled in the interrupt handler and
- PIO errors in pio_task. This is problematic for advanced
- error handling for the following reasons.
- </para>
- <para>
- First, advanced error handling often requires context and
- internal qc execution.
- </para>
- <para>
- Second, even a simple failure (say, CRC error) needs
- information gathering and could trigger complex error handling
- (say, resetting &amp; reconfiguring). Having multiple code
- paths to gather information, enter EH and trigger actions
- makes life painful.
- </para>
- <para>
- Third, scattered EH code makes implementing low level drivers
- difficult. Low level drivers override libata callbacks. If
- EH is scattered over several places, each affected callbacks
- should perform its part of error handling. This can be error
- prone and painful.
- </para>
- </listitem>
-
- </itemizedlist>
- </sect1>
- </chapter>
-
- <chapter id="libataExt">
- <title>libata Library</title>
-!Edrivers/ata/libata-core.c
- </chapter>
-
- <chapter id="libataInt">
- <title>libata Core Internals</title>
-!Idrivers/ata/libata-core.c
- </chapter>
-
- <chapter id="libataScsiInt">
- <title>libata SCSI translation/emulation</title>
-!Edrivers/ata/libata-scsi.c
-!Idrivers/ata/libata-scsi.c
- </chapter>
-
- <chapter id="ataExceptions">
- <title>ATA errors and exceptions</title>
-
- <para>
- This chapter tries to identify what error/exception conditions exist
- for ATA/ATAPI devices and describe how they should be handled in
- implementation-neutral way.
- </para>
-
- <para>
- The term 'error' is used to describe conditions where either an
- explicit error condition is reported from device or a command has
- timed out.
- </para>
-
- <para>
- The term 'exception' is either used to describe exceptional
- conditions which are not errors (say, power or hotplug events), or
- to describe both errors and non-error exceptional conditions. Where
- explicit distinction between error and exception is necessary, the
- term 'non-error exception' is used.
- </para>
-
- <sect1 id="excat">
- <title>Exception categories</title>
- <para>
- Exceptions are described primarily with respect to legacy
- taskfile + bus master IDE interface. If a controller provides
- other better mechanism for error reporting, mapping those into
- categories described below shouldn't be difficult.
- </para>
-
- <para>
- In the following sections, two recovery actions - reset and
- reconfiguring transport - are mentioned. These are described
- further in <xref linkend="exrec"/>.
- </para>
-
- <sect2 id="excatHSMviolation">
- <title>HSM violation</title>
- <para>
- This error is indicated when STATUS value doesn't match HSM
- requirement during issuing or execution any ATA/ATAPI command.
- </para>
-
- <itemizedlist>
- <title>Examples</title>
-
- <listitem>
- <para>
- ATA_STATUS doesn't contain !BSY &amp;&amp; DRDY &amp;&amp; !DRQ while trying
- to issue a command.
- </para>
- </listitem>
-
- <listitem>
- <para>
- !BSY &amp;&amp; !DRQ during PIO data transfer.
- </para>
- </listitem>
-
- <listitem>
- <para>
- DRQ on command completion.
- </para>
- </listitem>
-
- <listitem>
- <para>
- !BSY &amp;&amp; ERR after CDB transfer starts but before the
- last byte of CDB is transferred. ATA/ATAPI standard states
- that &quot;The device shall not terminate the PACKET command
- with an error before the last byte of the command packet has
- been written&quot; in the error outputs description of PACKET
- command and the state diagram doesn't include such
- transitions.
- </para>
- </listitem>
-
- </itemizedlist>
-
- <para>
- In these cases, HSM is violated and not much information
- regarding the error can be acquired from STATUS or ERROR
- register. IOW, this error can be anything - driver bug,
- faulty device, controller and/or cable.
- </para>
-
- <para>
- As HSM is violated, reset is necessary to restore known state.
- Reconfiguring transport for lower speed might be helpful too
- as transmission errors sometimes cause this kind of errors.
- </para>
- </sect2>
-
- <sect2 id="excatDevErr">
- <title>ATA/ATAPI device error (non-NCQ / non-CHECK CONDITION)</title>
-
- <para>
- These are errors detected and reported by ATA/ATAPI devices
- indicating device problems. For this type of errors, STATUS
- and ERROR register values are valid and describe error
- condition. Note that some of ATA bus errors are detected by
- ATA/ATAPI devices and reported using the same mechanism as
- device errors. Those cases are described later in this
- section.
- </para>
-
- <para>
- For ATA commands, this type of errors are indicated by !BSY
- &amp;&amp; ERR during command execution and on completion.
- </para>
-
- <para>For ATAPI commands,</para>
-
- <itemizedlist>
-
- <listitem>
- <para>
- !BSY &amp;&amp; ERR &amp;&amp; ABRT right after issuing PACKET
- indicates that PACKET command is not supported and falls in
- this category.
- </para>
- </listitem>
-
- <listitem>
- <para>
- !BSY &amp;&amp; ERR(==CHK) &amp;&amp; !ABRT after the last
- byte of CDB is transferred indicates CHECK CONDITION and
- doesn't fall in this category.
- </para>
- </listitem>
-
- <listitem>
- <para>
- !BSY &amp;&amp; ERR(==CHK) &amp;&amp; ABRT after the last byte
- of CDB is transferred *probably* indicates CHECK CONDITION and
- doesn't fall in this category.
- </para>
- </listitem>
-
- </itemizedlist>
-
- <para>
- Of errors detected as above, the following are not ATA/ATAPI
- device errors but ATA bus errors and should be handled
- according to <xref linkend="excatATAbusErr"/>.
- </para>
-
- <variablelist>
-
- <varlistentry>
- <term>CRC error during data transfer</term>
- <listitem>
- <para>
- This is indicated by ICRC bit in the ERROR register and
- means that corruption occurred during data transfer. Up to
- ATA/ATAPI-7, the standard specifies that this bit is only
- applicable to UDMA transfers but ATA/ATAPI-8 draft revision
- 1f says that the bit may be applicable to multiword DMA and
- PIO.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term>ABRT error during data transfer or on completion</term>
- <listitem>
- <para>
- Up to ATA/ATAPI-7, the standard specifies that ABRT could be
- set on ICRC errors and on cases where a device is not able
- to complete a command. Combined with the fact that MWDMA
- and PIO transfer errors aren't allowed to use ICRC bit up to
- ATA/ATAPI-7, it seems to imply that ABRT bit alone could
- indicate transfer errors.
- </para>
- <para>
- However, ATA/ATAPI-8 draft revision 1f removes the part
- that ICRC errors can turn on ABRT. So, this is kind of
- gray area. Some heuristics are needed here.
- </para>
- </listitem>
- </varlistentry>
-
- </variablelist>
-
- <para>
- ATA/ATAPI device errors can be further categorized as follows.
- </para>
-
- <variablelist>
-
- <varlistentry>
- <term>Media errors</term>
- <listitem>
- <para>
- This is indicated by UNC bit in the ERROR register. ATA
- devices reports UNC error only after certain number of
- retries cannot recover the data, so there's nothing much
- else to do other than notifying upper layer.
- </para>
- <para>
- READ and WRITE commands report CHS or LBA of the first
- failed sector but ATA/ATAPI standard specifies that the
- amount of transferred data on error completion is
- indeterminate, so we cannot assume that sectors preceding
- the failed sector have been transferred and thus cannot
- complete those sectors successfully as SCSI does.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term>Media changed / media change requested error</term>
- <listitem>
- <para>
- &lt;&lt;TODO: fill here&gt;&gt;
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry><term>Address error</term>
- <listitem>
- <para>
- This is indicated by IDNF bit in the ERROR register.
- Report to upper layer.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry><term>Other errors</term>
- <listitem>
- <para>
- This can be invalid command or parameter indicated by ABRT
- ERROR bit or some other error condition. Note that ABRT
- bit can indicate a lot of things including ICRC and Address
- errors. Heuristics needed.
- </para>
- </listitem>
- </varlistentry>
-
- </variablelist>
-
- <para>
- Depending on commands, not all STATUS/ERROR bits are
- applicable. These non-applicable bits are marked with
- &quot;na&quot; in the output descriptions but up to ATA/ATAPI-7
- no definition of &quot;na&quot; can be found. However,
- ATA/ATAPI-8 draft revision 1f describes &quot;N/A&quot; as
- follows.
- </para>
-
- <blockquote>
- <variablelist>
- <varlistentry><term>3.2.3.3a N/A</term>
- <listitem>
- <para>
- A keyword the indicates a field has no defined value in
- this standard and should not be checked by the host or
- device. N/A fields should be cleared to zero.
- </para>
- </listitem>
- </varlistentry>
- </variablelist>
- </blockquote>
-
- <para>
- So, it seems reasonable to assume that &quot;na&quot; bits are
- cleared to zero by devices and thus need no explicit masking.
- </para>
-
- </sect2>
-
- <sect2 id="excatATAPIcc">
- <title>ATAPI device CHECK CONDITION</title>
-
- <para>
- ATAPI device CHECK CONDITION error is indicated by set CHK bit
- (ERR bit) in the STATUS register after the last byte of CDB is
- transferred for a PACKET command. For this kind of errors,
- sense data should be acquired to gather information regarding
- the errors. REQUEST SENSE packet command should be used to
- acquire sense data.
- </para>
-
- <para>
- Once sense data is acquired, this type of errors can be
- handled similarly to other SCSI errors. Note that sense data
- may indicate ATA bus error (e.g. Sense Key 04h HARDWARE ERROR
- &amp;&amp; ASC/ASCQ 47h/00h SCSI PARITY ERROR). In such
- cases, the error should be considered as an ATA bus error and
- handled according to <xref linkend="excatATAbusErr"/>.
- </para>
-
- </sect2>
-
- <sect2 id="excatNCQerr">
- <title>ATA device error (NCQ)</title>
-
- <para>
- NCQ command error is indicated by cleared BSY and set ERR bit
- during NCQ command phase (one or more NCQ commands
- outstanding). Although STATUS and ERROR registers will
- contain valid values describing the error, READ LOG EXT is
- required to clear the error condition, determine which command
- has failed and acquire more information.
- </para>
-
- <para>
- READ LOG EXT Log Page 10h reports which tag has failed and
- taskfile register values describing the error. With this
- information the failed command can be handled as a normal ATA
- command error as in <xref linkend="excatDevErr"/> and all
- other in-flight commands must be retried. Note that this
- retry should not be counted - it's likely that commands
- retried this way would have completed normally if it were not
- for the failed command.
- </para>
-
- <para>
- Note that ATA bus errors can be reported as ATA device NCQ
- errors. This should be handled as described in <xref
- linkend="excatATAbusErr"/>.
- </para>
-
- <para>
- If READ LOG EXT Log Page 10h fails or reports NQ, we're
- thoroughly screwed. This condition should be treated
- according to <xref linkend="excatHSMviolation"/>.
- </para>
-
- </sect2>
-
- <sect2 id="excatATAbusErr">
- <title>ATA bus error</title>
-
- <para>
- ATA bus error means that data corruption occurred during
- transmission over ATA bus (SATA or PATA). This type of errors
- can be indicated by
- </para>
-
- <itemizedlist>
-
- <listitem>
- <para>
- ICRC or ABRT error as described in <xref linkend="excatDevErr"/>.
- </para>
- </listitem>
-
- <listitem>
- <para>
- Controller-specific error completion with error information
- indicating transmission error.
- </para>
- </listitem>
-
- <listitem>
- <para>
- On some controllers, command timeout. In this case, there may
- be a mechanism to determine that the timeout is due to
- transmission error.
- </para>
- </listitem>
-
- <listitem>
- <para>
- Unknown/random errors, timeouts and all sorts of weirdities.
- </para>
- </listitem>
-
- </itemizedlist>
-
- <para>
- As described above, transmission errors can cause wide variety
- of symptoms ranging from device ICRC error to random device
- lockup, and, for many cases, there is no way to tell if an
- error condition is due to transmission error or not;
- therefore, it's necessary to employ some kind of heuristic
- when dealing with errors and timeouts. For example,
- encountering repetitive ABRT errors for known supported
- command is likely to indicate ATA bus error.
- </para>
-
- <para>
- Once it's determined that ATA bus errors have possibly
- occurred, lowering ATA bus transmission speed is one of
- actions which may alleviate the problem. See <xref
- linkend="exrecReconf"/> for more information.
- </para>
-
- </sect2>
-
- <sect2 id="excatPCIbusErr">
- <title>PCI bus error</title>
-
- <para>
- Data corruption or other failures during transmission over PCI
- (or other system bus). For standard BMDMA, this is indicated
- by Error bit in the BMDMA Status register. This type of
- errors must be logged as it indicates something is very wrong
- with the system. Resetting host controller is recommended.
- </para>
-
- </sect2>
-
- <sect2 id="excatLateCompletion">
- <title>Late completion</title>
-
- <para>
- This occurs when timeout occurs and the timeout handler finds
- out that the timed out command has completed successfully or
- with error. This is usually caused by lost interrupts. This
- type of errors must be logged. Resetting host controller is
- recommended.
- </para>
-
- </sect2>
-
- <sect2 id="excatUnknown">
- <title>Unknown error (timeout)</title>
-
- <para>
- This is when timeout occurs and the command is still
- processing or the host and device are in unknown state. When
- this occurs, HSM could be in any valid or invalid state. To
- bring the device to known state and make it forget about the
- timed out command, resetting is necessary. The timed out
- command may be retried.
- </para>
-
- <para>
- Timeouts can also be caused by transmission errors. Refer to
- <xref linkend="excatATAbusErr"/> for more details.
- </para>
-
- </sect2>
-
- <sect2 id="excatHoplugPM">
- <title>Hotplug and power management exceptions</title>
-
- <para>
- &lt;&lt;TODO: fill here&gt;&gt;
- </para>
-
- </sect2>
-
- </sect1>
-
- <sect1 id="exrec">
- <title>EH recovery actions</title>
-
- <para>
- This section discusses several important recovery actions.
- </para>
-
- <sect2 id="exrecClr">
- <title>Clearing error condition</title>
-
- <para>
- Many controllers require its error registers to be cleared by
- error handler. Different controllers may have different
- requirements.
- </para>
-
- <para>
- For SATA, it's strongly recommended to clear at least SError
- register during error handling.
- </para>
- </sect2>
-
- <sect2 id="exrecRst">
- <title>Reset</title>
-
- <para>
- During EH, resetting is necessary in the following cases.
- </para>
-
- <itemizedlist>
-
- <listitem>
- <para>
- HSM is in unknown or invalid state
- </para>
- </listitem>
-
- <listitem>
- <para>
- HBA is in unknown or invalid state
- </para>
- </listitem>
-
- <listitem>
- <para>
- EH needs to make HBA/device forget about in-flight commands
- </para>
- </listitem>
-
- <listitem>
- <para>
- HBA/device behaves weirdly
- </para>
- </listitem>
-
- </itemizedlist>
-
- <para>
- Resetting during EH might be a good idea regardless of error
- condition to improve EH robustness. Whether to reset both or
- either one of HBA and device depends on situation but the
- following scheme is recommended.
- </para>
-
- <itemizedlist>
-
- <listitem>
- <para>
- When it's known that HBA is in ready state but ATA/ATAPI
- device is in unknown state, reset only device.
- </para>
- </listitem>
-
- <listitem>
- <para>
- If HBA is in unknown state, reset both HBA and device.
- </para>
- </listitem>
-
- </itemizedlist>
-
- <para>
- HBA resetting is implementation specific. For a controller
- complying to taskfile/BMDMA PCI IDE, stopping active DMA
- transaction may be sufficient iff BMDMA state is the only HBA
- context. But even mostly taskfile/BMDMA PCI IDE complying
- controllers may have implementation specific requirements and
- mechanism to reset themselves. This must be addressed by
- specific drivers.
- </para>
-
- <para>
- OTOH, ATA/ATAPI standard describes in detail ways to reset
- ATA/ATAPI devices.
- </para>
-
- <variablelist>
-
- <varlistentry><term>PATA hardware reset</term>
- <listitem>
- <para>
- This is hardware initiated device reset signalled with
- asserted PATA RESET- signal. There is no standard way to
- initiate hardware reset from software although some
- hardware provides registers that allow driver to directly
- tweak the RESET- signal.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry><term>Software reset</term>
- <listitem>
- <para>
- This is achieved by turning CONTROL SRST bit on for at
- least 5us. Both PATA and SATA support it but, in case of
- SATA, this may require controller-specific support as the
- second Register FIS to clear SRST should be transmitted
- while BSY bit is still set. Note that on PATA, this resets
- both master and slave devices on a channel.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry><term>EXECUTE DEVICE DIAGNOSTIC command</term>
- <listitem>
- <para>
- Although ATA/ATAPI standard doesn't describe exactly, EDD
- implies some level of resetting, possibly similar level
- with software reset. Host-side EDD protocol can be handled
- with normal command processing and most SATA controllers
- should be able to handle EDD's just like other commands.
- As in software reset, EDD affects both devices on a PATA
- bus.
- </para>
- <para>
- Although EDD does reset devices, this doesn't suit error
- handling as EDD cannot be issued while BSY is set and it's
- unclear how it will act when device is in unknown/weird
- state.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry><term>ATAPI DEVICE RESET command</term>
- <listitem>
- <para>
- This is very similar to software reset except that reset
- can be restricted to the selected device without affecting
- the other device sharing the cable.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry><term>SATA phy reset</term>
- <listitem>
- <para>
- This is the preferred way of resetting a SATA device. In
- effect, it's identical to PATA hardware reset. Note that
- this can be done with the standard SCR Control register.
- As such, it's usually easier to implement than software
- reset.
- </para>
- </listitem>
- </varlistentry>
-
- </variablelist>
-
- <para>
- One more thing to consider when resetting devices is that
- resetting clears certain configuration parameters and they
- need to be set to their previous or newly adjusted values
- after reset.
- </para>
-
- <para>
- Parameters affected are.
- </para>
-
- <itemizedlist>
-
- <listitem>
- <para>
- CHS set up with INITIALIZE DEVICE PARAMETERS (seldom used)
- </para>
- </listitem>
-
- <listitem>
- <para>
- Parameters set with SET FEATURES including transfer mode setting
- </para>
- </listitem>
-
- <listitem>
- <para>
- Block count set with SET MULTIPLE MODE
- </para>
- </listitem>
-
- <listitem>
- <para>
- Other parameters (SET MAX, MEDIA LOCK...)
- </para>
- </listitem>
-
- </itemizedlist>
-
- <para>
- ATA/ATAPI standard specifies that some parameters must be
- maintained across hardware or software reset, but doesn't
- strictly specify all of them. Always reconfiguring needed
- parameters after reset is required for robustness. Note that
- this also applies when resuming from deep sleep (power-off).
- </para>
-
- <para>
- Also, ATA/ATAPI standard requires that IDENTIFY DEVICE /
- IDENTIFY PACKET DEVICE is issued after any configuration
- parameter is updated or a hardware reset and the result used
- for further operation. OS driver is required to implement
- revalidation mechanism to support this.
- </para>
-
- </sect2>
-
- <sect2 id="exrecReconf">
- <title>Reconfigure transport</title>
-
- <para>
- For both PATA and SATA, a lot of corners are cut for cheap
- connectors, cables or controllers and it's quite common to see
- high transmission error rate. This can be mitigated by
- lowering transmission speed.
- </para>
-
- <para>
- The following is a possible scheme Jeff Garzik suggested.
- </para>
-
- <blockquote>
- <para>
- If more than $N (3?) transmission errors happen in 15 minutes,
- </para>
- <itemizedlist>
- <listitem>
- <para>
- if SATA, decrease SATA PHY speed. if speed cannot be decreased,
- </para>
- </listitem>
- <listitem>
- <para>
- decrease UDMA xfer speed. if at UDMA0, switch to PIO4,
- </para>
- </listitem>
- <listitem>
- <para>
- decrease PIO xfer speed. if at PIO3, complain, but continue
- </para>
- </listitem>
- </itemizedlist>
- </blockquote>
-
- </sect2>
-
- </sect1>
-
- </chapter>
-
- <chapter id="PiixInt">
- <title>ata_piix Internals</title>
-!Idrivers/ata/ata_piix.c
- </chapter>
-
- <chapter id="SILInt">
- <title>sata_sil Internals</title>
-!Idrivers/ata/sata_sil.c
- </chapter>
-
- <chapter id="libataThanks">
- <title>Thanks</title>
- <para>
- The bulk of the ATA knowledge comes thanks to long conversations with
- Andre Hedrick (www.linux-ide.org), and long hours pondering the ATA
- and SCSI specifications.
- </para>
- <para>
- Thanks to Alan Cox for pointing out similarities
- between SATA and SCSI, and in general for motivation to hack on
- libata.
- </para>
- <para>
- libata's device detection
- method, ata_pio_devchk, and in general all the early probing was
- based on extensive study of Hale Landis's probe/reset code in his
- ATADRVR driver (www.ata-atapi.com).
- </para>
- </chapter>
-
-</book>
diff --git a/Documentation/DocBook/librs.tmpl b/Documentation/DocBook/librs.tmpl
deleted file mode 100644
index 94f21361e0ed..000000000000
--- a/Documentation/DocBook/librs.tmpl
+++ /dev/null
@@ -1,289 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
- "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
-
-<book id="Reed-Solomon-Library-Guide">
- <bookinfo>
- <title>Reed-Solomon Library Programming Interface</title>
-
- <authorgroup>
- <author>
- <firstname>Thomas</firstname>
- <surname>Gleixner</surname>
- <affiliation>
- <address>
- <email>tglx@linutronix.de</email>
- </address>
- </affiliation>
- </author>
- </authorgroup>
-
- <copyright>
- <year>2004</year>
- <holder>Thomas Gleixner</holder>
- </copyright>
-
- <legalnotice>
- <para>
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License version 2 as published by the Free Software Foundation.
- </para>
-
- <para>
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
- </para>
-
- <para>
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
- </para>
-
- <para>
- For more details see the file COPYING in the source
- distribution of Linux.
- </para>
- </legalnotice>
- </bookinfo>
-
-<toc></toc>
-
- <chapter id="intro">
- <title>Introduction</title>
- <para>
- The generic Reed-Solomon Library provides encoding, decoding
- and error correction functions.
- </para>
- <para>
- Reed-Solomon codes are used in communication and storage
- applications to ensure data integrity.
- </para>
- <para>
- This documentation is provided for developers who want to utilize
- the functions provided by the library.
- </para>
- </chapter>
-
- <chapter id="bugs">
- <title>Known Bugs And Assumptions</title>
- <para>
- None.
- </para>
- </chapter>
-
- <chapter id="usage">
- <title>Usage</title>
- <para>
- This chapter provides examples of how to use the library.
- </para>
- <sect1>
- <title>Initializing</title>
- <para>
- The init function init_rs returns a pointer to an
- rs decoder structure, which holds the necessary
- information for encoding, decoding and error correction
- with the given polynomial. It either uses an existing
- matching decoder or creates a new one. On creation all
- the lookup tables for fast en/decoding are created.
- The function may take a while, so make sure not to
- call it in critical code paths.
- </para>
- <programlisting>
-/* the Reed Solomon control structure */
-static struct rs_control *rs_decoder;
-
-/* Symbolsize is 10 (bits)
- * Primitive polynomial is x^10+x^3+1
- * first consecutive root is 0
- * primitive element to generate roots = 1
- * generator polynomial degree (number of roots) = 6
- */
-rs_decoder = init_rs (10, 0x409, 0, 1, 6);
- </programlisting>
- </sect1>
- <sect1>
- <title>Encoding</title>
- <para>
- The encoder calculates the Reed-Solomon code over
- the given data length and stores the result in
- the parity buffer. Note that the parity buffer must
- be initialized before calling the encoder.
- </para>
- <para>
- The expanded data can be inverted on the fly by
- providing a non-zero inversion mask. The expanded data is
- XOR'ed with the mask. This is used e.g. for FLASH
- ECC, where the all 0xFF is inverted to an all 0x00.
- The Reed-Solomon code for all 0x00 is all 0x00. The
- code is inverted before storing to FLASH so it is 0xFF
- too. This prevents that reading from an erased FLASH
- results in ECC errors.
- </para>
- <para>
- The databytes are expanded to the given symbol size
- on the fly. There is no support for encoding continuous
- bitstreams with a symbol size != 8 at the moment. If
- it is necessary it should be not a big deal to implement
- such functionality.
- </para>
- <programlisting>
-/* Parity buffer. Size = number of roots */
-uint16_t par[6];
-/* Initialize the parity buffer */
-memset(par, 0, sizeof(par));
-/* Encode 512 byte in data8. Store parity in buffer par */
-encode_rs8 (rs_decoder, data8, 512, par, 0);
- </programlisting>
- </sect1>
- <sect1>
- <title>Decoding</title>
- <para>
- The decoder calculates the syndrome over
- the given data length and the received parity symbols
- and corrects errors in the data.
- </para>
- <para>
- If a syndrome is available from a hardware decoder
- then the syndrome calculation is skipped.
- </para>
- <para>
- The correction of the data buffer can be suppressed
- by providing a correction pattern buffer and an error
- location buffer to the decoder. The decoder stores the
- calculated error location and the correction bitmask
- in the given buffers. This is useful for hardware
- decoders which use a weird bit ordering scheme.
- </para>
- <para>
- The databytes are expanded to the given symbol size
- on the fly. There is no support for decoding continuous
- bitstreams with a symbolsize != 8 at the moment. If
- it is necessary it should be not a big deal to implement
- such functionality.
- </para>
-
- <sect2>
- <title>
- Decoding with syndrome calculation, direct data correction
- </title>
- <programlisting>
-/* Parity buffer. Size = number of roots */
-uint16_t par[6];
-uint8_t data[512];
-int numerr;
-/* Receive data */
-.....
-/* Receive parity */
-.....
-/* Decode 512 byte in data8.*/
-numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL);
- </programlisting>
- </sect2>
-
- <sect2>
- <title>
- Decoding with syndrome given by hardware decoder, direct data correction
- </title>
- <programlisting>
-/* Parity buffer. Size = number of roots */
-uint16_t par[6], syn[6];
-uint8_t data[512];
-int numerr;
-/* Receive data */
-.....
-/* Receive parity */
-.....
-/* Get syndrome from hardware decoder */
-.....
-/* Decode 512 byte in data8.*/
-numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL);
- </programlisting>
- </sect2>
-
- <sect2>
- <title>
- Decoding with syndrome given by hardware decoder, no direct data correction.
- </title>
- <para>
- Note: It's not necessary to give data and received parity to the decoder.
- </para>
- <programlisting>
-/* Parity buffer. Size = number of roots */
-uint16_t par[6], syn[6], corr[8];
-uint8_t data[512];
-int numerr, errpos[8];
-/* Receive data */
-.....
-/* Receive parity */
-.....
-/* Get syndrome from hardware decoder */
-.....
-/* Decode 512 byte in data8.*/
-numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr);
-for (i = 0; i &lt; numerr; i++) {
- do_error_correction_in_your_buffer(errpos[i], corr[i]);
-}
- </programlisting>
- </sect2>
- </sect1>
- <sect1>
- <title>Cleanup</title>
- <para>
- The function free_rs frees the allocated resources,
- if the caller is the last user of the decoder.
- </para>
- <programlisting>
-/* Release resources */
-free_rs(rs_decoder);
- </programlisting>
- </sect1>
-
- </chapter>
-
- <chapter id="structs">
- <title>Structures</title>
- <para>
- This chapter contains the autogenerated documentation of the structures which are
- used in the Reed-Solomon Library and are relevant for a developer.
- </para>
-!Iinclude/linux/rslib.h
- </chapter>
-
- <chapter id="pubfunctions">
- <title>Public Functions Provided</title>
- <para>
- This chapter contains the autogenerated documentation of the Reed-Solomon functions
- which are exported.
- </para>
-!Elib/reed_solomon/reed_solomon.c
- </chapter>
-
- <chapter id="credits">
- <title>Credits</title>
- <para>
- The library code for encoding and decoding was written by Phil Karn.
- </para>
- <programlisting>
- Copyright 2002, Phil Karn, KA9Q
- May be used under the terms of the GNU General Public License (GPL)
- </programlisting>
- <para>
- The wrapper functions and interfaces are written by Thomas Gleixner.
- </para>
- <para>
- Many users have provided bugfixes, improvements and helping hands for testing.
- Thanks a lot.
- </para>
- <para>
- The following people have contributed to this document:
- </para>
- <para>
- Thomas Gleixner<email>tglx@linutronix.de</email>
- </para>
- </chapter>
-</book>
diff --git a/Documentation/DocBook/lsm.tmpl b/Documentation/DocBook/lsm.tmpl
deleted file mode 100644
index fe7664ce9667..000000000000
--- a/Documentation/DocBook/lsm.tmpl
+++ /dev/null
@@ -1,265 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
- "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
-
-<article class="whitepaper" id="LinuxSecurityModule" lang="en">
- <articleinfo>
- <title>Linux Security Modules: General Security Hooks for Linux</title>
- <authorgroup>
- <author>
- <firstname>Stephen</firstname>
- <surname>Smalley</surname>
- <affiliation>
- <orgname>NAI Labs</orgname>
- <address><email>ssmalley@nai.com</email></address>
- </affiliation>
- </author>
- <author>
- <firstname>Timothy</firstname>
- <surname>Fraser</surname>
- <affiliation>
- <orgname>NAI Labs</orgname>
- <address><email>tfraser@nai.com</email></address>
- </affiliation>
- </author>
- <author>
- <firstname>Chris</firstname>
- <surname>Vance</surname>
- <affiliation>
- <orgname>NAI Labs</orgname>
- <address><email>cvance@nai.com</email></address>
- </affiliation>
- </author>
- </authorgroup>
- </articleinfo>
-
-<sect1 id="Introduction"><title>Introduction</title>
-
-<para>
-In March 2001, the National Security Agency (NSA) gave a presentation
-about Security-Enhanced Linux (SELinux) at the 2.5 Linux Kernel
-Summit. SELinux is an implementation of flexible and fine-grained
-nondiscretionary access controls in the Linux kernel, originally
-implemented as its own particular kernel patch. Several other
-security projects (e.g. RSBAC, Medusa) have also developed flexible
-access control architectures for the Linux kernel, and various
-projects have developed particular access control models for Linux
-(e.g. LIDS, DTE, SubDomain). Each project has developed and
-maintained its own kernel patch to support its security needs.
-</para>
-
-<para>
-In response to the NSA presentation, Linus Torvalds made a set of
-remarks that described a security framework he would be willing to
-consider for inclusion in the mainstream Linux kernel. He described a
-general framework that would provide a set of security hooks to
-control operations on kernel objects and a set of opaque security
-fields in kernel data structures for maintaining security attributes.
-This framework could then be used by loadable kernel modules to
-implement any desired model of security. Linus also suggested the
-possibility of migrating the Linux capabilities code into such a
-module.
-</para>
-
-<para>
-The Linux Security Modules (LSM) project was started by WireX to
-develop such a framework. LSM is a joint development effort by
-several security projects, including Immunix, SELinux, SGI and Janus,
-and several individuals, including Greg Kroah-Hartman and James
-Morris, to develop a Linux kernel patch that implements this
-framework. The patch is currently tracking the 2.4 series and is
-targeted for integration into the 2.5 development series. This
-technical report provides an overview of the framework and the example
-capabilities security module provided by the LSM kernel patch.
-</para>
-
-</sect1>
-
-<sect1 id="framework"><title>LSM Framework</title>
-
-<para>
-The LSM kernel patch provides a general kernel framework to support
-security modules. In particular, the LSM framework is primarily
-focused on supporting access control modules, although future
-development is likely to address other security needs such as
-auditing. By itself, the framework does not provide any additional
-security; it merely provides the infrastructure to support security
-modules. The LSM kernel patch also moves most of the capabilities
-logic into an optional security module, with the system defaulting
-to the traditional superuser logic. This capabilities module
-is discussed further in <xref linkend="cap"/>.
-</para>
-
-<para>
-The LSM kernel patch adds security fields to kernel data structures
-and inserts calls to hook functions at critical points in the kernel
-code to manage the security fields and to perform access control. It
-also adds functions for registering and unregistering security
-modules, and adds a general <function>security</function> system call
-to support new system calls for security-aware applications.
-</para>
-
-<para>
-The LSM security fields are simply <type>void*</type> pointers. For
-process and program execution security information, security fields
-were added to <structname>struct task_struct</structname> and
-<structname>struct linux_binprm</structname>. For filesystem security
-information, a security field was added to
-<structname>struct super_block</structname>. For pipe, file, and socket
-security information, security fields were added to
-<structname>struct inode</structname> and
-<structname>struct file</structname>. For packet and network device security
-information, security fields were added to
-<structname>struct sk_buff</structname> and
-<structname>struct net_device</structname>. For System V IPC security
-information, security fields were added to
-<structname>struct kern_ipc_perm</structname> and
-<structname>struct msg_msg</structname>; additionally, the definitions
-for <structname>struct msg_msg</structname>, <structname>struct
-msg_queue</structname>, and <structname>struct
-shmid_kernel</structname> were moved to header files
-(<filename>include/linux/msg.h</filename> and
-<filename>include/linux/shm.h</filename> as appropriate) to allow
-the security modules to use these definitions.
-</para>
-
-<para>
-Each LSM hook is a function pointer in a global table,
-security_ops. This table is a
-<structname>security_operations</structname> structure as defined by
-<filename>include/linux/security.h</filename>. Detailed documentation
-for each hook is included in this header file. At present, this
-structure consists of a collection of substructures that group related
-hooks based on the kernel object (e.g. task, inode, file, sk_buff,
-etc) as well as some top-level hook function pointers for system
-operations. This structure is likely to be flattened in the future
-for performance. The placement of the hook calls in the kernel code
-is described by the "called:" lines in the per-hook documentation in
-the header file. The hook calls can also be easily found in the
-kernel code by looking for the string "security_ops->".
-
-</para>
-
-<para>
-Linus mentioned per-process security hooks in his original remarks as a
-possible alternative to global security hooks. However, if LSM were
-to start from the perspective of per-process hooks, then the base
-framework would have to deal with how to handle operations that
-involve multiple processes (e.g. kill), since each process might have
-its own hook for controlling the operation. This would require a
-general mechanism for composing hooks in the base framework.
-Additionally, LSM would still need global hooks for operations that
-have no process context (e.g. network input operations).
-Consequently, LSM provides global security hooks, but a security
-module is free to implement per-process hooks (where that makes sense)
-by storing a security_ops table in each process' security field and
-then invoking these per-process hooks from the global hooks.
-The problem of composition is thus deferred to the module.
-</para>
-
-<para>
-The global security_ops table is initialized to a set of hook
-functions provided by a dummy security module that provides
-traditional superuser logic. A <function>register_security</function>
-function (in <filename>security/security.c</filename>) is provided to
-allow a security module to set security_ops to refer to its own hook
-functions, and an <function>unregister_security</function> function is
-provided to revert security_ops to the dummy module hooks. This
-mechanism is used to set the primary security module, which is
-responsible for making the final decision for each hook.
-</para>
-
-<para>
-LSM also provides a simple mechanism for stacking additional security
-modules with the primary security module. It defines
-<function>register_security</function> and
-<function>unregister_security</function> hooks in the
-<structname>security_operations</structname> structure and provides
-<function>mod_reg_security</function> and
-<function>mod_unreg_security</function> functions that invoke these
-hooks after performing some sanity checking. A security module can
-call these functions in order to stack with other modules. However,
-the actual details of how this stacking is handled are deferred to the
-module, which can implement these hooks in any way it wishes
-(including always returning an error if it does not wish to support
-stacking). In this manner, LSM again defers the problem of
-composition to the module.
-</para>
-
-<para>
-Although the LSM hooks are organized into substructures based on
-kernel object, all of the hooks can be viewed as falling into two
-major categories: hooks that are used to manage the security fields
-and hooks that are used to perform access control. Examples of the
-first category of hooks include the
-<function>alloc_security</function> and
-<function>free_security</function> hooks defined for each kernel data
-structure that has a security field. These hooks are used to allocate
-and free security structures for kernel objects. The first category
-of hooks also includes hooks that set information in the security
-field after allocation, such as the <function>post_lookup</function>
-hook in <structname>struct inode_security_ops</structname>. This hook
-is used to set security information for inodes after successful lookup
-operations. An example of the second category of hooks is the
-<function>permission</function> hook in
-<structname>struct inode_security_ops</structname>. This hook checks
-permission when accessing an inode.
-</para>
-
-</sect1>
-
-<sect1 id="cap"><title>LSM Capabilities Module</title>
-
-<para>
-The LSM kernel patch moves most of the existing POSIX.1e capabilities
-logic into an optional security module stored in the file
-<filename>security/capability.c</filename>. This change allows
-users who do not want to use capabilities to omit this code entirely
-from their kernel, instead using the dummy module for traditional
-superuser logic or any other module that they desire. This change
-also allows the developers of the capabilities logic to maintain and
-enhance their code more freely, without needing to integrate patches
-back into the base kernel.
-</para>
-
-<para>
-In addition to moving the capabilities logic, the LSM kernel patch
-could move the capability-related fields from the kernel data
-structures into the new security fields managed by the security
-modules. However, at present, the LSM kernel patch leaves the
-capability fields in the kernel data structures. In his original
-remarks, Linus suggested that this might be preferable so that other
-security modules can be easily stacked with the capabilities module
-without needing to chain multiple security structures on the security field.
-It also avoids imposing extra overhead on the capabilities module
-to manage the security fields. However, the LSM framework could
-certainly support such a move if it is determined to be desirable,
-with only a few additional changes described below.
-</para>
-
-<para>
-At present, the capabilities logic for computing process capabilities
-on <function>execve</function> and <function>set*uid</function>,
-checking capabilities for a particular process, saving and checking
-capabilities for netlink messages, and handling the
-<function>capget</function> and <function>capset</function> system
-calls have been moved into the capabilities module. There are still a
-few locations in the base kernel where capability-related fields are
-directly examined or modified, but the current version of the LSM
-patch does allow a security module to completely replace the
-assignment and testing of capabilities. These few locations would
-need to be changed if the capability-related fields were moved into
-the security field. The following is a list of known locations that
-still perform such direct examination or modification of
-capability-related fields:
-<itemizedlist>
-<listitem><para><filename>fs/open.c</filename>:<function>sys_access</function></para></listitem>
-<listitem><para><filename>fs/lockd/host.c</filename>:<function>nlm_bind_host</function></para></listitem>
-<listitem><para><filename>fs/nfsd/auth.c</filename>:<function>nfsd_setuser</function></para></listitem>
-<listitem><para><filename>fs/proc/array.c</filename>:<function>task_cap</function></para></listitem>
-</itemizedlist>
-</para>
-
-</sect1>
-
-</article>
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl
deleted file mode 100644
index b442921bca54..000000000000
--- a/Documentation/DocBook/mtdnand.tmpl
+++ /dev/null
@@ -1,1291 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
- "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
-
-<book id="MTD-NAND-Guide">
- <bookinfo>
- <title>MTD NAND Driver Programming Interface</title>
-
- <authorgroup>
- <author>
- <firstname>Thomas</firstname>
- <surname>Gleixner</surname>
- <affiliation>
- <address>
- <email>tglx@linutronix.de</email>
- </address>
- </affiliation>
- </author>
- </authorgroup>
-
- <copyright>
- <year>2004</year>
- <holder>Thomas Gleixner</holder>
- </copyright>
-
- <legalnotice>
- <para>
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License version 2 as published by the Free Software Foundation.
- </para>
-
- <para>
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
- </para>
-
- <para>
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
- </para>
-
- <para>
- For more details see the file COPYING in the source
- distribution of Linux.
- </para>
- </legalnotice>
- </bookinfo>
-
-<toc></toc>
-
- <chapter id="intro">
- <title>Introduction</title>
- <para>
- The generic NAND driver supports almost all NAND and AG-AND based
- chips and connects them to the Memory Technology Devices (MTD)
- subsystem of the Linux Kernel.
- </para>
- <para>
- This documentation is provided for developers who want to implement
- board drivers or filesystem drivers suitable for NAND devices.
- </para>
- </chapter>
-
- <chapter id="bugs">
- <title>Known Bugs And Assumptions</title>
- <para>
- None.
- </para>
- </chapter>
-
- <chapter id="dochints">
- <title>Documentation hints</title>
- <para>
- The function and structure docs are autogenerated. Each function and
- struct member has a short description which is marked with an [XXX] identifier.
- The following chapters explain the meaning of those identifiers.
- </para>
- <sect1 id="Function_identifiers_XXX">
- <title>Function identifiers [XXX]</title>
- <para>
- The functions are marked with [XXX] identifiers in the short
- comment. The identifiers explain the usage and scope of the
- functions. Following identifiers are used:
- </para>
- <itemizedlist>
- <listitem><para>
- [MTD Interface]</para><para>
- These functions provide the interface to the MTD kernel API.
- They are not replaceable and provide functionality
- which is complete hardware independent.
- </para></listitem>
- <listitem><para>
- [NAND Interface]</para><para>
- These functions are exported and provide the interface to the NAND kernel API.
- </para></listitem>
- <listitem><para>
- [GENERIC]</para><para>
- Generic functions are not replaceable and provide functionality
- which is complete hardware independent.
- </para></listitem>
- <listitem><para>
- [DEFAULT]</para><para>
- Default functions provide hardware related functionality which is suitable
- for most of the implementations. These functions can be replaced by the
- board driver if necessary. Those functions are called via pointers in the
- NAND chip description structure. The board driver can set the functions which
- should be replaced by board dependent functions before calling nand_scan().
- If the function pointer is NULL on entry to nand_scan() then the pointer
- is set to the default function which is suitable for the detected chip type.
- </para></listitem>
- </itemizedlist>
- </sect1>
- <sect1 id="Struct_member_identifiers_XXX">
- <title>Struct member identifiers [XXX]</title>
- <para>
- The struct members are marked with [XXX] identifiers in the
- comment. The identifiers explain the usage and scope of the
- members. Following identifiers are used:
- </para>
- <itemizedlist>
- <listitem><para>
- [INTERN]</para><para>
- These members are for NAND driver internal use only and must not be
- modified. Most of these values are calculated from the chip geometry
- information which is evaluated during nand_scan().
- </para></listitem>
- <listitem><para>
- [REPLACEABLE]</para><para>
- Replaceable members hold hardware related functions which can be
- provided by the board driver. The board driver can set the functions which
- should be replaced by board dependent functions before calling nand_scan().
- If the function pointer is NULL on entry to nand_scan() then the pointer
- is set to the default function which is suitable for the detected chip type.
- </para></listitem>
- <listitem><para>
- [BOARDSPECIFIC]</para><para>
- Board specific members hold hardware related information which must
- be provided by the board driver. The board driver must set the function
- pointers and datafields before calling nand_scan().
- </para></listitem>
- <listitem><para>
- [OPTIONAL]</para><para>
- Optional members can hold information relevant for the board driver. The
- generic NAND driver code does not use this information.
- </para></listitem>
- </itemizedlist>
- </sect1>
- </chapter>
-
- <chapter id="basicboarddriver">
- <title>Basic board driver</title>
- <para>
- For most boards it will be sufficient to provide just the
- basic functions and fill out some really board dependent
- members in the nand chip description structure.
- </para>
- <sect1 id="Basic_defines">
- <title>Basic defines</title>
- <para>
- At least you have to provide a nand_chip structure
- and a storage for the ioremap'ed chip address.
- You can allocate the nand_chip structure using
- kmalloc or you can allocate it statically.
- The NAND chip structure embeds an mtd structure
- which will be registered to the MTD subsystem.
- You can extract a pointer to the mtd structure
- from a nand_chip pointer using the nand_to_mtd()
- helper.
- </para>
- <para>
- Kmalloc based example
- </para>
- <programlisting>
-static struct mtd_info *board_mtd;
-static void __iomem *baseaddr;
- </programlisting>
- <para>
- Static example
- </para>
- <programlisting>
-static struct nand_chip board_chip;
-static void __iomem *baseaddr;
- </programlisting>
- </sect1>
- <sect1 id="Partition_defines">
- <title>Partition defines</title>
- <para>
- If you want to divide your device into partitions, then
- define a partitioning scheme suitable to your board.
- </para>
- <programlisting>
-#define NUM_PARTITIONS 2
-static struct mtd_partition partition_info[] = {
- { .name = "Flash partition 1",
- .offset = 0,
- .size = 8 * 1024 * 1024 },
- { .name = "Flash partition 2",
- .offset = MTDPART_OFS_NEXT,
- .size = MTDPART_SIZ_FULL },
-};
- </programlisting>
- </sect1>
- <sect1 id="Hardware_control_functions">
- <title>Hardware control function</title>
- <para>
- The hardware control function provides access to the
- control pins of the NAND chip(s).
- The access can be done by GPIO pins or by address lines.
- If you use address lines, make sure that the timing
- requirements are met.
- </para>
- <para>
- <emphasis>GPIO based example</emphasis>
- </para>
- <programlisting>
-static void board_hwcontrol(struct mtd_info *mtd, int cmd)
-{
- switch(cmd){
- case NAND_CTL_SETCLE: /* Set CLE pin high */ break;
- case NAND_CTL_CLRCLE: /* Set CLE pin low */ break;
- case NAND_CTL_SETALE: /* Set ALE pin high */ break;
- case NAND_CTL_CLRALE: /* Set ALE pin low */ break;
- case NAND_CTL_SETNCE: /* Set nCE pin low */ break;
- case NAND_CTL_CLRNCE: /* Set nCE pin high */ break;
- }
-}
- </programlisting>
- <para>
- <emphasis>Address lines based example.</emphasis> It's assumed that the
- nCE pin is driven by a chip select decoder.
- </para>
- <programlisting>
-static void board_hwcontrol(struct mtd_info *mtd, int cmd)
-{
- struct nand_chip *this = mtd_to_nand(mtd);
- switch(cmd){
- case NAND_CTL_SETCLE: this->IO_ADDR_W |= CLE_ADRR_BIT; break;
- case NAND_CTL_CLRCLE: this->IO_ADDR_W &amp;= ~CLE_ADRR_BIT; break;
- case NAND_CTL_SETALE: this->IO_ADDR_W |= ALE_ADRR_BIT; break;
- case NAND_CTL_CLRALE: this->IO_ADDR_W &amp;= ~ALE_ADRR_BIT; break;
- }
-}
- </programlisting>
- </sect1>
- <sect1 id="Device_ready_function">
- <title>Device ready function</title>
- <para>
- If the hardware interface has the ready busy pin of the NAND chip connected to a
- GPIO or other accessible I/O pin, this function is used to read back the state of the
- pin. The function has no arguments and should return 0, if the device is busy (R/B pin
- is low) and 1, if the device is ready (R/B pin is high).
- If the hardware interface does not give access to the ready busy pin, then
- the function must not be defined and the function pointer this->dev_ready is set to NULL.
- </para>
- </sect1>
- <sect1 id="Init_function">
- <title>Init function</title>
- <para>
- The init function allocates memory and sets up all the board
- specific parameters and function pointers. When everything
- is set up nand_scan() is called. This function tries to
- detect and identify then chip. If a chip is found all the
- internal data fields are initialized accordingly.
- The structure(s) have to be zeroed out first and then filled with the necessary
- information about the device.
- </para>
- <programlisting>
-static int __init board_init (void)
-{
- struct nand_chip *this;
- int err = 0;
-
- /* Allocate memory for MTD device structure and private data */
- this = kzalloc(sizeof(struct nand_chip), GFP_KERNEL);
- if (!this) {
- printk ("Unable to allocate NAND MTD device structure.\n");
- err = -ENOMEM;
- goto out;
- }
-
- board_mtd = nand_to_mtd(this);
-
- /* map physical address */
- baseaddr = ioremap(CHIP_PHYSICAL_ADDRESS, 1024);
- if (!baseaddr) {
- printk("Ioremap to access NAND chip failed\n");
- err = -EIO;
- goto out_mtd;
- }
-
- /* Set address of NAND IO lines */
- this->IO_ADDR_R = baseaddr;
- this->IO_ADDR_W = baseaddr;
- /* Reference hardware control function */
- this->hwcontrol = board_hwcontrol;
- /* Set command delay time, see datasheet for correct value */
- this->chip_delay = CHIP_DEPENDEND_COMMAND_DELAY;
- /* Assign the device ready function, if available */
- this->dev_ready = board_dev_ready;
- this->eccmode = NAND_ECC_SOFT;
-
- /* Scan to find existence of the device */
- if (nand_scan (board_mtd, 1)) {
- err = -ENXIO;
- goto out_ior;
- }
-
- add_mtd_partitions(board_mtd, partition_info, NUM_PARTITIONS);
- goto out;
-
-out_ior:
- iounmap(baseaddr);
-out_mtd:
- kfree (this);
-out:
- return err;
-}
-module_init(board_init);
- </programlisting>
- </sect1>
- <sect1 id="Exit_function">
- <title>Exit function</title>
- <para>
- The exit function is only necessary if the driver is
- compiled as a module. It releases all resources which
- are held by the chip driver and unregisters the partitions
- in the MTD layer.
- </para>
- <programlisting>
-#ifdef MODULE
-static void __exit board_cleanup (void)
-{
- /* Release resources, unregister device */
- nand_release (board_mtd);
-
- /* unmap physical address */
- iounmap(baseaddr);
-
- /* Free the MTD device structure */
- kfree (mtd_to_nand(board_mtd));
-}
-module_exit(board_cleanup);
-#endif
- </programlisting>
- </sect1>
- </chapter>
-
- <chapter id="boarddriversadvanced">
- <title>Advanced board driver functions</title>
- <para>
- This chapter describes the advanced functionality of the NAND
- driver. For a list of functions which can be overridden by the board
- driver see the documentation of the nand_chip structure.
- </para>
- <sect1 id="Multiple_chip_control">
- <title>Multiple chip control</title>
- <para>
- The nand driver can control chip arrays. Therefore the
- board driver must provide an own select_chip function. This
- function must (de)select the requested chip.
- The function pointer in the nand_chip structure must
- be set before calling nand_scan(). The maxchip parameter
- of nand_scan() defines the maximum number of chips to
- scan for. Make sure that the select_chip function can
- handle the requested number of chips.
- </para>
- <para>
- The nand driver concatenates the chips to one virtual
- chip and provides this virtual chip to the MTD layer.
- </para>
- <para>
- <emphasis>Note: The driver can only handle linear chip arrays
- of equally sized chips. There is no support for
- parallel arrays which extend the buswidth.</emphasis>
- </para>
- <para>
- <emphasis>GPIO based example</emphasis>
- </para>
- <programlisting>
-static void board_select_chip (struct mtd_info *mtd, int chip)
-{
- /* Deselect all chips, set all nCE pins high */
- GPIO(BOARD_NAND_NCE) |= 0xff;
- if (chip >= 0)
- GPIO(BOARD_NAND_NCE) &amp;= ~ (1 &lt;&lt; chip);
-}
- </programlisting>
- <para>
- <emphasis>Address lines based example.</emphasis>
- Its assumed that the nCE pins are connected to an
- address decoder.
- </para>
- <programlisting>
-static void board_select_chip (struct mtd_info *mtd, int chip)
-{
- struct nand_chip *this = mtd_to_nand(mtd);
-
- /* Deselect all chips */
- this->IO_ADDR_R &amp;= ~BOARD_NAND_ADDR_MASK;
- this->IO_ADDR_W &amp;= ~BOARD_NAND_ADDR_MASK;
- switch (chip) {
- case 0:
- this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIP0;
- this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIP0;
- break;
- ....
- case n:
- this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIPn;
- this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIPn;
- break;
- }
-}
- </programlisting>
- </sect1>
- <sect1 id="Hardware_ECC_support">
- <title>Hardware ECC support</title>
- <sect2 id="Functions_and_constants">
- <title>Functions and constants</title>
- <para>
- The nand driver supports three different types of
- hardware ECC.
- <itemizedlist>
- <listitem><para>NAND_ECC_HW3_256</para><para>
- Hardware ECC generator providing 3 bytes ECC per
- 256 byte.
- </para> </listitem>
- <listitem><para>NAND_ECC_HW3_512</para><para>
- Hardware ECC generator providing 3 bytes ECC per
- 512 byte.
- </para> </listitem>
- <listitem><para>NAND_ECC_HW6_512</para><para>
- Hardware ECC generator providing 6 bytes ECC per
- 512 byte.
- </para> </listitem>
- <listitem><para>NAND_ECC_HW8_512</para><para>
- Hardware ECC generator providing 6 bytes ECC per
- 512 byte.
- </para> </listitem>
- </itemizedlist>
- If your hardware generator has a different functionality
- add it at the appropriate place in nand_base.c
- </para>
- <para>
- The board driver must provide following functions:
- <itemizedlist>
- <listitem><para>enable_hwecc</para><para>
- This function is called before reading / writing to
- the chip. Reset or initialize the hardware generator
- in this function. The function is called with an
- argument which let you distinguish between read
- and write operations.
- </para> </listitem>
- <listitem><para>calculate_ecc</para><para>
- This function is called after read / write from / to
- the chip. Transfer the ECC from the hardware to
- the buffer. If the option NAND_HWECC_SYNDROME is set
- then the function is only called on write. See below.
- </para> </listitem>
- <listitem><para>correct_data</para><para>
- In case of an ECC error this function is called for
- error detection and correction. Return 1 respectively 2
- in case the error can be corrected. If the error is
- not correctable return -1. If your hardware generator
- matches the default algorithm of the nand_ecc software
- generator then use the correction function provided
- by nand_ecc instead of implementing duplicated code.
- </para> </listitem>
- </itemizedlist>
- </para>
- </sect2>
- <sect2 id="Hardware_ECC_with_syndrome_calculation">
- <title>Hardware ECC with syndrome calculation</title>
- <para>
- Many hardware ECC implementations provide Reed-Solomon
- codes and calculate an error syndrome on read. The syndrome
- must be converted to a standard Reed-Solomon syndrome
- before calling the error correction code in the generic
- Reed-Solomon library.
- </para>
- <para>
- The ECC bytes must be placed immediately after the data
- bytes in order to make the syndrome generator work. This
- is contrary to the usual layout used by software ECC. The
- separation of data and out of band area is not longer
- possible. The nand driver code handles this layout and
- the remaining free bytes in the oob area are managed by
- the autoplacement code. Provide a matching oob-layout
- in this case. See rts_from4.c and diskonchip.c for
- implementation reference. In those cases we must also
- use bad block tables on FLASH, because the ECC layout is
- interfering with the bad block marker positions.
- See bad block table support for details.
- </para>
- </sect2>
- </sect1>
- <sect1 id="Bad_Block_table_support">
- <title>Bad block table support</title>
- <para>
- Most NAND chips mark the bad blocks at a defined
- position in the spare area. Those blocks must
- not be erased under any circumstances as the bad
- block information would be lost.
- It is possible to check the bad block mark each
- time when the blocks are accessed by reading the
- spare area of the first page in the block. This
- is time consuming so a bad block table is used.
- </para>
- <para>
- The nand driver supports various types of bad block
- tables.
- <itemizedlist>
- <listitem><para>Per device</para><para>
- The bad block table contains all bad block information
- of the device which can consist of multiple chips.
- </para> </listitem>
- <listitem><para>Per chip</para><para>
- A bad block table is used per chip and contains the
- bad block information for this particular chip.
- </para> </listitem>
- <listitem><para>Fixed offset</para><para>
- The bad block table is located at a fixed offset
- in the chip (device). This applies to various
- DiskOnChip devices.
- </para> </listitem>
- <listitem><para>Automatic placed</para><para>
- The bad block table is automatically placed and
- detected either at the end or at the beginning
- of a chip (device)
- </para> </listitem>
- <listitem><para>Mirrored tables</para><para>
- The bad block table is mirrored on the chip (device) to
- allow updates of the bad block table without data loss.
- </para> </listitem>
- </itemizedlist>
- </para>
- <para>
- nand_scan() calls the function nand_default_bbt().
- nand_default_bbt() selects appropriate default
- bad block table descriptors depending on the chip information
- which was retrieved by nand_scan().
- </para>
- <para>
- The standard policy is scanning the device for bad
- blocks and build a ram based bad block table which
- allows faster access than always checking the
- bad block information on the flash chip itself.
- </para>
- <sect2 id="Flash_based_tables">
- <title>Flash based tables</title>
- <para>
- It may be desired or necessary to keep a bad block table in FLASH.
- For AG-AND chips this is mandatory, as they have no factory marked
- bad blocks. They have factory marked good blocks. The marker pattern
- is erased when the block is erased to be reused. So in case of
- powerloss before writing the pattern back to the chip this block
- would be lost and added to the bad blocks. Therefore we scan the
- chip(s) when we detect them the first time for good blocks and
- store this information in a bad block table before erasing any
- of the blocks.
- </para>
- <para>
- The blocks in which the tables are stored are protected against
- accidental access by marking them bad in the memory bad block
- table. The bad block table management functions are allowed
- to circumvent this protection.
- </para>
- <para>
- The simplest way to activate the FLASH based bad block table support
- is to set the option NAND_BBT_USE_FLASH in the bbt_option field of
- the nand chip structure before calling nand_scan(). For AG-AND
- chips is this done by default.
- This activates the default FLASH based bad block table functionality
- of the NAND driver. The default bad block table options are
- <itemizedlist>
- <listitem><para>Store bad block table per chip</para></listitem>
- <listitem><para>Use 2 bits per block</para></listitem>
- <listitem><para>Automatic placement at the end of the chip</para></listitem>
- <listitem><para>Use mirrored tables with version numbers</para></listitem>
- <listitem><para>Reserve 4 blocks at the end of the chip</para></listitem>
- </itemizedlist>
- </para>
- </sect2>
- <sect2 id="User_defined_tables">
- <title>User defined tables</title>
- <para>
- User defined tables are created by filling out a
- nand_bbt_descr structure and storing the pointer in the
- nand_chip structure member bbt_td before calling nand_scan().
- If a mirror table is necessary a second structure must be
- created and a pointer to this structure must be stored
- in bbt_md inside the nand_chip structure. If the bbt_md
- member is set to NULL then only the main table is used
- and no scan for the mirrored table is performed.
- </para>
- <para>
- The most important field in the nand_bbt_descr structure
- is the options field. The options define most of the
- table properties. Use the predefined constants from
- nand.h to define the options.
- <itemizedlist>
- <listitem><para>Number of bits per block</para>
- <para>The supported number of bits is 1, 2, 4, 8.</para></listitem>
- <listitem><para>Table per chip</para>
- <para>Setting the constant NAND_BBT_PERCHIP selects that
- a bad block table is managed for each chip in a chip array.
- If this option is not set then a per device bad block table
- is used.</para></listitem>
- <listitem><para>Table location is absolute</para>
- <para>Use the option constant NAND_BBT_ABSPAGE and
- define the absolute page number where the bad block
- table starts in the field pages. If you have selected bad block
- tables per chip and you have a multi chip array then the start page
- must be given for each chip in the chip array. Note: there is no scan
- for a table ident pattern performed, so the fields
- pattern, veroffs, offs, len can be left uninitialized</para></listitem>
- <listitem><para>Table location is automatically detected</para>
- <para>The table can either be located in the first or the last good
- blocks of the chip (device). Set NAND_BBT_LASTBLOCK to place
- the bad block table at the end of the chip (device). The
- bad block tables are marked and identified by a pattern which
- is stored in the spare area of the first page in the block which
- holds the bad block table. Store a pointer to the pattern
- in the pattern field. Further the length of the pattern has to be
- stored in len and the offset in the spare area must be given
- in the offs member of the nand_bbt_descr structure. For mirrored
- bad block tables different patterns are mandatory.</para></listitem>
- <listitem><para>Table creation</para>
- <para>Set the option NAND_BBT_CREATE to enable the table creation
- if no table can be found during the scan. Usually this is done only
- once if a new chip is found. </para></listitem>
- <listitem><para>Table write support</para>
- <para>Set the option NAND_BBT_WRITE to enable the table write support.
- This allows the update of the bad block table(s) in case a block has
- to be marked bad due to wear. The MTD interface function block_markbad
- is calling the update function of the bad block table. If the write
- support is enabled then the table is updated on FLASH.</para>
- <para>
- Note: Write support should only be enabled for mirrored tables with
- version control.
- </para></listitem>
- <listitem><para>Table version control</para>
- <para>Set the option NAND_BBT_VERSION to enable the table version control.
- It's highly recommended to enable this for mirrored tables with write
- support. It makes sure that the risk of losing the bad block
- table information is reduced to the loss of the information about the
- one worn out block which should be marked bad. The version is stored in
- 4 consecutive bytes in the spare area of the device. The position of
- the version number is defined by the member veroffs in the bad block table
- descriptor.</para></listitem>
- <listitem><para>Save block contents on write</para>
- <para>
- In case that the block which holds the bad block table does contain
- other useful information, set the option NAND_BBT_SAVECONTENT. When
- the bad block table is written then the whole block is read the bad
- block table is updated and the block is erased and everything is
- written back. If this option is not set only the bad block table
- is written and everything else in the block is ignored and erased.
- </para></listitem>
- <listitem><para>Number of reserved blocks</para>
- <para>
- For automatic placement some blocks must be reserved for
- bad block table storage. The number of reserved blocks is defined
- in the maxblocks member of the bad block table description structure.
- Reserving 4 blocks for mirrored tables should be a reasonable number.
- This also limits the number of blocks which are scanned for the bad
- block table ident pattern.
- </para></listitem>
- </itemizedlist>
- </para>
- </sect2>
- </sect1>
- <sect1 id="Spare_area_placement">
- <title>Spare area (auto)placement</title>
- <para>
- The nand driver implements different possibilities for
- placement of filesystem data in the spare area,
- <itemizedlist>
- <listitem><para>Placement defined by fs driver</para></listitem>
- <listitem><para>Automatic placement</para></listitem>
- </itemizedlist>
- The default placement function is automatic placement. The
- nand driver has built in default placement schemes for the
- various chiptypes. If due to hardware ECC functionality the
- default placement does not fit then the board driver can
- provide a own placement scheme.
- </para>
- <para>
- File system drivers can provide a own placement scheme which
- is used instead of the default placement scheme.
- </para>
- <para>
- Placement schemes are defined by a nand_oobinfo structure
- <programlisting>
-struct nand_oobinfo {
- int useecc;
- int eccbytes;
- int eccpos[24];
- int oobfree[8][2];
-};
- </programlisting>
- <itemizedlist>
- <listitem><para>useecc</para><para>
- The useecc member controls the ecc and placement function. The header
- file include/mtd/mtd-abi.h contains constants to select ecc and
- placement. MTD_NANDECC_OFF switches off the ecc complete. This is
- not recommended and available for testing and diagnosis only.
- MTD_NANDECC_PLACE selects caller defined placement, MTD_NANDECC_AUTOPLACE
- selects automatic placement.
- </para></listitem>
- <listitem><para>eccbytes</para><para>
- The eccbytes member defines the number of ecc bytes per page.
- </para></listitem>
- <listitem><para>eccpos</para><para>
- The eccpos array holds the byte offsets in the spare area where
- the ecc codes are placed.
- </para></listitem>
- <listitem><para>oobfree</para><para>
- The oobfree array defines the areas in the spare area which can be
- used for automatic placement. The information is given in the format
- {offset, size}. offset defines the start of the usable area, size the
- length in bytes. More than one area can be defined. The list is terminated
- by an {0, 0} entry.
- </para></listitem>
- </itemizedlist>
- </para>
- <sect2 id="Placement_defined_by_fs_driver">
- <title>Placement defined by fs driver</title>
- <para>
- The calling function provides a pointer to a nand_oobinfo
- structure which defines the ecc placement. For writes the
- caller must provide a spare area buffer along with the
- data buffer. The spare area buffer size is (number of pages) *
- (size of spare area). For reads the buffer size is
- (number of pages) * ((size of spare area) + (number of ecc
- steps per page) * sizeof (int)). The driver stores the
- result of the ecc check for each tuple in the spare buffer.
- The storage sequence is
- </para>
- <para>
- &lt;spare data page 0&gt;&lt;ecc result 0&gt;...&lt;ecc result n&gt;
- </para>
- <para>
- ...
- </para>
- <para>
- &lt;spare data page n&gt;&lt;ecc result 0&gt;...&lt;ecc result n&gt;
- </para>
- <para>
- This is a legacy mode used by YAFFS1.
- </para>
- <para>
- If the spare area buffer is NULL then only the ECC placement is
- done according to the given scheme in the nand_oobinfo structure.
- </para>
- </sect2>
- <sect2 id="Automatic_placement">
- <title>Automatic placement</title>
- <para>
- Automatic placement uses the built in defaults to place the
- ecc bytes in the spare area. If filesystem data have to be stored /
- read into the spare area then the calling function must provide a
- buffer. The buffer size per page is determined by the oobfree array in
- the nand_oobinfo structure.
- </para>
- <para>
- If the spare area buffer is NULL then only the ECC placement is
- done according to the default builtin scheme.
- </para>
- </sect2>
- </sect1>
- <sect1 id="Spare_area_autoplacement_default">
- <title>Spare area autoplacement default schemes</title>
- <sect2 id="pagesize_256">
- <title>256 byte pagesize</title>
-<informaltable><tgroup cols="3"><tbody>
-<row>
-<entry>Offset</entry>
-<entry>Content</entry>
-<entry>Comment</entry>
-</row>
-<row>
-<entry>0x00</entry>
-<entry>ECC byte 0</entry>
-<entry>Error correction code byte 0</entry>
-</row>
-<row>
-<entry>0x01</entry>
-<entry>ECC byte 1</entry>
-<entry>Error correction code byte 1</entry>
-</row>
-<row>
-<entry>0x02</entry>
-<entry>ECC byte 2</entry>
-<entry>Error correction code byte 2</entry>
-</row>
-<row>
-<entry>0x03</entry>
-<entry>Autoplace 0</entry>
-<entry></entry>
-</row>
-<row>
-<entry>0x04</entry>
-<entry>Autoplace 1</entry>
-<entry></entry>
-</row>
-<row>
-<entry>0x05</entry>
-<entry>Bad block marker</entry>
-<entry>If any bit in this byte is zero, then this block is bad.
-This applies only to the first page in a block. In the remaining
-pages this byte is reserved</entry>
-</row>
-<row>
-<entry>0x06</entry>
-<entry>Autoplace 2</entry>
-<entry></entry>
-</row>
-<row>
-<entry>0x07</entry>
-<entry>Autoplace 3</entry>
-<entry></entry>
-</row>
-</tbody></tgroup></informaltable>
- </sect2>
- <sect2 id="pagesize_512">
- <title>512 byte pagesize</title>
-<informaltable><tgroup cols="3"><tbody>
-<row>
-<entry>Offset</entry>
-<entry>Content</entry>
-<entry>Comment</entry>
-</row>
-<row>
-<entry>0x00</entry>
-<entry>ECC byte 0</entry>
-<entry>Error correction code byte 0 of the lower 256 Byte data in
-this page</entry>
-</row>
-<row>
-<entry>0x01</entry>
-<entry>ECC byte 1</entry>
-<entry>Error correction code byte 1 of the lower 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x02</entry>
-<entry>ECC byte 2</entry>
-<entry>Error correction code byte 2 of the lower 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x03</entry>
-<entry>ECC byte 3</entry>
-<entry>Error correction code byte 0 of the upper 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x04</entry>
-<entry>reserved</entry>
-<entry>reserved</entry>
-</row>
-<row>
-<entry>0x05</entry>
-<entry>Bad block marker</entry>
-<entry>If any bit in this byte is zero, then this block is bad.
-This applies only to the first page in a block. In the remaining
-pages this byte is reserved</entry>
-</row>
-<row>
-<entry>0x06</entry>
-<entry>ECC byte 4</entry>
-<entry>Error correction code byte 1 of the upper 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x07</entry>
-<entry>ECC byte 5</entry>
-<entry>Error correction code byte 2 of the upper 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x08 - 0x0F</entry>
-<entry>Autoplace 0 - 7</entry>
-<entry></entry>
-</row>
-</tbody></tgroup></informaltable>
- </sect2>
- <sect2 id="pagesize_2048">
- <title>2048 byte pagesize</title>
-<informaltable><tgroup cols="3"><tbody>
-<row>
-<entry>Offset</entry>
-<entry>Content</entry>
-<entry>Comment</entry>
-</row>
-<row>
-<entry>0x00</entry>
-<entry>Bad block marker</entry>
-<entry>If any bit in this byte is zero, then this block is bad.
-This applies only to the first page in a block. In the remaining
-pages this byte is reserved</entry>
-</row>
-<row>
-<entry>0x01</entry>
-<entry>Reserved</entry>
-<entry>Reserved</entry>
-</row>
-<row>
-<entry>0x02-0x27</entry>
-<entry>Autoplace 0 - 37</entry>
-<entry></entry>
-</row>
-<row>
-<entry>0x28</entry>
-<entry>ECC byte 0</entry>
-<entry>Error correction code byte 0 of the first 256 Byte data in
-this page</entry>
-</row>
-<row>
-<entry>0x29</entry>
-<entry>ECC byte 1</entry>
-<entry>Error correction code byte 1 of the first 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x2A</entry>
-<entry>ECC byte 2</entry>
-<entry>Error correction code byte 2 of the first 256 Bytes data in
-this page</entry>
-</row>
-<row>
-<entry>0x2B</entry>
-<entry>ECC byte 3</entry>
-<entry>Error correction code byte 0 of the second 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x2C</entry>
-<entry>ECC byte 4</entry>
-<entry>Error correction code byte 1 of the second 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x2D</entry>
-<entry>ECC byte 5</entry>
-<entry>Error correction code byte 2 of the second 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x2E</entry>
-<entry>ECC byte 6</entry>
-<entry>Error correction code byte 0 of the third 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x2F</entry>
-<entry>ECC byte 7</entry>
-<entry>Error correction code byte 1 of the third 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x30</entry>
-<entry>ECC byte 8</entry>
-<entry>Error correction code byte 2 of the third 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x31</entry>
-<entry>ECC byte 9</entry>
-<entry>Error correction code byte 0 of the fourth 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x32</entry>
-<entry>ECC byte 10</entry>
-<entry>Error correction code byte 1 of the fourth 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x33</entry>
-<entry>ECC byte 11</entry>
-<entry>Error correction code byte 2 of the fourth 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x34</entry>
-<entry>ECC byte 12</entry>
-<entry>Error correction code byte 0 of the fifth 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x35</entry>
-<entry>ECC byte 13</entry>
-<entry>Error correction code byte 1 of the fifth 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x36</entry>
-<entry>ECC byte 14</entry>
-<entry>Error correction code byte 2 of the fifth 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x37</entry>
-<entry>ECC byte 15</entry>
-<entry>Error correction code byte 0 of the sixt 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x38</entry>
-<entry>ECC byte 16</entry>
-<entry>Error correction code byte 1 of the sixt 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x39</entry>
-<entry>ECC byte 17</entry>
-<entry>Error correction code byte 2 of the sixt 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x3A</entry>
-<entry>ECC byte 18</entry>
-<entry>Error correction code byte 0 of the seventh 256 Bytes of
-data in this page</entry>
-</row>
-<row>
-<entry>0x3B</entry>
-<entry>ECC byte 19</entry>
-<entry>Error correction code byte 1 of the seventh 256 Bytes of
-data in this page</entry>
-</row>
-<row>
-<entry>0x3C</entry>
-<entry>ECC byte 20</entry>
-<entry>Error correction code byte 2 of the seventh 256 Bytes of
-data in this page</entry>
-</row>
-<row>
-<entry>0x3D</entry>
-<entry>ECC byte 21</entry>
-<entry>Error correction code byte 0 of the eighth 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x3E</entry>
-<entry>ECC byte 22</entry>
-<entry>Error correction code byte 1 of the eighth 256 Bytes of data
-in this page</entry>
-</row>
-<row>
-<entry>0x3F</entry>
-<entry>ECC byte 23</entry>
-<entry>Error correction code byte 2 of the eighth 256 Bytes of data
-in this page</entry>
-</row>
-</tbody></tgroup></informaltable>
- </sect2>
- </sect1>
- </chapter>
-
- <chapter id="filesystems">
- <title>Filesystem support</title>
- <para>
- The NAND driver provides all necessary functions for a
- filesystem via the MTD interface.
- </para>
- <para>
- Filesystems must be aware of the NAND peculiarities and
- restrictions. One major restrictions of NAND Flash is, that you cannot
- write as often as you want to a page. The consecutive writes to a page,
- before erasing it again, are restricted to 1-3 writes, depending on the
- manufacturers specifications. This applies similar to the spare area.
- </para>
- <para>
- Therefore NAND aware filesystems must either write in page size chunks
- or hold a writebuffer to collect smaller writes until they sum up to
- pagesize. Available NAND aware filesystems: JFFS2, YAFFS.
- </para>
- <para>
- The spare area usage to store filesystem data is controlled by
- the spare area placement functionality which is described in one
- of the earlier chapters.
- </para>
- </chapter>
- <chapter id="tools">
- <title>Tools</title>
- <para>
- The MTD project provides a couple of helpful tools to handle NAND Flash.
- <itemizedlist>
- <listitem><para>flasherase, flasheraseall: Erase and format FLASH partitions</para></listitem>
- <listitem><para>nandwrite: write filesystem images to NAND FLASH</para></listitem>
- <listitem><para>nanddump: dump the contents of a NAND FLASH partitions</para></listitem>
- </itemizedlist>
- </para>
- <para>
- These tools are aware of the NAND restrictions. Please use those tools
- instead of complaining about errors which are caused by non NAND aware
- access methods.
- </para>
- </chapter>
-
- <chapter id="defines">
- <title>Constants</title>
- <para>
- This chapter describes the constants which might be relevant for a driver developer.
- </para>
- <sect1 id="Chip_option_constants">
- <title>Chip option constants</title>
- <sect2 id="Constants_for_chip_id_table">
- <title>Constants for chip id table</title>
- <para>
- These constants are defined in nand.h. They are ored together to describe
- the chip functionality.
- <programlisting>
-/* Buswitdh is 16 bit */
-#define NAND_BUSWIDTH_16 0x00000002
-/* Device supports partial programming without padding */
-#define NAND_NO_PADDING 0x00000004
-/* Chip has cache program function */
-#define NAND_CACHEPRG 0x00000008
-/* Chip has copy back function */
-#define NAND_COPYBACK 0x00000010
-/* AND Chip which has 4 banks and a confusing page / block
- * assignment. See Renesas datasheet for further information */
-#define NAND_IS_AND 0x00000020
-/* Chip has a array of 4 pages which can be read without
- * additional ready /busy waits */
-#define NAND_4PAGE_ARRAY 0x00000040
- </programlisting>
- </para>
- </sect2>
- <sect2 id="Constants_for_runtime_options">
- <title>Constants for runtime options</title>
- <para>
- These constants are defined in nand.h. They are ored together to describe
- the functionality.
- <programlisting>
-/* The hw ecc generator provides a syndrome instead a ecc value on read
- * This can only work if we have the ecc bytes directly behind the
- * data bytes. Applies for DOC and AG-AND Renesas HW Reed Solomon generators */
-#define NAND_HWECC_SYNDROME 0x00020000
- </programlisting>
- </para>
- </sect2>
- </sect1>
-
- <sect1 id="EEC_selection_constants">
- <title>ECC selection constants</title>
- <para>
- Use these constants to select the ECC algorithm.
- <programlisting>
-/* No ECC. Usage is not recommended ! */
-#define NAND_ECC_NONE 0
-/* Software ECC 3 byte ECC per 256 Byte data */
-#define NAND_ECC_SOFT 1
-/* Hardware ECC 3 byte ECC per 256 Byte data */
-#define NAND_ECC_HW3_256 2
-/* Hardware ECC 3 byte ECC per 512 Byte data */
-#define NAND_ECC_HW3_512 3
-/* Hardware ECC 6 byte ECC per 512 Byte data */
-#define NAND_ECC_HW6_512 4
-/* Hardware ECC 6 byte ECC per 512 Byte data */
-#define NAND_ECC_HW8_512 6
- </programlisting>
- </para>
- </sect1>
-
- <sect1 id="Hardware_control_related_constants">
- <title>Hardware control related constants</title>
- <para>
- These constants describe the requested hardware access function when
- the boardspecific hardware control function is called
- <programlisting>
-/* Select the chip by setting nCE to low */
-#define NAND_CTL_SETNCE 1
-/* Deselect the chip by setting nCE to high */
-#define NAND_CTL_CLRNCE 2
-/* Select the command latch by setting CLE to high */
-#define NAND_CTL_SETCLE 3
-/* Deselect the command latch by setting CLE to low */
-#define NAND_CTL_CLRCLE 4
-/* Select the address latch by setting ALE to high */
-#define NAND_CTL_SETALE 5
-/* Deselect the address latch by setting ALE to low */
-#define NAND_CTL_CLRALE 6
-/* Set write protection by setting WP to high. Not used! */
-#define NAND_CTL_SETWP 7
-/* Clear write protection by setting WP to low. Not used! */
-#define NAND_CTL_CLRWP 8
- </programlisting>
- </para>
- </sect1>
-
- <sect1 id="Bad_block_table_constants">
- <title>Bad block table related constants</title>
- <para>
- These constants describe the options used for bad block
- table descriptors.
- <programlisting>
-/* Options for the bad block table descriptors */
-
-/* The number of bits used per block in the bbt on the device */
-#define NAND_BBT_NRBITS_MSK 0x0000000F
-#define NAND_BBT_1BIT 0x00000001
-#define NAND_BBT_2BIT 0x00000002
-#define NAND_BBT_4BIT 0x00000004
-#define NAND_BBT_8BIT 0x00000008
-/* The bad block table is in the last good block of the device */
-#define NAND_BBT_LASTBLOCK 0x00000010
-/* The bbt is at the given page, else we must scan for the bbt */
-#define NAND_BBT_ABSPAGE 0x00000020
-/* bbt is stored per chip on multichip devices */
-#define NAND_BBT_PERCHIP 0x00000080
-/* bbt has a version counter at offset veroffs */
-#define NAND_BBT_VERSION 0x00000100
-/* Create a bbt if none axists */
-#define NAND_BBT_CREATE 0x00000200
-/* Write bbt if necessary */
-#define NAND_BBT_WRITE 0x00001000
-/* Read and write back block contents when writing bbt */
-#define NAND_BBT_SAVECONTENT 0x00002000
- </programlisting>
- </para>
- </sect1>
-
- </chapter>
-
- <chapter id="structs">
- <title>Structures</title>
- <para>
- This chapter contains the autogenerated documentation of the structures which are
- used in the NAND driver and might be relevant for a driver developer. Each
- struct member has a short description which is marked with an [XXX] identifier.
- See the chapter "Documentation hints" for an explanation.
- </para>
-!Iinclude/linux/mtd/nand.h
- </chapter>
-
- <chapter id="pubfunctions">
- <title>Public Functions Provided</title>
- <para>
- This chapter contains the autogenerated documentation of the NAND kernel API functions
- which are exported. Each function has a short description which is marked with an [XXX] identifier.
- See the chapter "Documentation hints" for an explanation.
- </para>
-!Edrivers/mtd/nand/nand_base.c
-!Edrivers/mtd/nand/nand_bbt.c
-!Edrivers/mtd/nand/nand_ecc.c
- </chapter>
-
- <chapter id="intfunctions">
- <title>Internal Functions Provided</title>
- <para>
- This chapter contains the autogenerated documentation of the NAND driver internal functions.
- Each function has a short description which is marked with an [XXX] identifier.
- See the chapter "Documentation hints" for an explanation.
- The functions marked with [DEFAULT] might be relevant for a board driver developer.
- </para>
-!Idrivers/mtd/nand/nand_base.c
-!Idrivers/mtd/nand/nand_bbt.c
-<!-- No internal functions for kernel-doc:
-X!Idrivers/mtd/nand/nand_ecc.c
--->
- </chapter>
-
- <chapter id="credits">
- <title>Credits</title>
- <para>
- The following people have contributed to the NAND driver:
- <orderedlist>
- <listitem><para>Steven J. Hill<email>sjhill@realitydiluted.com</email></para></listitem>
- <listitem><para>David Woodhouse<email>dwmw2@infradead.org</email></para></listitem>
- <listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem>
- </orderedlist>
- A lot of users have provided bugfixes, improvements and helping hands for testing.
- Thanks a lot.
- </para>
- <para>
- The following people have contributed to this document:
- <orderedlist>
- <listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem>
- </orderedlist>
- </para>
- </chapter>
-</book>
diff --git a/Documentation/DocBook/networking.tmpl b/Documentation/DocBook/networking.tmpl
deleted file mode 100644
index 29df25016c7c..000000000000
--- a/Documentation/DocBook/networking.tmpl
+++ /dev/null
@@ -1,111 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
- "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
-
-<book id="LinuxNetworking">
- <bookinfo>
- <title>Linux Networking and Network Devices APIs</title>
-
- <legalnotice>
- <para>
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later
- version.
- </para>
-
- <para>
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
- </para>
-
- <para>
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
- </para>
-
- <para>
- For more details see the file COPYING in the source
- distribution of Linux.
- </para>
- </legalnotice>
- </bookinfo>
-
-<toc></toc>
-
- <chapter id="netcore">
- <title>Linux Networking</title>
- <sect1><title>Networking Base Types</title>
-!Iinclude/linux/net.h
- </sect1>
- <sect1><title>Socket Buffer Functions</title>
-!Iinclude/linux/skbuff.h
-!Iinclude/net/sock.h
-!Enet/socket.c
-!Enet/core/skbuff.c
-!Enet/core/sock.c
-!Enet/core/datagram.c
-!Enet/core/stream.c
- </sect1>
- <sect1><title>Socket Filter</title>
-!Enet/core/filter.c
- </sect1>
- <sect1><title>Generic Network Statistics</title>
-!Iinclude/uapi/linux/gen_stats.h
-!Enet/core/gen_stats.c
-!Enet/core/gen_estimator.c
- </sect1>
- <sect1><title>SUN RPC subsystem</title>
-<!-- The !D functionality is not perfect, garbage has to be protected by comments
-!Dnet/sunrpc/sunrpc_syms.c
--->
-!Enet/sunrpc/xdr.c
-!Enet/sunrpc/svc_xprt.c
-!Enet/sunrpc/xprt.c
-!Enet/sunrpc/sched.c
-!Enet/sunrpc/socklib.c
-!Enet/sunrpc/stats.c
-!Enet/sunrpc/rpc_pipe.c
-!Enet/sunrpc/rpcb_clnt.c
-!Enet/sunrpc/clnt.c
- </sect1>
- <sect1><title>WiMAX</title>
-!Enet/wimax/op-msg.c
-!Enet/wimax/op-reset.c
-!Enet/wimax/op-rfkill.c
-!Enet/wimax/stack.c
-!Iinclude/net/wimax.h
-!Iinclude/uapi/linux/wimax.h
- </sect1>
- </chapter>
-
- <chapter id="netdev">
- <title>Network device support</title>
- <sect1><title>Driver Support</title>
-!Enet/core/dev.c
-!Enet/ethernet/eth.c
-!Enet/sched/sch_generic.c
-!Iinclude/linux/etherdevice.h
-!Iinclude/linux/netdevice.h
- </sect1>
- <sect1><title>PHY Support</title>
-!Edrivers/net/phy/phy.c
-!Idrivers/net/phy/phy.c
-!Edrivers/net/phy/phy_device.c
-!Idrivers/net/phy/phy_device.c
-!Edrivers/net/phy/mdio_bus.c
-!Idrivers/net/phy/mdio_bus.c
- </sect1>
-<!-- FIXME: Removed for now since no structured comments in source
- <sect1><title>Wireless</title>
-X!Enet/core/wireless.c
- </sect1>
--->
- </chapter>
-
-</book>
diff --git a/Documentation/DocBook/rapidio.tmpl b/Documentation/DocBook/rapidio.tmpl
deleted file mode 100644
index ac3cca3399a1..000000000000
--- a/Documentation/DocBook/rapidio.tmpl
+++ /dev/null
@@ -1,155 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
- "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" [
- <!ENTITY rapidio SYSTEM "rapidio.xml">
- ]>
-
-<book id="RapidIO-Guide">
- <bookinfo>
- <title>RapidIO Subsystem Guide</title>
-
- <authorgroup>
- <author>
- <firstname>Matt</firstname>
- <surname>Porter</surname>
- <affiliation>
- <address>
- <email>mporter@kernel.crashing.org</email>
- <email>mporter@mvista.com</email>
- </address>
- </affiliation>
- </author>
- </authorgroup>
-
- <copyright>
- <year>2005</year>
- <holder>MontaVista Software, Inc.</holder>
- </copyright>
-
- <legalnotice>
- <para>
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License version 2 as published by the Free Software Foundation.
- </para>
-
- <para>
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
- </para>
-
- <para>
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
- </para>
-
- <para>
- For more details see the file COPYING in the source
- distribution of Linux.
- </para>
- </legalnotice>
- </bookinfo>
-
-<toc></toc>
-
- <chapter id="intro">
- <title>Introduction</title>
- <para>
- RapidIO is a high speed switched fabric interconnect with
- features aimed at the embedded market. RapidIO provides
- support for memory-mapped I/O as well as message-based
- transactions over the switched fabric network. RapidIO has
- a standardized discovery mechanism not unlike the PCI bus
- standard that allows simple detection of devices in a
- network.
- </para>
- <para>
- This documentation is provided for developers intending
- to support RapidIO on new architectures, write new drivers,
- or to understand the subsystem internals.
- </para>
- </chapter>
-
- <chapter id="bugs">
- <title>Known Bugs and Limitations</title>
-
- <sect1 id="known_bugs">
- <title>Bugs</title>
- <para>None. ;)</para>
- </sect1>
- <sect1 id="Limitations">
- <title>Limitations</title>
- <para>
- <orderedlist>
- <listitem><para>Access/management of RapidIO memory regions is not supported</para></listitem>
- <listitem><para>Multiple host enumeration is not supported</para></listitem>
- </orderedlist>
- </para>
- </sect1>
- </chapter>
-
- <chapter id="drivers">
- <title>RapidIO driver interface</title>
- <para>
- Drivers are provided a set of calls in order
- to interface with the subsystem to gather info
- on devices, request/map memory region resources,
- and manage mailboxes/doorbells.
- </para>
- <sect1 id="Functions">
- <title>Functions</title>
-!Iinclude/linux/rio_drv.h
-!Edrivers/rapidio/rio-driver.c
-!Edrivers/rapidio/rio.c
- </sect1>
- </chapter>
-
- <chapter id="internals">
- <title>Internals</title>
-
- <para>
- This chapter contains the autogenerated documentation of the RapidIO
- subsystem.
- </para>
-
- <sect1 id="Structures"><title>Structures</title>
-!Iinclude/linux/rio.h
- </sect1>
- <sect1 id="Enumeration_and_Discovery"><title>Enumeration and Discovery</title>
-!Idrivers/rapidio/rio-scan.c
- </sect1>
- <sect1 id="Driver_functionality"><title>Driver functionality</title>
-!Idrivers/rapidio/rio.c
-!Idrivers/rapidio/rio-access.c
- </sect1>
- <sect1 id="Device_model_support"><title>Device model support</title>
-!Idrivers/rapidio/rio-driver.c
- </sect1>
- <sect1 id="PPC32_support"><title>PPC32 support</title>
-!Iarch/powerpc/sysdev/fsl_rio.c
- </sect1>
- </chapter>
-
- <chapter id="credits">
- <title>Credits</title>
- <para>
- The following people have contributed to the RapidIO
- subsystem directly or indirectly:
- <orderedlist>
- <listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem>
- <listitem><para>Randy Vinson<email>rvinson@mvista.com</email></para></listitem>
- <listitem><para>Dan Malek<email>dan@embeddedalley.com</email></para></listitem>
- </orderedlist>
- </para>
- <para>
- The following people have contributed to this document:
- <orderedlist>
- <listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem>
- </orderedlist>
- </para>
- </chapter>
-</book>
diff --git a/Documentation/DocBook/s390-drivers.tmpl b/Documentation/DocBook/s390-drivers.tmpl
deleted file mode 100644
index 95bfc12e5439..000000000000
--- a/Documentation/DocBook/s390-drivers.tmpl
+++ /dev/null
@@ -1,161 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
- "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
-
-<book id="s390drivers">
- <bookinfo>
- <title>Writing s390 channel device drivers</title>
-
- <authorgroup>
- <author>
- <firstname>Cornelia</firstname>
- <surname>Huck</surname>
- <affiliation>
- <address>
- <email>cornelia.huck@de.ibm.com</email>
- </address>
- </affiliation>
- </author>
- </authorgroup>
-
- <copyright>
- <year>2007</year>
- <holder>IBM Corp.</holder>
- </copyright>
-
- <legalnotice>
- <para>
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later
- version.
- </para>
-
- <para>
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
- </para>
-
- <para>
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
- </para>
-
- <para>
- For more details see the file COPYING in the source
- distribution of Linux.
- </para>
- </legalnotice>
- </bookinfo>
-
-<toc></toc>
-
- <chapter id="intro">
- <title>Introduction</title>
- <para>
- This document describes the interfaces available for device drivers that
- drive s390 based channel attached I/O devices. This includes interfaces for
- interaction with the hardware and interfaces for interacting with the
- common driver core. Those interfaces are provided by the s390 common I/O
- layer.
- </para>
- <para>
- The document assumes a familarity with the technical terms associated
- with the s390 channel I/O architecture. For a description of this
- architecture, please refer to the "z/Architecture: Principles of
- Operation", IBM publication no. SA22-7832.
- </para>
- <para>
- While most I/O devices on a s390 system are typically driven through the
- channel I/O mechanism described here, there are various other methods
- (like the diag interface). These are out of the scope of this document.
- </para>
- <para>
- Some additional information can also be found in the kernel source
- under Documentation/s390/driver-model.txt.
- </para>
- </chapter>
- <chapter id="ccw">
- <title>The ccw bus</title>
- <para>
- The ccw bus typically contains the majority of devices available to
- a s390 system. Named after the channel command word (ccw), the basic
- command structure used to address its devices, the ccw bus contains
- so-called channel attached devices. They are addressed via I/O
- subchannels, visible on the css bus. A device driver for
- channel-attached devices, however, will never interact with the
- subchannel directly, but only via the I/O device on the ccw bus,
- the ccw device.
- </para>
- <sect1 id="channelIO">
- <title>I/O functions for channel-attached devices</title>
- <para>
- Some hardware structures have been translated into C structures for use
- by the common I/O layer and device drivers. For more information on
- the hardware structures represented here, please consult the Principles
- of Operation.
- </para>
-!Iarch/s390/include/asm/cio.h
- </sect1>
- <sect1 id="ccwdev">
- <title>ccw devices</title>
- <para>
- Devices that want to initiate channel I/O need to attach to the ccw bus.
- Interaction with the driver core is done via the common I/O layer, which
- provides the abstractions of ccw devices and ccw device drivers.
- </para>
- <para>
- The functions that initiate or terminate channel I/O all act upon a
- ccw device structure. Device drivers must not bypass those functions
- or strange side effects may happen.
- </para>
-!Iarch/s390/include/asm/ccwdev.h
-!Edrivers/s390/cio/device.c
-!Edrivers/s390/cio/device_ops.c
- </sect1>
- <sect1 id="cmf">
- <title>The channel-measurement facility</title>
- <para>
- The channel-measurement facility provides a means to collect
- measurement data which is made available by the channel subsystem
- for each channel attached device.
- </para>
-!Iarch/s390/include/asm/cmb.h
-!Edrivers/s390/cio/cmf.c
- </sect1>
- </chapter>
-
- <chapter id="ccwgroup">
- <title>The ccwgroup bus</title>
- <para>
- The ccwgroup bus only contains artificial devices, created by the user.
- Many networking devices (e.g. qeth) are in fact composed of several
- ccw devices (like read, write and data channel for qeth). The
- ccwgroup bus provides a mechanism to create a meta-device which
- contains those ccw devices as slave devices and can be associated
- with the netdevice.
- </para>
- <sect1 id="ccwgroupdevices">
- <title>ccw group devices</title>
-!Iarch/s390/include/asm/ccwgroup.h
-!Edrivers/s390/cio/ccwgroup.c
- </sect1>
- </chapter>
-
- <chapter id="genericinterfaces">
- <title>Generic interfaces</title>
- <para>
- Some interfaces are available to other drivers that do not necessarily
- have anything to do with the busses described above, but still are
- indirectly using basic infrastructure in the common I/O layer.
- One example is the support for adapter interrupts.
- </para>
-!Edrivers/s390/cio/airq.c
- </chapter>
-
-</book>
diff --git a/Documentation/DocBook/scsi.tmpl b/Documentation/DocBook/scsi.tmpl
deleted file mode 100644
index 4b9b9b286cea..000000000000
--- a/Documentation/DocBook/scsi.tmpl
+++ /dev/null
@@ -1,409 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
- "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
-
-<book id="scsimid">
- <bookinfo>
- <title>SCSI Interfaces Guide</title>
-
- <authorgroup>
- <author>
- <firstname>James</firstname>
- <surname>Bottomley</surname>
- <affiliation>
- <address>
- <email>James.Bottomley@hansenpartnership.com</email>
- </address>
- </affiliation>
- </author>
-
- <author>
- <firstname>Rob</firstname>
- <surname>Landley</surname>
- <affiliation>
- <address>
- <email>rob@landley.net</email>
- </address>
- </affiliation>
- </author>
-
- </authorgroup>
-
- <copyright>
- <year>2007</year>
- <holder>Linux Foundation</holder>
- </copyright>
-
- <legalnotice>
- <para>
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License version 2.
- </para>
-
- <para>
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- For more details see the file COPYING in the source
- distribution of Linux.
- </para>
- </legalnotice>
- </bookinfo>
-
- <toc></toc>
-
- <chapter id="intro">
- <title>Introduction</title>
- <sect1 id="protocol_vs_bus">
- <title>Protocol vs bus</title>
- <para>
- Once upon a time, the Small Computer Systems Interface defined both
- a parallel I/O bus and a data protocol to connect a wide variety of
- peripherals (disk drives, tape drives, modems, printers, scanners,
- optical drives, test equipment, and medical devices) to a host
- computer.
- </para>
- <para>
- Although the old parallel (fast/wide/ultra) SCSI bus has largely
- fallen out of use, the SCSI command set is more widely used than ever
- to communicate with devices over a number of different busses.
- </para>
- <para>
- The <ulink url='http://www.t10.org/scsi-3.htm'>SCSI protocol</ulink>
- is a big-endian peer-to-peer packet based protocol. SCSI commands
- are 6, 10, 12, or 16 bytes long, often followed by an associated data
- payload.
- </para>
- <para>
- SCSI commands can be transported over just about any kind of bus, and
- are the default protocol for storage devices attached to USB, SATA,
- SAS, Fibre Channel, FireWire, and ATAPI devices. SCSI packets are
- also commonly exchanged over Infiniband,
- <ulink url='http://i2o.shadowconnect.com/faq.php'>I20</ulink>, TCP/IP
- (<ulink url='https://en.wikipedia.org/wiki/ISCSI'>iSCSI</ulink>), even
- <ulink url='http://cyberelk.net/tim/parport/parscsi.html'>Parallel
- ports</ulink>.
- </para>
- </sect1>
- <sect1 id="subsystem_design">
- <title>Design of the Linux SCSI subsystem</title>
- <para>
- The SCSI subsystem uses a three layer design, with upper, mid, and low
- layers. Every operation involving the SCSI subsystem (such as reading
- a sector from a disk) uses one driver at each of the 3 levels: one
- upper layer driver, one lower layer driver, and the SCSI midlayer.
- </para>
- <para>
- The SCSI upper layer provides the interface between userspace and the
- kernel, in the form of block and char device nodes for I/O and
- ioctl(). The SCSI lower layer contains drivers for specific hardware
- devices.
- </para>
- <para>
- In between is the SCSI mid-layer, analogous to a network routing
- layer such as the IPv4 stack. The SCSI mid-layer routes a packet
- based data protocol between the upper layer's /dev nodes and the
- corresponding devices in the lower layer. It manages command queues,
- provides error handling and power management functions, and responds
- to ioctl() requests.
- </para>
- </sect1>
- </chapter>
-
- <chapter id="upper_layer">
- <title>SCSI upper layer</title>
- <para>
- The upper layer supports the user-kernel interface by providing
- device nodes.
- </para>
- <sect1 id="sd">
- <title>sd (SCSI Disk)</title>
- <para>sd (sd_mod.o)</para>
-<!-- !Idrivers/scsi/sd.c -->
- </sect1>
- <sect1 id="sr">
- <title>sr (SCSI CD-ROM)</title>
- <para>sr (sr_mod.o)</para>
- </sect1>
- <sect1 id="st">
- <title>st (SCSI Tape)</title>
- <para>st (st.o)</para>
- </sect1>
- <sect1 id="sg">
- <title>sg (SCSI Generic)</title>
- <para>sg (sg.o)</para>
- </sect1>
- <sect1 id="ch">
- <title>ch (SCSI Media Changer)</title>
- <para>ch (ch.c)</para>
- </sect1>
- </chapter>
-
- <chapter id="mid_layer">
- <title>SCSI mid layer</title>
-
- <sect1 id="midlayer_implementation">
- <title>SCSI midlayer implementation</title>
- <sect2 id="scsi_device.h">
- <title>include/scsi/scsi_device.h</title>
- <para>
- </para>
-!Iinclude/scsi/scsi_device.h
- </sect2>
-
- <sect2 id="scsi.c">
- <title>drivers/scsi/scsi.c</title>
- <para>Main file for the SCSI midlayer.</para>
-!Edrivers/scsi/scsi.c
- </sect2>
- <sect2 id="scsicam.c">
- <title>drivers/scsi/scsicam.c</title>
- <para>
- <ulink url='http://www.t10.org/ftp/t10/drafts/cam/cam-r12b.pdf'>SCSI
- Common Access Method</ulink> support functions, for use with
- HDIO_GETGEO, etc.
- </para>
-!Edrivers/scsi/scsicam.c
- </sect2>
- <sect2 id="scsi_error.c">
- <title>drivers/scsi/scsi_error.c</title>
- <para>Common SCSI error/timeout handling routines.</para>
-!Edrivers/scsi/scsi_error.c
- </sect2>
- <sect2 id="scsi_devinfo.c">
- <title>drivers/scsi/scsi_devinfo.c</title>
- <para>
- Manage scsi_dev_info_list, which tracks blacklisted and whitelisted
- devices.
- </para>
-!Idrivers/scsi/scsi_devinfo.c
- </sect2>
- <sect2 id="scsi_ioctl.c">
- <title>drivers/scsi/scsi_ioctl.c</title>
- <para>
- Handle ioctl() calls for SCSI devices.
- </para>
-!Edrivers/scsi/scsi_ioctl.c
- </sect2>
- <sect2 id="scsi_lib.c">
- <title>drivers/scsi/scsi_lib.c</title>
- <para>
- SCSI queuing library.
- </para>
-!Edrivers/scsi/scsi_lib.c
- </sect2>
- <sect2 id="scsi_lib_dma.c">
- <title>drivers/scsi/scsi_lib_dma.c</title>
- <para>
- SCSI library functions depending on DMA
- (map and unmap scatter-gather lists).
- </para>
-!Edrivers/scsi/scsi_lib_dma.c
- </sect2>
- <sect2 id="scsi_module.c">
- <title>drivers/scsi/scsi_module.c</title>
- <para>
- The file drivers/scsi/scsi_module.c contains legacy support for
- old-style host templates. It should never be used by any new driver.
- </para>
- </sect2>
- <sect2 id="scsi_proc.c">
- <title>drivers/scsi/scsi_proc.c</title>
- <para>
- The functions in this file provide an interface between
- the PROC file system and the SCSI device drivers
- It is mainly used for debugging, statistics and to pass
- information directly to the lowlevel driver.
-
- I.E. plumbing to manage /proc/scsi/*
- </para>
-!Idrivers/scsi/scsi_proc.c
- </sect2>
- <sect2 id="scsi_netlink.c">
- <title>drivers/scsi/scsi_netlink.c</title>
- <para>
- Infrastructure to provide async events from transports to userspace
- via netlink, using a single NETLINK_SCSITRANSPORT protocol for all
- transports.
-
- See <ulink url='http://marc.info/?l=linux-scsi&amp;m=115507374832500&amp;w=2'>the
- original patch submission</ulink> for more details.
- </para>
-!Idrivers/scsi/scsi_netlink.c
- </sect2>
- <sect2 id="scsi_scan.c">
- <title>drivers/scsi/scsi_scan.c</title>
- <para>
- Scan a host to determine which (if any) devices are attached.
-
- The general scanning/probing algorithm is as follows, exceptions are
- made to it depending on device specific flags, compilation options,
- and global variable (boot or module load time) settings.
-
- A specific LUN is scanned via an INQUIRY command; if the LUN has a
- device attached, a scsi_device is allocated and setup for it.
-
- For every id of every channel on the given host, start by scanning
- LUN 0. Skip hosts that don't respond at all to a scan of LUN 0.
- Otherwise, if LUN 0 has a device attached, allocate and setup a
- scsi_device for it. If target is SCSI-3 or up, issue a REPORT LUN,
- and scan all of the LUNs returned by the REPORT LUN; else,
- sequentially scan LUNs up until some maximum is reached, or a LUN is
- seen that cannot have a device attached to it.
- </para>
-!Idrivers/scsi/scsi_scan.c
- </sect2>
- <sect2 id="scsi_sysctl.c">
- <title>drivers/scsi/scsi_sysctl.c</title>
- <para>
- Set up the sysctl entry: "/dev/scsi/logging_level"
- (DEV_SCSI_LOGGING_LEVEL) which sets/returns scsi_logging_level.
- </para>
- </sect2>
- <sect2 id="scsi_sysfs.c">
- <title>drivers/scsi/scsi_sysfs.c</title>
- <para>
- SCSI sysfs interface routines.
- </para>
-!Edrivers/scsi/scsi_sysfs.c
- </sect2>
- <sect2 id="hosts.c">
- <title>drivers/scsi/hosts.c</title>
- <para>
- mid to lowlevel SCSI driver interface
- </para>
-!Edrivers/scsi/hosts.c
- </sect2>
- <sect2 id="constants.c">
- <title>drivers/scsi/constants.c</title>
- <para>
- mid to lowlevel SCSI driver interface
- </para>
-!Edrivers/scsi/constants.c
- </sect2>
- </sect1>
-
- <sect1 id="Transport_classes">
- <title>Transport classes</title>
- <para>
- Transport classes are service libraries for drivers in the SCSI
- lower layer, which expose transport attributes in sysfs.
- </para>
- <sect2 id="Fibre_Channel_transport">
- <title>Fibre Channel transport</title>
- <para>
- The file drivers/scsi/scsi_transport_fc.c defines transport attributes
- for Fibre Channel.
- </para>
-!Edrivers/scsi/scsi_transport_fc.c
- </sect2>
- <sect2 id="iSCSI_transport">
- <title>iSCSI transport class</title>
- <para>
- The file drivers/scsi/scsi_transport_iscsi.c defines transport
- attributes for the iSCSI class, which sends SCSI packets over TCP/IP
- connections.
- </para>
-!Edrivers/scsi/scsi_transport_iscsi.c
- </sect2>
- <sect2 id="SAS_transport">
- <title>Serial Attached SCSI (SAS) transport class</title>
- <para>
- The file drivers/scsi/scsi_transport_sas.c defines transport
- attributes for Serial Attached SCSI, a variant of SATA aimed at
- large high-end systems.
- </para>
- <para>
- The SAS transport class contains common code to deal with SAS HBAs,
- an aproximated representation of SAS topologies in the driver model,
- and various sysfs attributes to expose these topologies and management
- interfaces to userspace.
- </para>
- <para>
- In addition to the basic SCSI core objects this transport class
- introduces two additional intermediate objects: The SAS PHY
- as represented by struct sas_phy defines an "outgoing" PHY on
- a SAS HBA or Expander, and the SAS remote PHY represented by
- struct sas_rphy defines an "incoming" PHY on a SAS Expander or
- end device. Note that this is purely a software concept, the
- underlying hardware for a PHY and a remote PHY is the exactly
- the same.
- </para>
- <para>
- There is no concept of a SAS port in this code, users can see
- what PHYs form a wide port based on the port_identifier attribute,
- which is the same for all PHYs in a port.
- </para>
-!Edrivers/scsi/scsi_transport_sas.c
- </sect2>
- <sect2 id="SATA_transport">
- <title>SATA transport class</title>
- <para>
- The SATA transport is handled by libata, which has its own book of
- documentation in this directory.
- </para>
- </sect2>
- <sect2 id="SPI_transport">
- <title>Parallel SCSI (SPI) transport class</title>
- <para>
- The file drivers/scsi/scsi_transport_spi.c defines transport
- attributes for traditional (fast/wide/ultra) SCSI busses.
- </para>
-!Edrivers/scsi/scsi_transport_spi.c
- </sect2>
- <sect2 id="SRP_transport">
- <title>SCSI RDMA (SRP) transport class</title>
- <para>
- The file drivers/scsi/scsi_transport_srp.c defines transport
- attributes for SCSI over Remote Direct Memory Access.
- </para>
-!Edrivers/scsi/scsi_transport_srp.c
- </sect2>
- </sect1>
-
- </chapter>
-
- <chapter id="lower_layer">
- <title>SCSI lower layer</title>
- <sect1 id="hba_drivers">
- <title>Host Bus Adapter transport types</title>
- <para>
- Many modern device controllers use the SCSI command set as a protocol to
- communicate with their devices through many different types of physical
- connections.
- </para>
- <para>
- In SCSI language a bus capable of carrying SCSI commands is
- called a "transport", and a controller connecting to such a bus is
- called a "host bus adapter" (HBA).
- </para>
- <sect2 id="scsi_debug.c">
- <title>Debug transport</title>
- <para>
- The file drivers/scsi/scsi_debug.c simulates a host adapter with a
- variable number of disks (or disk like devices) attached, sharing a
- common amount of RAM. Does a lot of checking to make sure that we are
- not getting blocks mixed up, and panics the kernel if anything out of
- the ordinary is seen.
- </para>
- <para>
- To be more realistic, the simulated devices have the transport
- attributes of SAS disks.
- </para>
- <para>
- For documentation see
- <ulink url='http://sg.danny.cz/sg/sdebug26.html'>http://sg.danny.cz/sg/sdebug26.html</ulink>
- </para>
-<!-- !Edrivers/scsi/scsi_debug.c -->
- </sect2>
- <sect2 id="todo">
- <title>todo</title>
- <para>Parallel (fast/wide/ultra) SCSI, USB, SATA,
- SAS, Fibre Channel, FireWire, ATAPI devices, Infiniband,
- I20, iSCSI, Parallel ports, netlink...
- </para>
- </sect2>
- </sect1>
- </chapter>
-</book>
diff --git a/Documentation/DocBook/sh.tmpl b/Documentation/DocBook/sh.tmpl
deleted file mode 100644
index 4a38f604fa66..000000000000
--- a/Documentation/DocBook/sh.tmpl
+++ /dev/null
@@ -1,105 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
- "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
-
-<book id="sh-drivers">
- <bookinfo>
- <title>SuperH Interfaces Guide</title>
-
- <authorgroup>
- <author>
- <firstname>Paul</firstname>
- <surname>Mundt</surname>
- <affiliation>
- <address>
- <email>lethal@linux-sh.org</email>
- </address>
- </affiliation>
- </author>
- </authorgroup>
-
- <copyright>
- <year>2008-2010</year>
- <holder>Paul Mundt</holder>
- </copyright>
- <copyright>
- <year>2008-2010</year>
- <holder>Renesas Technology Corp.</holder>
- </copyright>
- <copyright>
- <year>2010</year>
- <holder>Renesas Electronics Corp.</holder>
- </copyright>
-
- <legalnotice>
- <para>
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License version 2 as published by the Free Software Foundation.
- </para>
-
- <para>
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
- </para>
-
- <para>
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
- </para>
-
- <para>
- For more details see the file COPYING in the source
- distribution of Linux.
- </para>
- </legalnotice>
- </bookinfo>
-
-<toc></toc>
-
- <chapter id="mm">
- <title>Memory Management</title>
- <sect1 id="sh4">
- <title>SH-4</title>
- <sect2 id="sq">
- <title>Store Queue API</title>
-!Earch/sh/kernel/cpu/sh4/sq.c
- </sect2>
- </sect1>
- <sect1 id="sh5">
- <title>SH-5</title>
- <sect2 id="tlb">
- <title>TLB Interfaces</title>
-!Iarch/sh/mm/tlb-sh5.c
-!Iarch/sh/include/asm/tlb_64.h
- </sect2>
- </sect1>
- </chapter>
- <chapter id="mach">
- <title>Machine Specific Interfaces</title>
- <sect1 id="dreamcast">
- <title>mach-dreamcast</title>
-!Iarch/sh/boards/mach-dreamcast/rtc.c
- </sect1>
- <sect1 id="x3proto">
- <title>mach-x3proto</title>
-!Earch/sh/boards/mach-x3proto/ilsel.c
- </sect1>
- </chapter>
- <chapter id="busses">
- <title>Busses</title>
- <sect1 id="superhyway">
- <title>SuperHyway</title>
-!Edrivers/sh/superhyway/superhyway.c
- </sect1>
-
- <sect1 id="maple">
- <title>Maple</title>
-!Edrivers/sh/maple/maple.c
- </sect1>
- </chapter>
-</book>
diff --git a/Documentation/DocBook/stylesheet.xsl b/Documentation/DocBook/stylesheet.xsl
deleted file mode 100644
index 3bf4ecf3d760..000000000000
--- a/Documentation/DocBook/stylesheet.xsl
+++ /dev/null
@@ -1,11 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<stylesheet xmlns="http://www.w3.org/1999/XSL/Transform" version="1.0">
-<param name="chunk.quietly">1</param>
-<param name="funcsynopsis.style">ansi</param>
-<param name="funcsynopsis.tabular.threshold">80</param>
-<param name="callout.graphics">0</param>
-<!-- <param name="paper.type">A4</param> -->
-<param name="generate.consistent.ids">1</param>
-<param name="generate.section.toc.level">2</param>
-<param name="use.id.as.filename">1</param>
-</stylesheet>
diff --git a/Documentation/DocBook/w1.tmpl b/Documentation/DocBook/w1.tmpl
deleted file mode 100644
index b0228d4c81bb..000000000000
--- a/Documentation/DocBook/w1.tmpl
+++ /dev/null
@@ -1,101 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
- "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
-
-<book id="w1id">
- <bookinfo>
- <title>W1: Dallas' 1-wire bus</title>
-
- <authorgroup>
- <author>
- <firstname>David</firstname>
- <surname>Fries</surname>
- <affiliation>
- <address>
- <email>David@Fries.net</email>
- </address>
- </affiliation>
- </author>
-
- </authorgroup>
-
- <copyright>
- <year>2013</year>
- <!--
- <holder></holder>
- -->
- </copyright>
-
- <legalnotice>
- <para>
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License version 2.
- </para>
-
- <para>
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- For more details see the file COPYING in the source
- distribution of Linux.
- </para>
- </legalnotice>
- </bookinfo>
-
- <toc></toc>
-
- <chapter id="w1_internal">
- <title>W1 API internal to the kernel</title>
-
- <sect1 id="w1_internal_api">
- <title>W1 API internal to the kernel</title>
- <sect2 id="w1.h">
- <title>drivers/w1/w1.h</title>
- <para>W1 core functions.</para>
-!Idrivers/w1/w1.h
- </sect2>
-
- <sect2 id="w1.c">
- <title>drivers/w1/w1.c</title>
- <para>W1 core functions.</para>
-!Idrivers/w1/w1.c
- </sect2>
-
- <sect2 id="w1_family.h">
- <title>drivers/w1/w1_family.h</title>
- <para>Allows registering device family operations.</para>
-!Idrivers/w1/w1_family.h
- </sect2>
-
- <sect2 id="w1_family.c">
- <title>drivers/w1/w1_family.c</title>
- <para>Allows registering device family operations.</para>
-!Edrivers/w1/w1_family.c
- </sect2>
-
- <sect2 id="w1_int.c">
- <title>drivers/w1/w1_int.c</title>
- <para>W1 internal initialization for master devices.</para>
-!Edrivers/w1/w1_int.c
- </sect2>
-
- <sect2 id="w1_netlink.h">
- <title>drivers/w1/w1_netlink.h</title>
- <para>W1 external netlink API structures and commands.</para>
-!Idrivers/w1/w1_netlink.h
- </sect2>
-
- <sect2 id="w1_io.c">
- <title>drivers/w1/w1_io.c</title>
- <para>W1 input/output.</para>
-!Edrivers/w1/w1_io.c
-!Idrivers/w1/w1_io.c
- </sect2>
-
- </sect1>
-
-
- </chapter>
-
-</book>
diff --git a/Documentation/DocBook/z8530book.tmpl b/Documentation/DocBook/z8530book.tmpl
deleted file mode 100644
index 6f3883be877e..000000000000
--- a/Documentation/DocBook/z8530book.tmpl
+++ /dev/null
@@ -1,371 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
- "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
-
-<book id="Z85230Guide">
- <bookinfo>
- <title>Z8530 Programming Guide</title>
-
- <authorgroup>
- <author>
- <firstname>Alan</firstname>
- <surname>Cox</surname>
- <affiliation>
- <address>
- <email>alan@lxorguk.ukuu.org.uk</email>
- </address>
- </affiliation>
- </author>
- </authorgroup>
-
- <copyright>
- <year>2000</year>
- <holder>Alan Cox</holder>
- </copyright>
-
- <legalnotice>
- <para>
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later
- version.
- </para>
-
- <para>
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
- </para>
-
- <para>
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
- </para>
-
- <para>
- For more details see the file COPYING in the source
- distribution of Linux.
- </para>
- </legalnotice>
- </bookinfo>
-
-<toc></toc>
-
- <chapter id="intro">
- <title>Introduction</title>
- <para>
- The Z85x30 family synchronous/asynchronous controller chips are
- used on a large number of cheap network interface cards. The
- kernel provides a core interface layer that is designed to make
- it easy to provide WAN services using this chip.
- </para>
- <para>
- The current driver only support synchronous operation. Merging the
- asynchronous driver support into this code to allow any Z85x30
- device to be used as both a tty interface and as a synchronous
- controller is a project for Linux post the 2.4 release
- </para>
- </chapter>
-
- <chapter id="Driver_Modes">
- <title>Driver Modes</title>
- <para>
- The Z85230 driver layer can drive Z8530, Z85C30 and Z85230 devices
- in three different modes. Each mode can be applied to an individual
- channel on the chip (each chip has two channels).
- </para>
- <para>
- The PIO synchronous mode supports the most common Z8530 wiring. Here
- the chip is interface to the I/O and interrupt facilities of the
- host machine but not to the DMA subsystem. When running PIO the
- Z8530 has extremely tight timing requirements. Doing high speeds,
- even with a Z85230 will be tricky. Typically you should expect to
- achieve at best 9600 baud with a Z8C530 and 64Kbits with a Z85230.
- </para>
- <para>
- The DMA mode supports the chip when it is configured to use dual DMA
- channels on an ISA bus. The better cards tend to support this mode
- of operation for a single channel. With DMA running the Z85230 tops
- out when it starts to hit ISA DMA constraints at about 512Kbits. It
- is worth noting here that many PC machines hang or crash when the
- chip is driven fast enough to hold the ISA bus solid.
- </para>
- <para>
- Transmit DMA mode uses a single DMA channel. The DMA channel is used
- for transmission as the transmit FIFO is smaller than the receive
- FIFO. it gives better performance than pure PIO mode but is nowhere
- near as ideal as pure DMA mode.
- </para>
- </chapter>
-
- <chapter id="Using_the_Z85230_driver">
- <title>Using the Z85230 driver</title>
- <para>
- The Z85230 driver provides the back end interface to your board. To
- configure a Z8530 interface you need to detect the board and to
- identify its ports and interrupt resources. It is also your problem
- to verify the resources are available.
- </para>
- <para>
- Having identified the chip you need to fill in a struct z8530_dev,
- which describes each chip. This object must exist until you finally
- shutdown the board. Firstly zero the active field. This ensures
- nothing goes off without you intending it. The irq field should
- be set to the interrupt number of the chip. (Each chip has a single
- interrupt source rather than each channel). You are responsible
- for allocating the interrupt line. The interrupt handler should be
- set to <function>z8530_interrupt</function>. The device id should
- be set to the z8530_dev structure pointer. Whether the interrupt can
- be shared or not is board dependent, and up to you to initialise.
- </para>
- <para>
- The structure holds two channel structures.
- Initialise chanA.ctrlio and chanA.dataio with the address of the
- control and data ports. You can or this with Z8530_PORT_SLEEP to
- indicate your interface needs the 5uS delay for chip settling done
- in software. The PORT_SLEEP option is architecture specific. Other
- flags may become available on future platforms, eg for MMIO.
- Initialise the chanA.irqs to &amp;z8530_nop to start the chip up
- as disabled and discarding interrupt events. This ensures that
- stray interrupts will be mopped up and not hang the bus. Set
- chanA.dev to point to the device structure itself. The
- private and name field you may use as you wish. The private field
- is unused by the Z85230 layer. The name is used for error reporting
- and it may thus make sense to make it match the network name.
- </para>
- <para>
- Repeat the same operation with the B channel if your chip has
- both channels wired to something useful. This isn't always the
- case. If it is not wired then the I/O values do not matter, but
- you must initialise chanB.dev.
- </para>
- <para>
- If your board has DMA facilities then initialise the txdma and
- rxdma fields for the relevant channels. You must also allocate the
- ISA DMA channels and do any necessary board level initialisation
- to configure them. The low level driver will do the Z8530 and
- DMA controller programming but not board specific magic.
- </para>
- <para>
- Having initialised the device you can then call
- <function>z8530_init</function>. This will probe the chip and
- reset it into a known state. An identification sequence is then
- run to identify the chip type. If the checks fail to pass the
- function returns a non zero error code. Typically this indicates
- that the port given is not valid. After this call the
- type field of the z8530_dev structure is initialised to either
- Z8530, Z85C30 or Z85230 according to the chip found.
- </para>
- <para>
- Once you have called z8530_init you can also make use of the utility
- function <function>z8530_describe</function>. This provides a
- consistent reporting format for the Z8530 devices, and allows all
- the drivers to provide consistent reporting.
- </para>
- </chapter>
-
- <chapter id="Attaching_Network_Interfaces">
- <title>Attaching Network Interfaces</title>
- <para>
- If you wish to use the network interface facilities of the driver,
- then you need to attach a network device to each channel that is
- present and in use. In addition to use the generic HDLC
- you need to follow some additional plumbing rules. They may seem
- complex but a look at the example hostess_sv11 driver should
- reassure you.
- </para>
- <para>
- The network device used for each channel should be pointed to by
- the netdevice field of each channel. The hdlc-&gt; priv field of the
- network device points to your private data - you will need to be
- able to find your private data from this.
- </para>
- <para>
- The way most drivers approach this particular problem is to
- create a structure holding the Z8530 device definition and
- put that into the private field of the network device. The
- network device fields of the channels then point back to the
- network devices.
- </para>
- <para>
- If you wish to use the generic HDLC then you need to register
- the HDLC device.
- </para>
- <para>
- Before you register your network device you will also need to
- provide suitable handlers for most of the network device callbacks.
- See the network device documentation for more details on this.
- </para>
- </chapter>
-
- <chapter id="Configuring_And_Activating_The_Port">
- <title>Configuring And Activating The Port</title>
- <para>
- The Z85230 driver provides helper functions and tables to load the
- port registers on the Z8530 chips. When programming the register
- settings for a channel be aware that the documentation recommends
- initialisation orders. Strange things happen when these are not
- followed.
- </para>
- <para>
- <function>z8530_channel_load</function> takes an array of
- pairs of initialisation values in an array of u8 type. The first
- value is the Z8530 register number. Add 16 to indicate the alternate
- register bank on the later chips. The array is terminated by a 255.
- </para>
- <para>
- The driver provides a pair of public tables. The
- z8530_hdlc_kilostream table is for the UK 'Kilostream' service and
- also happens to cover most other end host configurations. The
- z8530_hdlc_kilostream_85230 table is the same configuration using
- the enhancements of the 85230 chip. The configuration loaded is
- standard NRZ encoded synchronous data with HDLC bitstuffing. All
- of the timing is taken from the other end of the link.
- </para>
- <para>
- When writing your own tables be aware that the driver internally
- tracks register values. It may need to reload values. You should
- therefore be sure to set registers 1-7, 9-11, 14 and 15 in all
- configurations. Where the register settings depend on DMA selection
- the driver will update the bits itself when you open or close.
- Loading a new table with the interface open is not recommended.
- </para>
- <para>
- There are three standard configurations supported by the core
- code. In PIO mode the interface is programmed up to use
- interrupt driven PIO. This places high demands on the host processor
- to avoid latency. The driver is written to take account of latency
- issues but it cannot avoid latencies caused by other drivers,
- notably IDE in PIO mode. Because the drivers allocate buffers you
- must also prevent MTU changes while the port is open.
- </para>
- <para>
- Once the port is open it will call the rx_function of each channel
- whenever a completed packet arrived. This is invoked from
- interrupt context and passes you the channel and a network
- buffer (struct sk_buff) holding the data. The data includes
- the CRC bytes so most users will want to trim the last two
- bytes before processing the data. This function is very timing
- critical. When you wish to simply discard data the support
- code provides the function <function>z8530_null_rx</function>
- to discard the data.
- </para>
- <para>
- To active PIO mode sending and receiving the <function>
- z8530_sync_open</function> is called. This expects to be passed
- the network device and the channel. Typically this is called from
- your network device open callback. On a failure a non zero error
- status is returned. The <function>z8530_sync_close</function>
- function shuts down a PIO channel. This must be done before the
- channel is opened again and before the driver shuts down
- and unloads.
- </para>
- <para>
- The ideal mode of operation is dual channel DMA mode. Here the
- kernel driver will configure the board for DMA in both directions.
- The driver also handles ISA DMA issues such as controller
- programming and the memory range limit for you. This mode is
- activated by calling the <function>z8530_sync_dma_open</function>
- function. On failure a non zero error value is returned.
- Once this mode is activated it can be shut down by calling the
- <function>z8530_sync_dma_close</function>. You must call the close
- function matching the open mode you used.
- </para>
- <para>
- The final supported mode uses a single DMA channel to drive the
- transmit side. As the Z85C30 has a larger FIFO on the receive
- channel this tends to increase the maximum speed a little.
- This is activated by calling the <function>z8530_sync_txdma_open
- </function>. This returns a non zero error code on failure. The
- <function>z8530_sync_txdma_close</function> function closes down
- the Z8530 interface from this mode.
- </para>
- </chapter>
-
- <chapter id="Network_Layer_Functions">
- <title>Network Layer Functions</title>
- <para>
- The Z8530 layer provides functions to queue packets for
- transmission. The driver internally buffers the frame currently
- being transmitted and one further frame (in order to keep back
- to back transmission running). Any further buffering is up to
- the caller.
- </para>
- <para>
- The function <function>z8530_queue_xmit</function> takes a network
- buffer in sk_buff format and queues it for transmission. The
- caller must provide the entire packet with the exception of the
- bitstuffing and CRC. This is normally done by the caller via
- the generic HDLC interface layer. It returns 0 if the buffer has been
- queued and non zero values for queue full. If the function accepts
- the buffer it becomes property of the Z8530 layer and the caller
- should not free it.
- </para>
- <para>
- The function <function>z8530_get_stats</function> returns a pointer
- to an internally maintained per interface statistics block. This
- provides most of the interface code needed to implement the network
- layer get_stats callback.
- </para>
- </chapter>
-
- <chapter id="Porting_The_Z8530_Driver">
- <title>Porting The Z8530 Driver</title>
- <para>
- The Z8530 driver is written to be portable. In DMA mode it makes
- assumptions about the use of ISA DMA. These are probably warranted
- in most cases as the Z85230 in particular was designed to glue to PC
- type machines. The PIO mode makes no real assumptions.
- </para>
- <para>
- Should you need to retarget the Z8530 driver to another architecture
- the only code that should need changing are the port I/O functions.
- At the moment these assume PC I/O port accesses. This may not be
- appropriate for all platforms. Replacing
- <function>z8530_read_port</function> and <function>z8530_write_port
- </function> is intended to be all that is required to port this
- driver layer.
- </para>
- </chapter>
-
- <chapter id="bugs">
- <title>Known Bugs And Assumptions</title>
- <para>
- <variablelist>
- <varlistentry><term>Interrupt Locking</term>
- <listitem>
- <para>
- The locking in the driver is done via the global cli/sti lock. This
- makes for relatively poor SMP performance. Switching this to use a
- per device spin lock would probably materially improve performance.
- </para>
- </listitem></varlistentry>
-
- <varlistentry><term>Occasional Failures</term>
- <listitem>
- <para>
- We have reports of occasional failures when run for very long
- periods of time and the driver starts to receive junk frames. At
- the moment the cause of this is not clear.
- </para>
- </listitem></varlistentry>
- </variablelist>
-
- </para>
- </chapter>
-
- <chapter id="pubfunctions">
- <title>Public Functions Provided</title>
-!Edrivers/net/wan/z85230.c
- </chapter>
-
- <chapter id="intfunctions">
- <title>Internal Functions</title>
-!Idrivers/net/wan/z85230.c
- </chapter>
-
-</book>