diff options
Diffstat (limited to 'Documentation/DocBook')
-rw-r--r-- | Documentation/DocBook/.gitignore | 17 | ||||
-rw-r--r-- | Documentation/DocBook/Makefile | 282 | ||||
-rw-r--r-- | Documentation/DocBook/filesystems.tmpl | 381 | ||||
-rw-r--r-- | Documentation/DocBook/kernel-hacking.tmpl | 1312 | ||||
-rw-r--r-- | Documentation/DocBook/kernel-locking.tmpl | 2151 | ||||
-rw-r--r-- | Documentation/DocBook/kgdb.tmpl | 918 | ||||
-rw-r--r-- | Documentation/DocBook/libata.tmpl | 1625 | ||||
-rw-r--r-- | Documentation/DocBook/librs.tmpl | 289 | ||||
-rw-r--r-- | Documentation/DocBook/lsm.tmpl | 265 | ||||
-rw-r--r-- | Documentation/DocBook/mtdnand.tmpl | 1291 | ||||
-rw-r--r-- | Documentation/DocBook/networking.tmpl | 111 | ||||
-rw-r--r-- | Documentation/DocBook/rapidio.tmpl | 155 | ||||
-rw-r--r-- | Documentation/DocBook/s390-drivers.tmpl | 161 | ||||
-rw-r--r-- | Documentation/DocBook/scsi.tmpl | 409 | ||||
-rw-r--r-- | Documentation/DocBook/sh.tmpl | 105 | ||||
-rw-r--r-- | Documentation/DocBook/stylesheet.xsl | 11 | ||||
-rw-r--r-- | Documentation/DocBook/w1.tmpl | 101 | ||||
-rw-r--r-- | Documentation/DocBook/z8530book.tmpl | 371 |
18 files changed, 0 insertions, 9955 deletions
diff --git a/Documentation/DocBook/.gitignore b/Documentation/DocBook/.gitignore deleted file mode 100644 index e05da3f7aa21..000000000000 --- a/Documentation/DocBook/.gitignore +++ /dev/null @@ -1,17 +0,0 @@ -*.xml -*.ps -*.pdf -*.html -*.9.gz -*.9 -*.aux -*.dvi -*.log -*.out -*.png -*.gif -*.svg -*.proc -*.db -media-indices.tmpl -media-entities.tmpl diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile deleted file mode 100644 index 85916f13d330..000000000000 --- a/Documentation/DocBook/Makefile +++ /dev/null @@ -1,282 +0,0 @@ -### -# This makefile is used to generate the kernel documentation, -# primarily based on in-line comments in various source files. -# See Documentation/kernel-doc-nano-HOWTO.txt for instruction in how -# to document the SRC - and how to read it. -# To add a new book the only step required is to add the book to the -# list of DOCBOOKS. - -DOCBOOKS := z8530book.xml \ - kernel-hacking.xml kernel-locking.xml \ - networking.xml \ - filesystems.xml lsm.xml kgdb.xml \ - libata.xml mtdnand.xml librs.xml rapidio.xml \ - s390-drivers.xml scsi.xml \ - sh.xml w1.xml - -ifeq ($(DOCBOOKS),) - -# Skip DocBook build if the user explicitly requested no DOCBOOKS. -.DEFAULT: - @echo " SKIP DocBook $@ target (DOCBOOKS=\"\" specified)." -else -ifneq ($(SPHINXDIRS),) - -# Skip DocBook build if the user explicitly requested a sphinx dir -.DEFAULT: - @echo " SKIP DocBook $@ target (SPHINXDIRS specified)." -else - - -### -# The build process is as follows (targets): -# (xmldocs) [by docproc] -# file.tmpl --> file.xml +--> file.ps (psdocs) [by db2ps or xmlto] -# +--> file.pdf (pdfdocs) [by db2pdf or xmlto] -# +--> DIR=file (htmldocs) [by xmlto] -# +--> man/ (mandocs) [by xmlto] - - -# for PDF and PS output you can choose between xmlto and docbook-utils tools -PDF_METHOD = $(prefer-db2x) -PS_METHOD = $(prefer-db2x) - - -targets += $(DOCBOOKS) -BOOKS := $(addprefix $(obj)/,$(DOCBOOKS)) -xmldocs: $(BOOKS) -sgmldocs: xmldocs - -PS := $(patsubst %.xml, %.ps, $(BOOKS)) -psdocs: $(PS) - -PDF := $(patsubst %.xml, %.pdf, $(BOOKS)) -pdfdocs: $(PDF) - -HTML := $(sort $(patsubst %.xml, %.html, $(BOOKS))) -htmldocs: $(HTML) - $(call cmd,build_main_index) - -MAN := $(patsubst %.xml, %.9, $(BOOKS)) -mandocs: $(MAN) - find $(obj)/man -name '*.9' | xargs gzip -nf - -# Default location for installed man pages -export INSTALL_MAN_PATH = $(objtree)/usr - -installmandocs: mandocs - mkdir -p $(INSTALL_MAN_PATH)/man/man9/ - find $(obj)/man -name '*.9.gz' -printf '%h %f\n' | \ - sort -k 2 -k 1 | uniq -f 1 | sed -e 's: :/:' | \ - xargs install -m 644 -t $(INSTALL_MAN_PATH)/man/man9/ - -# no-op for the DocBook toolchain -epubdocs: -latexdocs: -linkcheckdocs: - -### -#External programs used -KERNELDOCXMLREF = $(srctree)/scripts/kernel-doc-xml-ref -KERNELDOC = $(srctree)/scripts/kernel-doc -DOCPROC = $(objtree)/scripts/docproc -CHECK_LC_CTYPE = $(objtree)/scripts/check-lc_ctype - -# Use a fixed encoding - UTF-8 if the C library has support built-in -# or ASCII if not -LC_CTYPE := $(call try-run, LC_CTYPE=C.UTF-8 $(CHECK_LC_CTYPE),C.UTF-8,C) -export LC_CTYPE - -XMLTOFLAGS = -m $(srctree)/$(src)/stylesheet.xsl -XMLTOFLAGS += --skip-validation - -### -# DOCPROC is used for two purposes: -# 1) To generate a dependency list for a .tmpl file -# 2) To preprocess a .tmpl file and call kernel-doc with -# appropriate parameters. -# The following rules are used to generate the .xml documentation -# required to generate the final targets. (ps, pdf, html). -quiet_cmd_docproc = DOCPROC $@ - cmd_docproc = SRCTREE=$(srctree)/ $(DOCPROC) doc $< >$@ -define rule_docproc - set -e; \ - $(if $($(quiet)cmd_$(1)),echo ' $($(quiet)cmd_$(1))';) \ - $(cmd_$(1)); \ - ( \ - echo 'cmd_$@ := $(cmd_$(1))'; \ - echo $@: `SRCTREE=$(srctree) $(DOCPROC) depend $<`; \ - ) > $(dir $@).$(notdir $@).cmd -endef - -%.xml: %.tmpl $(KERNELDOC) $(DOCPROC) $(KERNELDOCXMLREF) FORCE - $(call if_changed_rule,docproc) - -# Tell kbuild to always build the programs -always := $(hostprogs-y) - -notfoundtemplate = echo "*** You have to install docbook-utils or xmlto ***"; \ - exit 1 -db2xtemplate = db2TYPE -o $(dir $@) $< -xmltotemplate = xmlto TYPE $(XMLTOFLAGS) -o $(dir $@) $< - -# determine which methods are available -ifeq ($(shell which db2ps >/dev/null 2>&1 && echo found),found) - use-db2x = db2x - prefer-db2x = db2x -else - use-db2x = notfound - prefer-db2x = $(use-xmlto) -endif -ifeq ($(shell which xmlto >/dev/null 2>&1 && echo found),found) - use-xmlto = xmlto - prefer-xmlto = xmlto -else - use-xmlto = notfound - prefer-xmlto = $(use-db2x) -endif - -# the commands, generated from the chosen template -quiet_cmd_db2ps = PS $@ - cmd_db2ps = $(subst TYPE,ps, $($(PS_METHOD)template)) -%.ps : %.xml - $(call cmd,db2ps) - -quiet_cmd_db2pdf = PDF $@ - cmd_db2pdf = $(subst TYPE,pdf, $($(PDF_METHOD)template)) -%.pdf : %.xml - $(call cmd,db2pdf) - - -index = index.html -main_idx = $(obj)/$(index) -quiet_cmd_build_main_index = HTML $(main_idx) - cmd_build_main_index = rm -rf $(main_idx); \ - echo '<h1>Linux Kernel HTML Documentation</h1>' >> $(main_idx) && \ - echo '<h2>Kernel Version: $(KERNELVERSION)</h2>' >> $(main_idx) && \ - cat $(HTML) >> $(main_idx) - -quiet_cmd_db2html = HTML $@ - cmd_db2html = xmlto html $(XMLTOFLAGS) -o $(patsubst %.html,%,$@) $< && \ - echo '<a HREF="$(patsubst %.html,%,$(notdir $@))/index.html"> \ - $(patsubst %.html,%,$(notdir $@))</a><p>' > $@ - -### -# Rules to create an aux XML and .db, and use them to re-process the DocBook XML -# to fill internal hyperlinks - gen_aux_xml = : - quiet_gen_aux_xml = echo ' XMLREF $@' -silent_gen_aux_xml = : -%.aux.xml: %.xml - @$($(quiet)gen_aux_xml) - @rm -rf $@ - @(cat $< | egrep "^<refentry id" | egrep -o "\".*\"" | cut -f 2 -d \" > $<.db) - @$(KERNELDOCXMLREF) -db $<.db $< > $@ -.PRECIOUS: %.aux.xml - -%.html: %.aux.xml - @(which xmlto > /dev/null 2>&1) || \ - (echo "*** You need to install xmlto ***"; \ - exit 1) - @rm -rf $@ $(patsubst %.html,%,$@) - $(call cmd,db2html) - @if [ ! -z "$(PNG-$(basename $(notdir $@)))" ]; then \ - cp $(PNG-$(basename $(notdir $@))) $(patsubst %.html,%,$@); fi - -quiet_cmd_db2man = MAN $@ - cmd_db2man = if grep -q refentry $<; then xmlto man $(XMLTOFLAGS) -o $(obj)/man/$(*F) $< ; fi -%.9 : %.xml - @(which xmlto > /dev/null 2>&1) || \ - (echo "*** You need to install xmlto ***"; \ - exit 1) - $(Q)mkdir -p $(obj)/man/$(*F) - $(call cmd,db2man) - @touch $@ - -### -# Rules to generate postscripts and PNG images from .fig format files -quiet_cmd_fig2eps = FIG2EPS $@ - cmd_fig2eps = fig2dev -Leps $< $@ - -%.eps: %.fig - @(which fig2dev > /dev/null 2>&1) || \ - (echo "*** You need to install transfig ***"; \ - exit 1) - $(call cmd,fig2eps) - -quiet_cmd_fig2png = FIG2PNG $@ - cmd_fig2png = fig2dev -Lpng $< $@ - -%.png: %.fig - @(which fig2dev > /dev/null 2>&1) || \ - (echo "*** You need to install transfig ***"; \ - exit 1) - $(call cmd,fig2png) - -### -# Rule to convert a .c file to inline XML documentation - gen_xml = : - quiet_gen_xml = echo ' GEN $@' -silent_gen_xml = : -%.xml: %.c - @$($(quiet)gen_xml) - @( \ - echo "<programlisting>"; \ - expand --tabs=8 < $< | \ - sed -e "s/&/\\&/g" \ - -e "s/</\\</g" \ - -e "s/>/\\>/g"; \ - echo "</programlisting>") > $@ - -endif # DOCBOOKS="" -endif # SPHINDIR=... - -### -# Help targets as used by the top-level makefile -dochelp: - @echo ' Linux kernel internal documentation in different formats (DocBook):' - @echo ' htmldocs - HTML' - @echo ' pdfdocs - PDF' - @echo ' psdocs - Postscript' - @echo ' xmldocs - XML DocBook' - @echo ' mandocs - man pages' - @echo ' installmandocs - install man pages generated by mandocs to INSTALL_MAN_PATH'; \ - echo ' (default: $(INSTALL_MAN_PATH))'; \ - echo '' - @echo ' cleandocs - clean all generated DocBook files' - @echo - @echo ' make DOCBOOKS="s1.xml s2.xml" [target] Generate only docs s1.xml s2.xml' - @echo ' valid values for DOCBOOKS are: $(DOCBOOKS)' - @echo - @echo " make DOCBOOKS=\"\" [target] Don't generate docs from Docbook" - @echo ' This is useful to generate only the ReST docs (Sphinx)' - - -### -# Temporary files left by various tools -clean-files := $(DOCBOOKS) \ - $(patsubst %.xml, %.dvi, $(DOCBOOKS)) \ - $(patsubst %.xml, %.aux, $(DOCBOOKS)) \ - $(patsubst %.xml, %.tex, $(DOCBOOKS)) \ - $(patsubst %.xml, %.log, $(DOCBOOKS)) \ - $(patsubst %.xml, %.out, $(DOCBOOKS)) \ - $(patsubst %.xml, %.ps, $(DOCBOOKS)) \ - $(patsubst %.xml, %.pdf, $(DOCBOOKS)) \ - $(patsubst %.xml, %.html, $(DOCBOOKS)) \ - $(patsubst %.xml, %.9, $(DOCBOOKS)) \ - $(patsubst %.xml, %.aux.xml, $(DOCBOOKS)) \ - $(patsubst %.xml, %.xml.db, $(DOCBOOKS)) \ - $(patsubst %.xml, %.xml, $(DOCBOOKS)) \ - $(patsubst %.xml, .%.xml.cmd, $(DOCBOOKS)) \ - $(index) - -clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man - -cleandocs: - $(Q)rm -f $(call objectify, $(clean-files)) - $(Q)rm -rf $(call objectify, $(clean-dirs)) - -# Declare the contents of the .PHONY variable as phony. We keep that -# information in a variable so we can use it in if_changed and friends. - -.PHONY: $(PHONY) diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl deleted file mode 100644 index 6006b6358c86..000000000000 --- a/Documentation/DocBook/filesystems.tmpl +++ /dev/null @@ -1,381 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="Linux-filesystems-API"> - <bookinfo> - <title>Linux Filesystems API</title> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="vfs"> - <title>The Linux VFS</title> - <sect1 id="the_filesystem_types"><title>The Filesystem types</title> -!Iinclude/linux/fs.h - </sect1> - <sect1 id="the_directory_cache"><title>The Directory Cache</title> -!Efs/dcache.c -!Iinclude/linux/dcache.h - </sect1> - <sect1 id="inode_handling"><title>Inode Handling</title> -!Efs/inode.c -!Efs/bad_inode.c - </sect1> - <sect1 id="registration_and_superblocks"><title>Registration and Superblocks</title> -!Efs/super.c - </sect1> - <sect1 id="file_locks"><title>File Locks</title> -!Efs/locks.c -!Ifs/locks.c - </sect1> - <sect1 id="other_functions"><title>Other Functions</title> -!Efs/mpage.c -!Efs/namei.c -!Efs/buffer.c -!Eblock/bio.c -!Efs/seq_file.c -!Efs/filesystems.c -!Efs/fs-writeback.c -!Efs/block_dev.c - </sect1> - </chapter> - - <chapter id="proc"> - <title>The proc filesystem</title> - - <sect1 id="sysctl_interface"><title>sysctl interface</title> -!Ekernel/sysctl.c - </sect1> - - <sect1 id="proc_filesystem_interface"><title>proc filesystem interface</title> -!Ifs/proc/base.c - </sect1> - </chapter> - - <chapter id="fs_events"> - <title>Events based on file descriptors</title> -!Efs/eventfd.c - </chapter> - - <chapter id="sysfs"> - <title>The Filesystem for Exporting Kernel Objects</title> -!Efs/sysfs/file.c -!Efs/sysfs/symlink.c - </chapter> - - <chapter id="debugfs"> - <title>The debugfs filesystem</title> - - <sect1 id="debugfs_interface"><title>debugfs interface</title> -!Efs/debugfs/inode.c -!Efs/debugfs/file.c - </sect1> - </chapter> - - <chapter id="LinuxJDBAPI"> - <chapterinfo> - <title>The Linux Journalling API</title> - - <authorgroup> - <author> - <firstname>Roger</firstname> - <surname>Gammans</surname> - <affiliation> - <address> - <email>rgammans@computer-surgery.co.uk</email> - </address> - </affiliation> - </author> - </authorgroup> - - <authorgroup> - <author> - <firstname>Stephen</firstname> - <surname>Tweedie</surname> - <affiliation> - <address> - <email>sct@redhat.com</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2002</year> - <holder>Roger Gammans</holder> - </copyright> - </chapterinfo> - - <title>The Linux Journalling API</title> - - <sect1 id="journaling_overview"> - <title>Overview</title> - <sect2 id="journaling_details"> - <title>Details</title> -<para> -The journalling layer is easy to use. You need to -first of all create a journal_t data structure. There are -two calls to do this dependent on how you decide to allocate the physical -media on which the journal resides. The jbd2_journal_init_inode() call -is for journals stored in filesystem inodes, or the jbd2_journal_init_dev() -call can be used for journal stored on a raw device (in a continuous range -of blocks). A journal_t is a typedef for a struct pointer, so when -you are finally finished make sure you call jbd2_journal_destroy() on it -to free up any used kernel memory. -</para> - -<para> -Once you have got your journal_t object you need to 'mount' or load the journal -file. The journalling layer expects the space for the journal was already -allocated and initialized properly by the userspace tools. When loading the -journal you must call jbd2_journal_load() to process journal contents. If the -client file system detects the journal contents does not need to be processed -(or even need not have valid contents), it may call jbd2_journal_wipe() to -clear the journal contents before calling jbd2_journal_load(). -</para> - -<para> -Note that jbd2_journal_wipe(..,0) calls jbd2_journal_skip_recovery() for you if -it detects any outstanding transactions in the journal and similarly -jbd2_journal_load() will call jbd2_journal_recover() if necessary. I would -advise reading ext4_load_journal() in fs/ext4/super.c for examples on this -stage. -</para> - -<para> -Now you can go ahead and start modifying the underlying -filesystem. Almost. -</para> - -<para> - -You still need to actually journal your filesystem changes, this -is done by wrapping them into transactions. Additionally you -also need to wrap the modification of each of the buffers -with calls to the journal layer, so it knows what the modifications -you are actually making are. To do this use jbd2_journal_start() which -returns a transaction handle. -</para> - -<para> -jbd2_journal_start() -and its counterpart jbd2_journal_stop(), which indicates the end of a -transaction are nestable calls, so you can reenter a transaction if necessary, -but remember you must call jbd2_journal_stop() the same number of times as -jbd2_journal_start() before the transaction is completed (or more accurately -leaves the update phase). Ext4/VFS makes use of this feature to simplify -handling of inode dirtying, quota support, etc. -</para> - -<para> -Inside each transaction you need to wrap the modifications to the -individual buffers (blocks). Before you start to modify a buffer you -need to call jbd2_journal_get_{create,write,undo}_access() as appropriate, -this allows the journalling layer to copy the unmodified data if it -needs to. After all the buffer may be part of a previously uncommitted -transaction. -At this point you are at last ready to modify a buffer, and once -you are have done so you need to call jbd2_journal_dirty_{meta,}data(). -Or if you've asked for access to a buffer you now know is now longer -required to be pushed back on the device you can call jbd2_journal_forget() -in much the same way as you might have used bforget() in the past. -</para> - -<para> -A jbd2_journal_flush() may be called at any time to commit and checkpoint -all your transactions. -</para> - -<para> -Then at umount time , in your put_super() you can then call jbd2_journal_destroy() -to clean up your in-core journal object. -</para> - -<para> -Unfortunately there a couple of ways the journal layer can cause a deadlock. -The first thing to note is that each task can only have -a single outstanding transaction at any one time, remember nothing -commits until the outermost jbd2_journal_stop(). This means -you must complete the transaction at the end of each file/inode/address -etc. operation you perform, so that the journalling system isn't re-entered -on another journal. Since transactions can't be nested/batched -across differing journals, and another filesystem other than -yours (say ext4) may be modified in a later syscall. -</para> - -<para> -The second case to bear in mind is that jbd2_journal_start() can -block if there isn't enough space in the journal for your transaction -(based on the passed nblocks param) - when it blocks it merely(!) needs to -wait for transactions to complete and be committed from other tasks, -so essentially we are waiting for jbd2_journal_stop(). So to avoid -deadlocks you must treat jbd2_journal_start/stop() as if they -were semaphores and include them in your semaphore ordering rules to prevent -deadlocks. Note that jbd2_journal_extend() has similar blocking behaviour to -jbd2_journal_start() so you can deadlock here just as easily as on -jbd2_journal_start(). -</para> - -<para> -Try to reserve the right number of blocks the first time. ;-). This will -be the maximum number of blocks you are going to touch in this transaction. -I advise having a look at at least ext4_jbd.h to see the basis on which -ext4 uses to make these decisions. -</para> - -<para> -Another wriggle to watch out for is your on-disk block allocation strategy. -Why? Because, if you do a delete, you need to ensure you haven't reused any -of the freed blocks until the transaction freeing these blocks commits. If you -reused these blocks and crash happens, there is no way to restore the contents -of the reallocated blocks at the end of the last fully committed transaction. - -One simple way of doing this is to mark blocks as free in internal in-memory -block allocation structures only after the transaction freeing them commits. -Ext4 uses journal commit callback for this purpose. -</para> - -<para> -With journal commit callbacks you can ask the journalling layer to call a -callback function when the transaction is finally committed to disk, so that -you can do some of your own management. You ask the journalling layer for -calling the callback by simply setting journal->j_commit_callback function -pointer and that function is called after each transaction commit. You can also -use transaction->t_private_list for attaching entries to a transaction that -need processing when the transaction commits. -</para> - -<para> -JBD2 also provides a way to block all transaction updates via -jbd2_journal_{un,}lock_updates(). Ext4 uses this when it wants a window with a -clean and stable fs for a moment. E.g. -</para> - -<programlisting> - - jbd2_journal_lock_updates() //stop new stuff happening.. - jbd2_journal_flush() // checkpoint everything. - ..do stuff on stable fs - jbd2_journal_unlock_updates() // carry on with filesystem use. -</programlisting> - -<para> -The opportunities for abuse and DOS attacks with this should be obvious, -if you allow unprivileged userspace to trigger codepaths containing these -calls. -</para> - - </sect2> - - <sect2 id="jbd_summary"> - <title>Summary</title> -<para> -Using the journal is a matter of wrapping the different context changes, -being each mount, each modification (transaction) and each changed buffer -to tell the journalling layer about them. -</para> - - </sect2> - - </sect1> - - <sect1 id="data_types"> - <title>Data Types</title> - <para> - The journalling layer uses typedefs to 'hide' the concrete definitions - of the structures used. As a client of the JBD2 layer you can - just rely on the using the pointer as a magic cookie of some sort. - - Obviously the hiding is not enforced as this is 'C'. - </para> - <sect2 id="structures"><title>Structures</title> -!Iinclude/linux/jbd2.h - </sect2> - </sect1> - - <sect1 id="functions"> - <title>Functions</title> - <para> - The functions here are split into two groups those that - affect a journal as a whole, and those which are used to - manage transactions - </para> - <sect2 id="journal_level"><title>Journal Level</title> -!Efs/jbd2/journal.c -!Ifs/jbd2/recovery.c - </sect2> - <sect2 id="transaction_level"><title>Transasction Level</title> -!Efs/jbd2/transaction.c - </sect2> - </sect1> - <sect1 id="see_also"> - <title>See also</title> - <para> - <citation> - <ulink url="http://kernel.org/pub/linux/kernel/people/sct/ext3/journal-design.ps.gz"> - Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen Tweedie - </ulink> - </citation> - </para> - <para> - <citation> - <ulink url="http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html"> - Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen Tweedie - </ulink> - </citation> - </para> - </sect1> - - </chapter> - - <chapter id="splice"> - <title>splice API</title> - <para> - splice is a method for moving blocks of data around inside the - kernel, without continually transferring them between the kernel - and user space. - </para> -!Ffs/splice.c - </chapter> - - <chapter id="pipes"> - <title>pipes API</title> - <para> - Pipe interfaces are all for in-kernel (builtin image) use. - They are not exported for use by modules. - </para> -!Iinclude/linux/pipe_fs_i.h -!Ffs/pipe.c - </chapter> - -</book> diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl deleted file mode 100644 index da5c087462b1..000000000000 --- a/Documentation/DocBook/kernel-hacking.tmpl +++ /dev/null @@ -1,1312 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="lk-hacking-guide"> - <bookinfo> - <title>Unreliable Guide To Hacking The Linux Kernel</title> - - <authorgroup> - <author> - <firstname>Rusty</firstname> - <surname>Russell</surname> - <affiliation> - <address> - <email>rusty@rustcorp.com.au</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2005</year> - <holder>Rusty Russell</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - - <releaseinfo> - This is the first release of this document as part of the kernel tarball. - </releaseinfo> - - </bookinfo> - - <toc></toc> - - <chapter id="introduction"> - <title>Introduction</title> - <para> - Welcome, gentle reader, to Rusty's Remarkably Unreliable Guide to Linux - Kernel Hacking. This document describes the common routines and - general requirements for kernel code: its goal is to serve as a - primer for Linux kernel development for experienced C - programmers. I avoid implementation details: that's what the - code is for, and I ignore whole tracts of useful routines. - </para> - <para> - Before you read this, please understand that I never wanted to - write this document, being grossly under-qualified, but I always - wanted to read it, and this was the only way. I hope it will - grow into a compendium of best practice, common starting points - and random information. - </para> - </chapter> - - <chapter id="basic-players"> - <title>The Players</title> - - <para> - At any time each of the CPUs in a system can be: - </para> - - <itemizedlist> - <listitem> - <para> - not associated with any process, serving a hardware interrupt; - </para> - </listitem> - - <listitem> - <para> - not associated with any process, serving a softirq or tasklet; - </para> - </listitem> - - <listitem> - <para> - running in kernel space, associated with a process (user context); - </para> - </listitem> - - <listitem> - <para> - running a process in user space. - </para> - </listitem> - </itemizedlist> - - <para> - There is an ordering between these. The bottom two can preempt - each other, but above that is a strict hierarchy: each can only be - preempted by the ones above it. For example, while a softirq is - running on a CPU, no other softirq will preempt it, but a hardware - interrupt can. However, any other CPUs in the system execute - independently. - </para> - - <para> - We'll see a number of ways that the user context can block - interrupts, to become truly non-preemptable. - </para> - - <sect1 id="basics-usercontext"> - <title>User Context</title> - - <para> - User context is when you are coming in from a system call or other - trap: like userspace, you can be preempted by more important tasks - and by interrupts. You can sleep, by calling - <function>schedule()</function>. - </para> - - <note> - <para> - You are always in user context on module load and unload, - and on operations on the block device layer. - </para> - </note> - - <para> - In user context, the <varname>current</varname> pointer (indicating - the task we are currently executing) is valid, and - <function>in_interrupt()</function> - (<filename>include/linux/interrupt.h</filename>) is <returnvalue>false - </returnvalue>. - </para> - - <caution> - <para> - Beware that if you have preemption or softirqs disabled - (see below), <function>in_interrupt()</function> will return a - false positive. - </para> - </caution> - </sect1> - - <sect1 id="basics-hardirqs"> - <title>Hardware Interrupts (Hard IRQs)</title> - - <para> - Timer ticks, <hardware>network cards</hardware> and - <hardware>keyboard</hardware> are examples of real - hardware which produce interrupts at any time. The kernel runs - interrupt handlers, which services the hardware. The kernel - guarantees that this handler is never re-entered: if the same - interrupt arrives, it is queued (or dropped). Because it - disables interrupts, this handler has to be fast: frequently it - simply acknowledges the interrupt, marks a 'software interrupt' - for execution and exits. - </para> - - <para> - You can tell you are in a hardware interrupt, because - <function>in_irq()</function> returns <returnvalue>true</returnvalue>. - </para> - <caution> - <para> - Beware that this will return a false positive if interrupts are disabled - (see below). - </para> - </caution> - </sect1> - - <sect1 id="basics-softirqs"> - <title>Software Interrupt Context: Softirqs and Tasklets</title> - - <para> - Whenever a system call is about to return to userspace, or a - hardware interrupt handler exits, any 'software interrupts' - which are marked pending (usually by hardware interrupts) are - run (<filename>kernel/softirq.c</filename>). - </para> - - <para> - Much of the real interrupt handling work is done here. Early in - the transition to <acronym>SMP</acronym>, there were only 'bottom - halves' (BHs), which didn't take advantage of multiple CPUs. Shortly - after we switched from wind-up computers made of match-sticks and snot, - we abandoned this limitation and switched to 'softirqs'. - </para> - - <para> - <filename class="headerfile">include/linux/interrupt.h</filename> lists the - different softirqs. A very important softirq is the - timer softirq (<filename - class="headerfile">include/linux/timer.h</filename>): you can - register to have it call functions for you in a given length of - time. - </para> - - <para> - Softirqs are often a pain to deal with, since the same softirq - will run simultaneously on more than one CPU. For this reason, - tasklets (<filename - class="headerfile">include/linux/interrupt.h</filename>) are more - often used: they are dynamically-registrable (meaning you can have - as many as you want), and they also guarantee that any tasklet - will only run on one CPU at any time, although different tasklets - can run simultaneously. - </para> - <caution> - <para> - The name 'tasklet' is misleading: they have nothing to do with 'tasks', - and probably more to do with some bad vodka Alexey Kuznetsov had at the - time. - </para> - </caution> - - <para> - You can tell you are in a softirq (or tasklet) - using the <function>in_softirq()</function> macro - (<filename class="headerfile">include/linux/interrupt.h</filename>). - </para> - <caution> - <para> - Beware that this will return a false positive if a bh lock (see below) - is held. - </para> - </caution> - </sect1> - </chapter> - - <chapter id="basic-rules"> - <title>Some Basic Rules</title> - - <variablelist> - <varlistentry> - <term>No memory protection</term> - <listitem> - <para> - If you corrupt memory, whether in user context or - interrupt context, the whole machine will crash. Are you - sure you can't do what you want in userspace? - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term>No floating point or <acronym>MMX</acronym></term> - <listitem> - <para> - The <acronym>FPU</acronym> context is not saved; even in user - context the <acronym>FPU</acronym> state probably won't - correspond with the current process: you would mess with some - user process' <acronym>FPU</acronym> state. If you really want - to do this, you would have to explicitly save/restore the full - <acronym>FPU</acronym> state (and avoid context switches). It - is generally a bad idea; use fixed point arithmetic first. - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term>A rigid stack limit</term> - <listitem> - <para> - Depending on configuration options the kernel stack is about 3K to 6K for most 32-bit architectures: it's - about 14K on most 64-bit archs, and often shared with interrupts - so you can't use it all. Avoid deep recursion and huge local - arrays on the stack (allocate them dynamically instead). - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term>The Linux kernel is portable</term> - <listitem> - <para> - Let's keep it that way. Your code should be 64-bit clean, - and endian-independent. You should also minimize CPU - specific stuff, e.g. inline assembly should be cleanly - encapsulated and minimized to ease porting. Generally it - should be restricted to the architecture-dependent part of - the kernel tree. - </para> - </listitem> - </varlistentry> - </variablelist> - </chapter> - - <chapter id="ioctls"> - <title>ioctls: Not writing a new system call</title> - - <para> - A system call generally looks like this - </para> - - <programlisting> -asmlinkage long sys_mycall(int arg) -{ - return 0; -} - </programlisting> - - <para> - First, in most cases you don't want to create a new system call. - You create a character device and implement an appropriate ioctl - for it. This is much more flexible than system calls, doesn't have - to be entered in every architecture's - <filename class="headerfile">include/asm/unistd.h</filename> and - <filename>arch/kernel/entry.S</filename> file, and is much more - likely to be accepted by Linus. - </para> - - <para> - If all your routine does is read or write some parameter, consider - implementing a <function>sysfs</function> interface instead. - </para> - - <para> - Inside the ioctl you're in user context to a process. When a - error occurs you return a negated errno (see - <filename class="headerfile">include/linux/errno.h</filename>), - otherwise you return <returnvalue>0</returnvalue>. - </para> - - <para> - After you slept you should check if a signal occurred: the - Unix/Linux way of handling signals is to temporarily exit the - system call with the <constant>-ERESTARTSYS</constant> error. The - system call entry code will switch back to user context, process - the signal handler and then your system call will be restarted - (unless the user disabled that). So you should be prepared to - process the restart, e.g. if you're in the middle of manipulating - some data structure. - </para> - - <programlisting> -if (signal_pending(current)) - return -ERESTARTSYS; - </programlisting> - - <para> - If you're doing longer computations: first think userspace. If you - <emphasis>really</emphasis> want to do it in kernel you should - regularly check if you need to give up the CPU (remember there is - cooperative multitasking per CPU). Idiom: - </para> - - <programlisting> -cond_resched(); /* Will sleep */ - </programlisting> - - <para> - A short note on interface design: the UNIX system call motto is - "Provide mechanism not policy". - </para> - </chapter> - - <chapter id="deadlock-recipes"> - <title>Recipes for Deadlock</title> - - <para> - You cannot call any routines which may sleep, unless: - </para> - <itemizedlist> - <listitem> - <para> - You are in user context. - </para> - </listitem> - - <listitem> - <para> - You do not own any spinlocks. - </para> - </listitem> - - <listitem> - <para> - You have interrupts enabled (actually, Andi Kleen says - that the scheduling code will enable them for you, but - that's probably not what you wanted). - </para> - </listitem> - </itemizedlist> - - <para> - Note that some functions may sleep implicitly: common ones are - the user space access functions (*_user) and memory allocation - functions without <symbol>GFP_ATOMIC</symbol>. - </para> - - <para> - You should always compile your kernel - <symbol>CONFIG_DEBUG_ATOMIC_SLEEP</symbol> on, and it will warn - you if you break these rules. If you <emphasis>do</emphasis> break - the rules, you will eventually lock up your box. - </para> - - <para> - Really. - </para> - </chapter> - - <chapter id="common-routines"> - <title>Common Routines</title> - - <sect1 id="routines-printk"> - <title> - <function>printk()</function> - <filename class="headerfile">include/linux/kernel.h</filename> - </title> - - <para> - <function>printk()</function> feeds kernel messages to the - console, dmesg, and the syslog daemon. It is useful for debugging - and reporting errors, and can be used inside interrupt context, - but use with caution: a machine which has its console flooded with - printk messages is unusable. It uses a format string mostly - compatible with ANSI C printf, and C string concatenation to give - it a first "priority" argument: - </para> - - <programlisting> -printk(KERN_INFO "i = %u\n", i); - </programlisting> - - <para> - See <filename class="headerfile">include/linux/kernel.h</filename>; - for other KERN_ values; these are interpreted by syslog as the - level. Special case: for printing an IP address use - </para> - - <programlisting> -__be32 ipaddress; -printk(KERN_INFO "my ip: %pI4\n", &ipaddress); - </programlisting> - - <para> - <function>printk()</function> internally uses a 1K buffer and does - not catch overruns. Make sure that will be enough. - </para> - - <note> - <para> - You will know when you are a real kernel hacker - when you start typoing printf as printk in your user programs :) - </para> - </note> - - <!--- From the Lions book reader department --> - - <note> - <para> - Another sidenote: the original Unix Version 6 sources had a - comment on top of its printf function: "Printf should not be - used for chit-chat". You should follow that advice. - </para> - </note> - </sect1> - - <sect1 id="routines-copy"> - <title> - <function>copy_[to/from]_user()</function> - / - <function>get_user()</function> - / - <function>put_user()</function> - <filename class="headerfile">include/linux/uaccess.h</filename> - </title> - - <para> - <emphasis>[SLEEPS]</emphasis> - </para> - - <para> - <function>put_user()</function> and <function>get_user()</function> - are used to get and put single values (such as an int, char, or - long) from and to userspace. A pointer into userspace should - never be simply dereferenced: data should be copied using these - routines. Both return <constant>-EFAULT</constant> or 0. - </para> - <para> - <function>copy_to_user()</function> and - <function>copy_from_user()</function> are more general: they copy - an arbitrary amount of data to and from userspace. - <caution> - <para> - Unlike <function>put_user()</function> and - <function>get_user()</function>, they return the amount of - uncopied data (ie. <returnvalue>0</returnvalue> still means - success). - </para> - </caution> - [Yes, this moronic interface makes me cringe. The flamewar comes up every year or so. --RR.] - </para> - <para> - The functions may sleep implicitly. This should never be called - outside user context (it makes no sense), with interrupts - disabled, or a spinlock held. - </para> - </sect1> - - <sect1 id="routines-kmalloc"> - <title><function>kmalloc()</function>/<function>kfree()</function> - <filename class="headerfile">include/linux/slab.h</filename></title> - - <para> - <emphasis>[MAY SLEEP: SEE BELOW]</emphasis> - </para> - - <para> - These routines are used to dynamically request pointer-aligned - chunks of memory, like malloc and free do in userspace, but - <function>kmalloc()</function> takes an extra flag word. - Important values: - </para> - - <variablelist> - <varlistentry> - <term> - <constant> - GFP_KERNEL - </constant> - </term> - <listitem> - <para> - May sleep and swap to free memory. Only allowed in user - context, but is the most reliable way to allocate memory. - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term> - <constant> - GFP_ATOMIC - </constant> - </term> - <listitem> - <para> - Don't sleep. Less reliable than <constant>GFP_KERNEL</constant>, - but may be called from interrupt context. You should - <emphasis>really</emphasis> have a good out-of-memory - error-handling strategy. - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term> - <constant> - GFP_DMA - </constant> - </term> - <listitem> - <para> - Allocate ISA DMA lower than 16MB. If you don't know what that - is you don't need it. Very unreliable. - </para> - </listitem> - </varlistentry> - </variablelist> - - <para> - If you see a <errorname>sleeping function called from invalid - context</errorname> warning message, then maybe you called a - sleeping allocation function from interrupt context without - <constant>GFP_ATOMIC</constant>. You should really fix that. - Run, don't walk. - </para> - - <para> - If you are allocating at least <constant>PAGE_SIZE</constant> - (<filename class="headerfile">include/asm/page.h</filename>) bytes, - consider using <function>__get_free_pages()</function> - - (<filename class="headerfile">include/linux/mm.h</filename>). It - takes an order argument (0 for page sized, 1 for double page, 2 - for four pages etc.) and the same memory priority flag word as - above. - </para> - - <para> - If you are allocating more than a page worth of bytes you can use - <function>vmalloc()</function>. It'll allocate virtual memory in - the kernel map. This block is not contiguous in physical memory, - but the <acronym>MMU</acronym> makes it look like it is for you - (so it'll only look contiguous to the CPUs, not to external device - drivers). If you really need large physically contiguous memory - for some weird device, you have a problem: it is poorly supported - in Linux because after some time memory fragmentation in a running - kernel makes it hard. The best way is to allocate the block early - in the boot process via the <function>alloc_bootmem()</function> - routine. - </para> - - <para> - Before inventing your own cache of often-used objects consider - using a slab cache in - <filename class="headerfile">include/linux/slab.h</filename> - </para> - </sect1> - - <sect1 id="routines-current"> - <title><function>current</function> - <filename class="headerfile">include/asm/current.h</filename></title> - - <para> - This global variable (really a macro) contains a pointer to - the current task structure, so is only valid in user context. - For example, when a process makes a system call, this will - point to the task structure of the calling process. It is - <emphasis>not NULL</emphasis> in interrupt context. - </para> - </sect1> - - <sect1 id="routines-udelay"> - <title><function>mdelay()</function>/<function>udelay()</function> - <filename class="headerfile">include/asm/delay.h</filename> - <filename class="headerfile">include/linux/delay.h</filename> - </title> - - <para> - The <function>udelay()</function> and <function>ndelay()</function> functions can be used for small pauses. - Do not use large values with them as you risk - overflow - the helper function <function>mdelay()</function> is useful - here, or consider <function>msleep()</function>. - </para> - </sect1> - - <sect1 id="routines-endian"> - <title><function>cpu_to_be32()</function>/<function>be32_to_cpu()</function>/<function>cpu_to_le32()</function>/<function>le32_to_cpu()</function> - <filename class="headerfile">include/asm/byteorder.h</filename> - </title> - - <para> - The <function>cpu_to_be32()</function> family (where the "32" can - be replaced by 64 or 16, and the "be" can be replaced by "le") are - the general way to do endian conversions in the kernel: they - return the converted value. All variations supply the reverse as - well: <function>be32_to_cpu()</function>, etc. - </para> - - <para> - There are two major variations of these functions: the pointer - variation, such as <function>cpu_to_be32p()</function>, which take - a pointer to the given type, and return the converted value. The - other variation is the "in-situ" family, such as - <function>cpu_to_be32s()</function>, which convert value referred - to by the pointer, and return void. - </para> - </sect1> - - <sect1 id="routines-local-irqs"> - <title><function>local_irq_save()</function>/<function>local_irq_restore()</function> - <filename class="headerfile">include/linux/irqflags.h</filename> - </title> - - <para> - These routines disable hard interrupts on the local CPU, and - restore them. They are reentrant; saving the previous state in - their one <varname>unsigned long flags</varname> argument. If you - know that interrupts are enabled, you can simply use - <function>local_irq_disable()</function> and - <function>local_irq_enable()</function>. - </para> - </sect1> - - <sect1 id="routines-softirqs"> - <title><function>local_bh_disable()</function>/<function>local_bh_enable()</function> - <filename class="headerfile">include/linux/interrupt.h</filename></title> - - <para> - These routines disable soft interrupts on the local CPU, and - restore them. They are reentrant; if soft interrupts were - disabled before, they will still be disabled after this pair - of functions has been called. They prevent softirqs and tasklets - from running on the current CPU. - </para> - </sect1> - - <sect1 id="routines-processorids"> - <title><function>smp_processor_id</function>() - <filename class="headerfile">include/asm/smp.h</filename></title> - - <para> - <function>get_cpu()</function> disables preemption (so you won't - suddenly get moved to another CPU) and returns the current - processor number, between 0 and <symbol>NR_CPUS</symbol>. Note - that the CPU numbers are not necessarily continuous. You return - it again with <function>put_cpu()</function> when you are done. - </para> - <para> - If you know you cannot be preempted by another task (ie. you are - in interrupt context, or have preemption disabled) you can use - smp_processor_id(). - </para> - </sect1> - - <sect1 id="routines-init"> - <title><type>__init</type>/<type>__exit</type>/<type>__initdata</type> - <filename class="headerfile">include/linux/init.h</filename></title> - - <para> - After boot, the kernel frees up a special section; functions - marked with <type>__init</type> and data structures marked with - <type>__initdata</type> are dropped after boot is complete: similarly - modules discard this memory after initialization. <type>__exit</type> - is used to declare a function which is only required on exit: the - function will be dropped if this file is not compiled as a module. - See the header file for use. Note that it makes no sense for a function - marked with <type>__init</type> to be exported to modules with - <function>EXPORT_SYMBOL()</function> - this will break. - </para> - - </sect1> - - <sect1 id="routines-init-again"> - <title><function>__initcall()</function>/<function>module_init()</function> - <filename class="headerfile">include/linux/init.h</filename></title> - <para> - Many parts of the kernel are well served as a module - (dynamically-loadable parts of the kernel). Using the - <function>module_init()</function> and - <function>module_exit()</function> macros it is easy to write code - without #ifdefs which can operate both as a module or built into - the kernel. - </para> - - <para> - The <function>module_init()</function> macro defines which - function is to be called at module insertion time (if the file is - compiled as a module), or at boot time: if the file is not - compiled as a module the <function>module_init()</function> macro - becomes equivalent to <function>__initcall()</function>, which - through linker magic ensures that the function is called on boot. - </para> - - <para> - The function can return a negative error number to cause - module loading to fail (unfortunately, this has no effect if - the module is compiled into the kernel). This function is - called in user context with interrupts enabled, so it can sleep. - </para> - </sect1> - - <sect1 id="routines-moduleexit"> - <title> <function>module_exit()</function> - <filename class="headerfile">include/linux/init.h</filename> </title> - - <para> - This macro defines the function to be called at module removal - time (or never, in the case of the file compiled into the - kernel). It will only be called if the module usage count has - reached zero. This function can also sleep, but cannot fail: - everything must be cleaned up by the time it returns. - </para> - - <para> - Note that this macro is optional: if it is not present, your - module will not be removable (except for 'rmmod -f'). - </para> - </sect1> - - <sect1 id="routines-module-use-counters"> - <title> <function>try_module_get()</function>/<function>module_put()</function> - <filename class="headerfile">include/linux/module.h</filename></title> - - <para> - These manipulate the module usage count, to protect against - removal (a module also can't be removed if another module uses one - of its exported symbols: see below). Before calling into module - code, you should call <function>try_module_get()</function> on - that module: if it fails, then the module is being removed and you - should act as if it wasn't there. Otherwise, you can safely enter - the module, and call <function>module_put()</function> when you're - finished. - </para> - - <para> - Most registerable structures have an - <structfield>owner</structfield> field, such as in the - <structname>file_operations</structname> structure. Set this field - to the macro <symbol>THIS_MODULE</symbol>. - </para> - </sect1> - - <!-- add info on new-style module refcounting here --> - </chapter> - - <chapter id="queues"> - <title>Wait Queues - <filename class="headerfile">include/linux/wait.h</filename> - </title> - <para> - <emphasis>[SLEEPS]</emphasis> - </para> - - <para> - A wait queue is used to wait for someone to wake you up when a - certain condition is true. They must be used carefully to ensure - there is no race condition. You declare a - <type>wait_queue_head_t</type>, and then processes which want to - wait for that condition declare a <type>wait_queue_t</type> - referring to themselves, and place that in the queue. - </para> - - <sect1 id="queue-declaring"> - <title>Declaring</title> - - <para> - You declare a <type>wait_queue_head_t</type> using the - <function>DECLARE_WAIT_QUEUE_HEAD()</function> macro, or using the - <function>init_waitqueue_head()</function> routine in your - initialization code. - </para> - </sect1> - - <sect1 id="queue-waitqueue"> - <title>Queuing</title> - - <para> - Placing yourself in the waitqueue is fairly complex, because you - must put yourself in the queue before checking the condition. - There is a macro to do this: - <function>wait_event_interruptible()</function> - - <filename class="headerfile">include/linux/wait.h</filename> The - first argument is the wait queue head, and the second is an - expression which is evaluated; the macro returns - <returnvalue>0</returnvalue> when this expression is true, or - <returnvalue>-ERESTARTSYS</returnvalue> if a signal is received. - The <function>wait_event()</function> version ignores signals. - </para> - - </sect1> - - <sect1 id="queue-waking"> - <title>Waking Up Queued Tasks</title> - - <para> - Call <function>wake_up()</function> - - <filename class="headerfile">include/linux/wait.h</filename>;, - which will wake up every process in the queue. The exception is - if one has <constant>TASK_EXCLUSIVE</constant> set, in which case - the remainder of the queue will not be woken. There are other variants - of this basic function available in the same header. - </para> - </sect1> - </chapter> - - <chapter id="atomic-ops"> - <title>Atomic Operations</title> - - <para> - Certain operations are guaranteed atomic on all platforms. The - first class of operations work on <type>atomic_t</type> - - <filename class="headerfile">include/asm/atomic.h</filename>; this - contains a signed integer (at least 32 bits long), and you must use - these functions to manipulate or read atomic_t variables. - <function>atomic_read()</function> and - <function>atomic_set()</function> get and set the counter, - <function>atomic_add()</function>, - <function>atomic_sub()</function>, - <function>atomic_inc()</function>, - <function>atomic_dec()</function>, and - <function>atomic_dec_and_test()</function> (returns - <returnvalue>true</returnvalue> if it was decremented to zero). - </para> - - <para> - Yes. It returns <returnvalue>true</returnvalue> (i.e. != 0) if the - atomic variable is zero. - </para> - - <para> - Note that these functions are slower than normal arithmetic, and - so should not be used unnecessarily. - </para> - - <para> - The second class of atomic operations is atomic bit operations on an - <type>unsigned long</type>, defined in - - <filename class="headerfile">include/linux/bitops.h</filename>. These - operations generally take a pointer to the bit pattern, and a bit - number: 0 is the least significant bit. - <function>set_bit()</function>, <function>clear_bit()</function> - and <function>change_bit()</function> set, clear, and flip the - given bit. <function>test_and_set_bit()</function>, - <function>test_and_clear_bit()</function> and - <function>test_and_change_bit()</function> do the same thing, - except return true if the bit was previously set; these are - particularly useful for atomically setting flags. - </para> - - <para> - It is possible to call these operations with bit indices greater - than BITS_PER_LONG. The resulting behavior is strange on big-endian - platforms though so it is a good idea not to do this. - </para> - </chapter> - - <chapter id="symbols"> - <title>Symbols</title> - - <para> - Within the kernel proper, the normal linking rules apply - (ie. unless a symbol is declared to be file scope with the - <type>static</type> keyword, it can be used anywhere in the - kernel). However, for modules, a special exported symbol table is - kept which limits the entry points to the kernel proper. Modules - can also export symbols. - </para> - - <sect1 id="sym-exportsymbols"> - <title><function>EXPORT_SYMBOL()</function> - <filename class="headerfile">include/linux/export.h</filename></title> - - <para> - This is the classic method of exporting a symbol: dynamically - loaded modules will be able to use the symbol as normal. - </para> - </sect1> - - <sect1 id="sym-exportsymbols-gpl"> - <title><function>EXPORT_SYMBOL_GPL()</function> - <filename class="headerfile">include/linux/export.h</filename></title> - - <para> - Similar to <function>EXPORT_SYMBOL()</function> except that the - symbols exported by <function>EXPORT_SYMBOL_GPL()</function> can - only be seen by modules with a - <function>MODULE_LICENSE()</function> that specifies a GPL - compatible license. It implies that the function is considered - an internal implementation issue, and not really an interface. - Some maintainers and developers may however - require EXPORT_SYMBOL_GPL() when adding any new APIs or functionality. - </para> - </sect1> - </chapter> - - <chapter id="conventions"> - <title>Routines and Conventions</title> - - <sect1 id="conventions-doublelinkedlist"> - <title>Double-linked lists - <filename class="headerfile">include/linux/list.h</filename></title> - - <para> - There used to be three sets of linked-list routines in the kernel - headers, but this one is the winner. If you don't have some - particular pressing need for a single list, it's a good choice. - </para> - - <para> - In particular, <function>list_for_each_entry</function> is useful. - </para> - </sect1> - - <sect1 id="convention-returns"> - <title>Return Conventions</title> - - <para> - For code called in user context, it's very common to defy C - convention, and return <returnvalue>0</returnvalue> for success, - and a negative error number - (eg. <returnvalue>-EFAULT</returnvalue>) for failure. This can be - unintuitive at first, but it's fairly widespread in the kernel. - </para> - - <para> - Using <function>ERR_PTR()</function> - - <filename class="headerfile">include/linux/err.h</filename>; to - encode a negative error number into a pointer, and - <function>IS_ERR()</function> and <function>PTR_ERR()</function> - to get it back out again: avoids a separate pointer parameter for - the error number. Icky, but in a good way. - </para> - </sect1> - - <sect1 id="conventions-borkedcompile"> - <title>Breaking Compilation</title> - - <para> - Linus and the other developers sometimes change function or - structure names in development kernels; this is not done just to - keep everyone on their toes: it reflects a fundamental change - (eg. can no longer be called with interrupts on, or does extra - checks, or doesn't do checks which were caught before). Usually - this is accompanied by a fairly complete note to the linux-kernel - mailing list; search the archive. Simply doing a global replace - on the file usually makes things <emphasis>worse</emphasis>. - </para> - </sect1> - - <sect1 id="conventions-initialising"> - <title>Initializing structure members</title> - - <para> - The preferred method of initializing structures is to use - designated initialisers, as defined by ISO C99, eg: - </para> - <programlisting> -static struct block_device_operations opt_fops = { - .open = opt_open, - .release = opt_release, - .ioctl = opt_ioctl, - .check_media_change = opt_media_change, -}; - </programlisting> - <para> - This makes it easy to grep for, and makes it clear which - structure fields are set. You should do this because it looks - cool. - </para> - </sect1> - - <sect1 id="conventions-gnu-extns"> - <title>GNU Extensions</title> - - <para> - GNU Extensions are explicitly allowed in the Linux kernel. - Note that some of the more complex ones are not very well - supported, due to lack of general use, but the following are - considered standard (see the GCC info page section "C - Extensions" for more details - Yes, really the info page, the - man page is only a short summary of the stuff in info). - </para> - <itemizedlist> - <listitem> - <para> - Inline functions - </para> - </listitem> - <listitem> - <para> - Statement expressions (ie. the ({ and }) constructs). - </para> - </listitem> - <listitem> - <para> - Declaring attributes of a function / variable / type - (__attribute__) - </para> - </listitem> - <listitem> - <para> - typeof - </para> - </listitem> - <listitem> - <para> - Zero length arrays - </para> - </listitem> - <listitem> - <para> - Macro varargs - </para> - </listitem> - <listitem> - <para> - Arithmetic on void pointers - </para> - </listitem> - <listitem> - <para> - Non-Constant initializers - </para> - </listitem> - <listitem> - <para> - Assembler Instructions (not outside arch/ and include/asm/) - </para> - </listitem> - <listitem> - <para> - Function names as strings (__func__). - </para> - </listitem> - <listitem> - <para> - __builtin_constant_p() - </para> - </listitem> - </itemizedlist> - - <para> - Be wary when using long long in the kernel, the code gcc generates for - it is horrible and worse: division and multiplication does not work - on i386 because the GCC runtime functions for it are missing from - the kernel environment. - </para> - - <!-- FIXME: add a note about ANSI aliasing cleanness --> - </sect1> - - <sect1 id="conventions-cplusplus"> - <title>C++</title> - - <para> - Using C++ in the kernel is usually a bad idea, because the - kernel does not provide the necessary runtime environment - and the include files are not tested for it. It is still - possible, but not recommended. If you really want to do - this, forget about exceptions at least. - </para> - </sect1> - - <sect1 id="conventions-ifdef"> - <title>#if</title> - - <para> - It is generally considered cleaner to use macros in header files - (or at the top of .c files) to abstract away functions rather than - using `#if' pre-processor statements throughout the source code. - </para> - </sect1> - </chapter> - - <chapter id="submitting"> - <title>Putting Your Stuff in the Kernel</title> - - <para> - In order to get your stuff into shape for official inclusion, or - even to make a neat patch, there's administrative work to be - done: - </para> - <itemizedlist> - <listitem> - <para> - Figure out whose pond you've been pissing in. Look at the top of - the source files, inside the <filename>MAINTAINERS</filename> - file, and last of all in the <filename>CREDITS</filename> file. - You should coordinate with this person to make sure you're not - duplicating effort, or trying something that's already been - rejected. - </para> - - <para> - Make sure you put your name and EMail address at the top of - any files you create or mangle significantly. This is the - first place people will look when they find a bug, or when - <emphasis>they</emphasis> want to make a change. - </para> - </listitem> - - <listitem> - <para> - Usually you want a configuration option for your kernel hack. - Edit <filename>Kconfig</filename> in the appropriate directory. - The Config language is simple to use by cut and paste, and there's - complete documentation in - <filename>Documentation/kbuild/kconfig-language.txt</filename>. - </para> - - <para> - In your description of the option, make sure you address both the - expert user and the user who knows nothing about your feature. Mention - incompatibilities and issues here. <emphasis> Definitely - </emphasis> end your description with <quote> if in doubt, say N - </quote> (or, occasionally, `Y'); this is for people who have no - idea what you are talking about. - </para> - </listitem> - - <listitem> - <para> - Edit the <filename>Makefile</filename>: the CONFIG variables are - exported here so you can usually just add a "obj-$(CONFIG_xxx) += - xxx.o" line. The syntax is documented in - <filename>Documentation/kbuild/makefiles.txt</filename>. - </para> - </listitem> - - <listitem> - <para> - Put yourself in <filename>CREDITS</filename> if you've done - something noteworthy, usually beyond a single file (your name - should be at the top of the source files anyway). - <filename>MAINTAINERS</filename> means you want to be consulted - when changes are made to a subsystem, and hear about bugs; it - implies a more-than-passing commitment to some part of the code. - </para> - </listitem> - - <listitem> - <para> - Finally, don't forget to read <filename>Documentation/process/submitting-patches.rst</filename> - and possibly <filename>Documentation/process/submitting-drivers.rst</filename>. - </para> - </listitem> - </itemizedlist> - </chapter> - - <chapter id="cantrips"> - <title>Kernel Cantrips</title> - - <para> - Some favorites from browsing the source. Feel free to add to this - list. - </para> - - <para> - <filename>arch/x86/include/asm/delay.h:</filename> - </para> - <programlisting> -#define ndelay(n) (__builtin_constant_p(n) ? \ - ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \ - __ndelay(n)) - </programlisting> - - <para> - <filename>include/linux/fs.h</filename>: - </para> - <programlisting> -/* - * Kernel pointers have redundant information, so we can use a - * scheme where we can return either an error code or a dentry - * pointer with the same return value. - * - * This should be a per-architecture thing, to allow different - * error and pointer decisions. - */ - #define ERR_PTR(err) ((void *)((long)(err))) - #define PTR_ERR(ptr) ((long)(ptr)) - #define IS_ERR(ptr) ((unsigned long)(ptr) > (unsigned long)(-1000)) -</programlisting> - - <para> - <filename>arch/x86/include/asm/uaccess_32.h:</filename> - </para> - - <programlisting> -#define copy_to_user(to,from,n) \ - (__builtin_constant_p(n) ? \ - __constant_copy_to_user((to),(from),(n)) : \ - __generic_copy_to_user((to),(from),(n))) - </programlisting> - - <para> - <filename>arch/sparc/kernel/head.S:</filename> - </para> - - <programlisting> -/* - * Sun people can't spell worth damn. "compatability" indeed. - * At least we *know* we can't spell, and use a spell-checker. - */ - -/* Uh, actually Linus it is I who cannot spell. Too much murky - * Sparc assembly will do this to ya. - */ -C_LABEL(cputypvar): - .asciz "compatibility" - -/* Tested on SS-5, SS-10. Probably someone at Sun applied a spell-checker. */ - .align 4 -C_LABEL(cputypvar_sun4m): - .asciz "compatible" - </programlisting> - - <para> - <filename>arch/sparc/lib/checksum.S:</filename> - </para> - - <programlisting> - /* Sun, you just can't beat me, you just can't. Stop trying, - * give up. I'm serious, I am going to kick the living shit - * out of you, game over, lights out. - */ - </programlisting> - </chapter> - - <chapter id="credits"> - <title>Thanks</title> - - <para> - Thanks to Andi Kleen for the idea, answering my questions, fixing - my mistakes, filling content, etc. Philipp Rumpf for more spelling - and clarity fixes, and some excellent non-obvious points. Werner - Almesberger for giving me a great summary of - <function>disable_irq()</function>, and Jes Sorensen and Andrea - Arcangeli added caveats. Michael Elizabeth Chastain for checking - and adding to the Configure section. <!-- Rusty insisted on this - bit; I didn't do it! --> Telsa Gwynne for teaching me DocBook. - </para> - </chapter> -</book> - diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl deleted file mode 100644 index 7c9cc4846cb6..000000000000 --- a/Documentation/DocBook/kernel-locking.tmpl +++ /dev/null @@ -1,2151 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="LKLockingGuide"> - <bookinfo> - <title>Unreliable Guide To Locking</title> - - <authorgroup> - <author> - <firstname>Rusty</firstname> - <surname>Russell</surname> - <affiliation> - <address> - <email>rusty@rustcorp.com.au</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2003</year> - <holder>Rusty Russell</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - - <toc></toc> - <chapter id="intro"> - <title>Introduction</title> - <para> - Welcome, to Rusty's Remarkably Unreliable Guide to Kernel - Locking issues. This document describes the locking systems in - the Linux Kernel in 2.6. - </para> - <para> - With the wide availability of HyperThreading, and <firstterm - linkend="gloss-preemption">preemption </firstterm> in the Linux - Kernel, everyone hacking on the kernel needs to know the - fundamentals of concurrency and locking for - <firstterm linkend="gloss-smp"><acronym>SMP</acronym></firstterm>. - </para> - </chapter> - - <chapter id="races"> - <title>The Problem With Concurrency</title> - <para> - (Skip this if you know what a Race Condition is). - </para> - <para> - In a normal program, you can increment a counter like so: - </para> - <programlisting> - very_important_count++; - </programlisting> - - <para> - This is what they would expect to happen: - </para> - - <table> - <title>Expected Results</title> - - <tgroup cols="2" align="left"> - - <thead> - <row> - <entry>Instance 1</entry> - <entry>Instance 2</entry> - </row> - </thead> - - <tbody> - <row> - <entry>read very_important_count (5)</entry> - <entry></entry> - </row> - <row> - <entry>add 1 (6)</entry> - <entry></entry> - </row> - <row> - <entry>write very_important_count (6)</entry> - <entry></entry> - </row> - <row> - <entry></entry> - <entry>read very_important_count (6)</entry> - </row> - <row> - <entry></entry> - <entry>add 1 (7)</entry> - </row> - <row> - <entry></entry> - <entry>write very_important_count (7)</entry> - </row> - </tbody> - - </tgroup> - </table> - - <para> - This is what might happen: - </para> - - <table> - <title>Possible Results</title> - - <tgroup cols="2" align="left"> - <thead> - <row> - <entry>Instance 1</entry> - <entry>Instance 2</entry> - </row> - </thead> - - <tbody> - <row> - <entry>read very_important_count (5)</entry> - <entry></entry> - </row> - <row> - <entry></entry> - <entry>read very_important_count (5)</entry> - </row> - <row> - <entry>add 1 (6)</entry> - <entry></entry> - </row> - <row> - <entry></entry> - <entry>add 1 (6)</entry> - </row> - <row> - <entry>write very_important_count (6)</entry> - <entry></entry> - </row> - <row> - <entry></entry> - <entry>write very_important_count (6)</entry> - </row> - </tbody> - </tgroup> - </table> - - <sect1 id="race-condition"> - <title>Race Conditions and Critical Regions</title> - <para> - This overlap, where the result depends on the - relative timing of multiple tasks, is called a <firstterm>race condition</firstterm>. - The piece of code containing the concurrency issue is called a - <firstterm>critical region</firstterm>. And especially since Linux starting running - on SMP machines, they became one of the major issues in kernel - design and implementation. - </para> - <para> - Preemption can have the same effect, even if there is only one - CPU: by preempting one task during the critical region, we have - exactly the same race condition. In this case the thread which - preempts might run the critical region itself. - </para> - <para> - The solution is to recognize when these simultaneous accesses - occur, and use locks to make sure that only one instance can - enter the critical region at any time. There are many - friendly primitives in the Linux kernel to help you do this. - And then there are the unfriendly primitives, but I'll pretend - they don't exist. - </para> - </sect1> - </chapter> - - <chapter id="locks"> - <title>Locking in the Linux Kernel</title> - - <para> - If I could give you one piece of advice: never sleep with anyone - crazier than yourself. But if I had to give you advice on - locking: <emphasis>keep it simple</emphasis>. - </para> - - <para> - Be reluctant to introduce new locks. - </para> - - <para> - Strangely enough, this last one is the exact reverse of my advice when - you <emphasis>have</emphasis> slept with someone crazier than yourself. - And you should think about getting a big dog. - </para> - - <sect1 id="lock-intro"> - <title>Two Main Types of Kernel Locks: Spinlocks and Mutexes</title> - - <para> - There are two main types of kernel locks. The fundamental type - is the spinlock - (<filename class="headerfile">include/asm/spinlock.h</filename>), - which is a very simple single-holder lock: if you can't get the - spinlock, you keep trying (spinning) until you can. Spinlocks are - very small and fast, and can be used anywhere. - </para> - <para> - The second type is a mutex - (<filename class="headerfile">include/linux/mutex.h</filename>): it - is like a spinlock, but you may block holding a mutex. - If you can't lock a mutex, your task will suspend itself, and be woken - up when the mutex is released. This means the CPU can do something - else while you are waiting. There are many cases when you simply - can't sleep (see <xref linkend="sleeping-things"/>), and so have to - use a spinlock instead. - </para> - <para> - Neither type of lock is recursive: see - <xref linkend="deadlock"/>. - </para> - </sect1> - - <sect1 id="uniprocessor"> - <title>Locks and Uniprocessor Kernels</title> - - <para> - For kernels compiled without <symbol>CONFIG_SMP</symbol>, and - without <symbol>CONFIG_PREEMPT</symbol> spinlocks do not exist at - all. This is an excellent design decision: when no-one else can - run at the same time, there is no reason to have a lock. - </para> - - <para> - If the kernel is compiled without <symbol>CONFIG_SMP</symbol>, - but <symbol>CONFIG_PREEMPT</symbol> is set, then spinlocks - simply disable preemption, which is sufficient to prevent any - races. For most purposes, we can think of preemption as - equivalent to SMP, and not worry about it separately. - </para> - - <para> - You should always test your locking code with <symbol>CONFIG_SMP</symbol> - and <symbol>CONFIG_PREEMPT</symbol> enabled, even if you don't have an SMP test box, because it - will still catch some kinds of locking bugs. - </para> - - <para> - Mutexes still exist, because they are required for - synchronization between <firstterm linkend="gloss-usercontext">user - contexts</firstterm>, as we will see below. - </para> - </sect1> - - <sect1 id="usercontextlocking"> - <title>Locking Only In User Context</title> - - <para> - If you have a data structure which is only ever accessed from - user context, then you can use a simple mutex - (<filename>include/linux/mutex.h</filename>) to protect it. This - is the most trivial case: you initialize the mutex. Then you can - call <function>mutex_lock_interruptible()</function> to grab the mutex, - and <function>mutex_unlock()</function> to release it. There is also a - <function>mutex_lock()</function>, which should be avoided, because it - will not return if a signal is received. - </para> - - <para> - Example: <filename>net/netfilter/nf_sockopt.c</filename> allows - registration of new <function>setsockopt()</function> and - <function>getsockopt()</function> calls, with - <function>nf_register_sockopt()</function>. Registration and - de-registration are only done on module load and unload (and boot - time, where there is no concurrency), and the list of registrations - is only consulted for an unknown <function>setsockopt()</function> - or <function>getsockopt()</function> system call. The - <varname>nf_sockopt_mutex</varname> is perfect to protect this, - especially since the setsockopt and getsockopt calls may well - sleep. - </para> - </sect1> - - <sect1 id="lock-user-bh"> - <title>Locking Between User Context and Softirqs</title> - - <para> - If a <firstterm linkend="gloss-softirq">softirq</firstterm> shares - data with user context, you have two problems. Firstly, the current - user context can be interrupted by a softirq, and secondly, the - critical region could be entered from another CPU. This is where - <function>spin_lock_bh()</function> - (<filename class="headerfile">include/linux/spinlock.h</filename>) is - used. It disables softirqs on that CPU, then grabs the lock. - <function>spin_unlock_bh()</function> does the reverse. (The - '_bh' suffix is a historical reference to "Bottom Halves", the - old name for software interrupts. It should really be - called spin_lock_softirq()' in a perfect world). - </para> - - <para> - Note that you can also use <function>spin_lock_irq()</function> - or <function>spin_lock_irqsave()</function> here, which stop - hardware interrupts as well: see <xref linkend="hardirq-context"/>. - </para> - - <para> - This works perfectly for <firstterm linkend="gloss-up"><acronym>UP - </acronym></firstterm> as well: the spin lock vanishes, and this macro - simply becomes <function>local_bh_disable()</function> - (<filename class="headerfile">include/linux/interrupt.h</filename>), which - protects you from the softirq being run. - </para> - </sect1> - - <sect1 id="lock-user-tasklet"> - <title>Locking Between User Context and Tasklets</title> - - <para> - This is exactly the same as above, because <firstterm - linkend="gloss-tasklet">tasklets</firstterm> are actually run - from a softirq. - </para> - </sect1> - - <sect1 id="lock-user-timers"> - <title>Locking Between User Context and Timers</title> - - <para> - This, too, is exactly the same as above, because <firstterm - linkend="gloss-timers">timers</firstterm> are actually run from - a softirq. From a locking point of view, tasklets and timers - are identical. - </para> - </sect1> - - <sect1 id="lock-tasklets"> - <title>Locking Between Tasklets/Timers</title> - - <para> - Sometimes a tasklet or timer might want to share data with - another tasklet or timer. - </para> - - <sect2 id="lock-tasklets-same"> - <title>The Same Tasklet/Timer</title> - <para> - Since a tasklet is never run on two CPUs at once, you don't - need to worry about your tasklet being reentrant (running - twice at once), even on SMP. - </para> - </sect2> - - <sect2 id="lock-tasklets-different"> - <title>Different Tasklets/Timers</title> - <para> - If another tasklet/timer wants - to share data with your tasklet or timer , you will both need to use - <function>spin_lock()</function> and - <function>spin_unlock()</function> calls. - <function>spin_lock_bh()</function> is - unnecessary here, as you are already in a tasklet, and - none will be run on the same CPU. - </para> - </sect2> - </sect1> - - <sect1 id="lock-softirqs"> - <title>Locking Between Softirqs</title> - - <para> - Often a softirq might - want to share data with itself or a tasklet/timer. - </para> - - <sect2 id="lock-softirqs-same"> - <title>The Same Softirq</title> - - <para> - The same softirq can run on the other CPUs: you can use a - per-CPU array (see <xref linkend="per-cpu"/>) for better - performance. If you're going so far as to use a softirq, - you probably care about scalable performance enough - to justify the extra complexity. - </para> - - <para> - You'll need to use <function>spin_lock()</function> and - <function>spin_unlock()</function> for shared data. - </para> - </sect2> - - <sect2 id="lock-softirqs-different"> - <title>Different Softirqs</title> - - <para> - You'll need to use <function>spin_lock()</function> and - <function>spin_unlock()</function> for shared data, whether it - be a timer, tasklet, different softirq or the same or another - softirq: any of them could be running on a different CPU. - </para> - </sect2> - </sect1> - </chapter> - - <chapter id="hardirq-context"> - <title>Hard IRQ Context</title> - - <para> - Hardware interrupts usually communicate with a - tasklet or softirq. Frequently this involves putting work in a - queue, which the softirq will take out. - </para> - - <sect1 id="hardirq-softirq"> - <title>Locking Between Hard IRQ and Softirqs/Tasklets</title> - - <para> - If a hardware irq handler shares data with a softirq, you have - two concerns. Firstly, the softirq processing can be - interrupted by a hardware interrupt, and secondly, the - critical region could be entered by a hardware interrupt on - another CPU. This is where <function>spin_lock_irq()</function> is - used. It is defined to disable interrupts on that cpu, then grab - the lock. <function>spin_unlock_irq()</function> does the reverse. - </para> - - <para> - The irq handler does not to use - <function>spin_lock_irq()</function>, because the softirq cannot - run while the irq handler is running: it can use - <function>spin_lock()</function>, which is slightly faster. The - only exception would be if a different hardware irq handler uses - the same lock: <function>spin_lock_irq()</function> will stop - that from interrupting us. - </para> - - <para> - This works perfectly for UP as well: the spin lock vanishes, - and this macro simply becomes <function>local_irq_disable()</function> - (<filename class="headerfile">include/asm/smp.h</filename>), which - protects you from the softirq/tasklet/BH being run. - </para> - - <para> - <function>spin_lock_irqsave()</function> - (<filename>include/linux/spinlock.h</filename>) is a variant - which saves whether interrupts were on or off in a flags word, - which is passed to <function>spin_unlock_irqrestore()</function>. This - means that the same code can be used inside an hard irq handler (where - interrupts are already off) and in softirqs (where the irq - disabling is required). - </para> - - <para> - Note that softirqs (and hence tasklets and timers) are run on - return from hardware interrupts, so - <function>spin_lock_irq()</function> also stops these. In that - sense, <function>spin_lock_irqsave()</function> is the most - general and powerful locking function. - </para> - - </sect1> - <sect1 id="hardirq-hardirq"> - <title>Locking Between Two Hard IRQ Handlers</title> - <para> - It is rare to have to share data between two IRQ handlers, but - if you do, <function>spin_lock_irqsave()</function> should be - used: it is architecture-specific whether all interrupts are - disabled inside irq handlers themselves. - </para> - </sect1> - - </chapter> - - <chapter id="cheatsheet"> - <title>Cheat Sheet For Locking</title> - <para> - Pete Zaitcev gives the following summary: - </para> - <itemizedlist> - <listitem> - <para> - If you are in a process context (any syscall) and want to - lock other process out, use a mutex. You can take a mutex - and sleep (<function>copy_from_user*(</function> or - <function>kmalloc(x,GFP_KERNEL)</function>). - </para> - </listitem> - <listitem> - <para> - Otherwise (== data can be touched in an interrupt), use - <function>spin_lock_irqsave()</function> and - <function>spin_unlock_irqrestore()</function>. - </para> - </listitem> - <listitem> - <para> - Avoid holding spinlock for more than 5 lines of code and - across any function call (except accessors like - <function>readb</function>). - </para> - </listitem> - </itemizedlist> - - <sect1 id="minimum-lock-reqirements"> - <title>Table of Minimum Requirements</title> - - <para> The following table lists the <emphasis>minimum</emphasis> - locking requirements between various contexts. In some cases, - the same context can only be running on one CPU at a time, so - no locking is required for that context (eg. a particular - thread can only run on one CPU at a time, but if it needs - shares data with another thread, locking is required). - </para> - <para> - Remember the advice above: you can always use - <function>spin_lock_irqsave()</function>, which is a superset - of all other spinlock primitives. - </para> - - <table> -<title>Table of Locking Requirements</title> -<tgroup cols="11"> -<tbody> - -<row> -<entry></entry> -<entry>IRQ Handler A</entry> -<entry>IRQ Handler B</entry> -<entry>Softirq A</entry> -<entry>Softirq B</entry> -<entry>Tasklet A</entry> -<entry>Tasklet B</entry> -<entry>Timer A</entry> -<entry>Timer B</entry> -<entry>User Context A</entry> -<entry>User Context B</entry> -</row> - -<row> -<entry>IRQ Handler A</entry> -<entry>None</entry> -</row> - -<row> -<entry>IRQ Handler B</entry> -<entry>SLIS</entry> -<entry>None</entry> -</row> - -<row> -<entry>Softirq A</entry> -<entry>SLI</entry> -<entry>SLI</entry> -<entry>SL</entry> -</row> - -<row> -<entry>Softirq B</entry> -<entry>SLI</entry> -<entry>SLI</entry> -<entry>SL</entry> -<entry>SL</entry> -</row> - -<row> -<entry>Tasklet A</entry> -<entry>SLI</entry> -<entry>SLI</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>None</entry> -</row> - -<row> -<entry>Tasklet B</entry> -<entry>SLI</entry> -<entry>SLI</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>None</entry> -</row> - -<row> -<entry>Timer A</entry> -<entry>SLI</entry> -<entry>SLI</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>None</entry> -</row> - -<row> -<entry>Timer B</entry> -<entry>SLI</entry> -<entry>SLI</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>None</entry> -</row> - -<row> -<entry>User Context A</entry> -<entry>SLI</entry> -<entry>SLI</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>None</entry> -</row> - -<row> -<entry>User Context B</entry> -<entry>SLI</entry> -<entry>SLI</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>MLI</entry> -<entry>None</entry> -</row> - -</tbody> -</tgroup> -</table> - - <table> -<title>Legend for Locking Requirements Table</title> -<tgroup cols="2"> -<tbody> - -<row> -<entry>SLIS</entry> -<entry>spin_lock_irqsave</entry> -</row> -<row> -<entry>SLI</entry> -<entry>spin_lock_irq</entry> -</row> -<row> -<entry>SL</entry> -<entry>spin_lock</entry> -</row> -<row> -<entry>SLBH</entry> -<entry>spin_lock_bh</entry> -</row> -<row> -<entry>MLI</entry> -<entry>mutex_lock_interruptible</entry> -</row> - -</tbody> -</tgroup> -</table> - -</sect1> -</chapter> - -<chapter id="trylock-functions"> - <title>The trylock Functions</title> - <para> - There are functions that try to acquire a lock only once and immediately - return a value telling about success or failure to acquire the lock. - They can be used if you need no access to the data protected with the lock - when some other thread is holding the lock. You should acquire the lock - later if you then need access to the data protected with the lock. - </para> - - <para> - <function>spin_trylock()</function> does not spin but returns non-zero if - it acquires the spinlock on the first try or 0 if not. This function can - be used in all contexts like <function>spin_lock</function>: you must have - disabled the contexts that might interrupt you and acquire the spin lock. - </para> - - <para> - <function>mutex_trylock()</function> does not suspend your task - but returns non-zero if it could lock the mutex on the first try - or 0 if not. This function cannot be safely used in hardware or software - interrupt contexts despite not sleeping. - </para> -</chapter> - - <chapter id="Examples"> - <title>Common Examples</title> - <para> -Let's step through a simple example: a cache of number to name -mappings. The cache keeps a count of how often each of the objects is -used, and when it gets full, throws out the least used one. - - </para> - - <sect1 id="examples-usercontext"> - <title>All In User Context</title> - <para> -For our first example, we assume that all operations are in user -context (ie. from system calls), so we can sleep. This means we can -use a mutex to protect the cache and all the objects within -it. Here's the code: - </para> - - <programlisting> -#include <linux/list.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/mutex.h> -#include <asm/errno.h> - -struct object -{ - struct list_head list; - int id; - char name[32]; - int popularity; -}; - -/* Protects the cache, cache_num, and the objects within it */ -static DEFINE_MUTEX(cache_lock); -static LIST_HEAD(cache); -static unsigned int cache_num = 0; -#define MAX_CACHE_SIZE 10 - -/* Must be holding cache_lock */ -static struct object *__cache_find(int id) -{ - struct object *i; - - list_for_each_entry(i, &cache, list) - if (i->id == id) { - i->popularity++; - return i; - } - return NULL; -} - -/* Must be holding cache_lock */ -static void __cache_delete(struct object *obj) -{ - BUG_ON(!obj); - list_del(&obj->list); - kfree(obj); - cache_num--; -} - -/* Must be holding cache_lock */ -static void __cache_add(struct object *obj) -{ - list_add(&obj->list, &cache); - if (++cache_num > MAX_CACHE_SIZE) { - struct object *i, *outcast = NULL; - list_for_each_entry(i, &cache, list) { - if (!outcast || i->popularity < outcast->popularity) - outcast = i; - } - __cache_delete(outcast); - } -} - -int cache_add(int id, const char *name) -{ - struct object *obj; - - if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL) - return -ENOMEM; - - strlcpy(obj->name, name, sizeof(obj->name)); - obj->id = id; - obj->popularity = 0; - - mutex_lock(&cache_lock); - __cache_add(obj); - mutex_unlock(&cache_lock); - return 0; -} - -void cache_delete(int id) -{ - mutex_lock(&cache_lock); - __cache_delete(__cache_find(id)); - mutex_unlock(&cache_lock); -} - -int cache_find(int id, char *name) -{ - struct object *obj; - int ret = -ENOENT; - - mutex_lock(&cache_lock); - obj = __cache_find(id); - if (obj) { - ret = 0; - strcpy(name, obj->name); - } - mutex_unlock(&cache_lock); - return ret; -} -</programlisting> - - <para> -Note that we always make sure we have the cache_lock when we add, -delete, or look up the cache: both the cache infrastructure itself and -the contents of the objects are protected by the lock. In this case -it's easy, since we copy the data for the user, and never let them -access the objects directly. - </para> - <para> -There is a slight (and common) optimization here: in -<function>cache_add</function> we set up the fields of the object -before grabbing the lock. This is safe, as no-one else can access it -until we put it in cache. - </para> - </sect1> - - <sect1 id="examples-interrupt"> - <title>Accessing From Interrupt Context</title> - <para> -Now consider the case where <function>cache_find</function> can be -called from interrupt context: either a hardware interrupt or a -softirq. An example would be a timer which deletes object from the -cache. - </para> - <para> -The change is shown below, in standard patch format: the -<symbol>-</symbol> are lines which are taken away, and the -<symbol>+</symbol> are lines which are added. - </para> -<programlisting> ---- cache.c.usercontext 2003-12-09 13:58:54.000000000 +1100 -+++ cache.c.interrupt 2003-12-09 14:07:49.000000000 +1100 -@@ -12,7 +12,7 @@ - int popularity; - }; - --static DEFINE_MUTEX(cache_lock); -+static DEFINE_SPINLOCK(cache_lock); - static LIST_HEAD(cache); - static unsigned int cache_num = 0; - #define MAX_CACHE_SIZE 10 -@@ -55,6 +55,7 @@ - int cache_add(int id, const char *name) - { - struct object *obj; -+ unsigned long flags; - - if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL) - return -ENOMEM; -@@ -63,30 +64,33 @@ - obj->id = id; - obj->popularity = 0; - -- mutex_lock(&cache_lock); -+ spin_lock_irqsave(&cache_lock, flags); - __cache_add(obj); -- mutex_unlock(&cache_lock); -+ spin_unlock_irqrestore(&cache_lock, flags); - return 0; - } - - void cache_delete(int id) - { -- mutex_lock(&cache_lock); -+ unsigned long flags; -+ -+ spin_lock_irqsave(&cache_lock, flags); - __cache_delete(__cache_find(id)); -- mutex_unlock(&cache_lock); -+ spin_unlock_irqrestore(&cache_lock, flags); - } - - int cache_find(int id, char *name) - { - struct object *obj; - int ret = -ENOENT; -+ unsigned long flags; - -- mutex_lock(&cache_lock); -+ spin_lock_irqsave(&cache_lock, flags); - obj = __cache_find(id); - if (obj) { - ret = 0; - strcpy(name, obj->name); - } -- mutex_unlock(&cache_lock); -+ spin_unlock_irqrestore(&cache_lock, flags); - return ret; - } -</programlisting> - - <para> -Note that the <function>spin_lock_irqsave</function> will turn off -interrupts if they are on, otherwise does nothing (if we are already -in an interrupt handler), hence these functions are safe to call from -any context. - </para> - <para> -Unfortunately, <function>cache_add</function> calls -<function>kmalloc</function> with the <symbol>GFP_KERNEL</symbol> -flag, which is only legal in user context. I have assumed that -<function>cache_add</function> is still only called in user context, -otherwise this should become a parameter to -<function>cache_add</function>. - </para> - </sect1> - <sect1 id="examples-refcnt"> - <title>Exposing Objects Outside This File</title> - <para> -If our objects contained more information, it might not be sufficient -to copy the information in and out: other parts of the code might want -to keep pointers to these objects, for example, rather than looking up -the id every time. This produces two problems. - </para> - <para> -The first problem is that we use the <symbol>cache_lock</symbol> to -protect objects: we'd need to make this non-static so the rest of the -code can use it. This makes locking trickier, as it is no longer all -in one place. - </para> - <para> -The second problem is the lifetime problem: if another structure keeps -a pointer to an object, it presumably expects that pointer to remain -valid. Unfortunately, this is only guaranteed while you hold the -lock, otherwise someone might call <function>cache_delete</function> -and even worse, add another object, re-using the same address. - </para> - <para> -As there is only one lock, you can't hold it forever: no-one else would -get any work done. - </para> - <para> -The solution to this problem is to use a reference count: everyone who -has a pointer to the object increases it when they first get the -object, and drops the reference count when they're finished with it. -Whoever drops it to zero knows it is unused, and can actually delete it. - </para> - <para> -Here is the code: - </para> - -<programlisting> ---- cache.c.interrupt 2003-12-09 14:25:43.000000000 +1100 -+++ cache.c.refcnt 2003-12-09 14:33:05.000000000 +1100 -@@ -7,6 +7,7 @@ - struct object - { - struct list_head list; -+ unsigned int refcnt; - int id; - char name[32]; - int popularity; -@@ -17,6 +18,35 @@ - static unsigned int cache_num = 0; - #define MAX_CACHE_SIZE 10 - -+static void __object_put(struct object *obj) -+{ -+ if (--obj->refcnt == 0) -+ kfree(obj); -+} -+ -+static void __object_get(struct object *obj) -+{ -+ obj->refcnt++; -+} -+ -+void object_put(struct object *obj) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&cache_lock, flags); -+ __object_put(obj); -+ spin_unlock_irqrestore(&cache_lock, flags); -+} -+ -+void object_get(struct object *obj) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&cache_lock, flags); -+ __object_get(obj); -+ spin_unlock_irqrestore(&cache_lock, flags); -+} -+ - /* Must be holding cache_lock */ - static struct object *__cache_find(int id) - { -@@ -35,6 +65,7 @@ - { - BUG_ON(!obj); - list_del(&obj->list); -+ __object_put(obj); - cache_num--; - } - -@@ -63,6 +94,7 @@ - strlcpy(obj->name, name, sizeof(obj->name)); - obj->id = id; - obj->popularity = 0; -+ obj->refcnt = 1; /* The cache holds a reference */ - - spin_lock_irqsave(&cache_lock, flags); - __cache_add(obj); -@@ -79,18 +111,15 @@ - spin_unlock_irqrestore(&cache_lock, flags); - } - --int cache_find(int id, char *name) -+struct object *cache_find(int id) - { - struct object *obj; -- int ret = -ENOENT; - unsigned long flags; - - spin_lock_irqsave(&cache_lock, flags); - obj = __cache_find(id); -- if (obj) { -- ret = 0; -- strcpy(name, obj->name); -- } -+ if (obj) -+ __object_get(obj); - spin_unlock_irqrestore(&cache_lock, flags); -- return ret; -+ return obj; - } -</programlisting> - -<para> -We encapsulate the reference counting in the standard 'get' and 'put' -functions. Now we can return the object itself from -<function>cache_find</function> which has the advantage that the user -can now sleep holding the object (eg. to -<function>copy_to_user</function> to name to userspace). -</para> -<para> -The other point to note is that I said a reference should be held for -every pointer to the object: thus the reference count is 1 when first -inserted into the cache. In some versions the framework does not hold -a reference count, but they are more complicated. -</para> - - <sect2 id="examples-refcnt-atomic"> - <title>Using Atomic Operations For The Reference Count</title> -<para> -In practice, <type>atomic_t</type> would usually be used for -<structfield>refcnt</structfield>. There are a number of atomic -operations defined in - -<filename class="headerfile">include/asm/atomic.h</filename>: these are -guaranteed to be seen atomically from all CPUs in the system, so no -lock is required. In this case, it is simpler than using spinlocks, -although for anything non-trivial using spinlocks is clearer. The -<function>atomic_inc</function> and -<function>atomic_dec_and_test</function> are used instead of the -standard increment and decrement operators, and the lock is no longer -used to protect the reference count itself. -</para> - -<programlisting> ---- cache.c.refcnt 2003-12-09 15:00:35.000000000 +1100 -+++ cache.c.refcnt-atomic 2003-12-11 15:49:42.000000000 +1100 -@@ -7,7 +7,7 @@ - struct object - { - struct list_head list; -- unsigned int refcnt; -+ atomic_t refcnt; - int id; - char name[32]; - int popularity; -@@ -18,33 +18,15 @@ - static unsigned int cache_num = 0; - #define MAX_CACHE_SIZE 10 - --static void __object_put(struct object *obj) --{ -- if (--obj->refcnt == 0) -- kfree(obj); --} -- --static void __object_get(struct object *obj) --{ -- obj->refcnt++; --} -- - void object_put(struct object *obj) - { -- unsigned long flags; -- -- spin_lock_irqsave(&cache_lock, flags); -- __object_put(obj); -- spin_unlock_irqrestore(&cache_lock, flags); -+ if (atomic_dec_and_test(&obj->refcnt)) -+ kfree(obj); - } - - void object_get(struct object *obj) - { -- unsigned long flags; -- -- spin_lock_irqsave(&cache_lock, flags); -- __object_get(obj); -- spin_unlock_irqrestore(&cache_lock, flags); -+ atomic_inc(&obj->refcnt); - } - - /* Must be holding cache_lock */ -@@ -65,7 +47,7 @@ - { - BUG_ON(!obj); - list_del(&obj->list); -- __object_put(obj); -+ object_put(obj); - cache_num--; - } - -@@ -94,7 +76,7 @@ - strlcpy(obj->name, name, sizeof(obj->name)); - obj->id = id; - obj->popularity = 0; -- obj->refcnt = 1; /* The cache holds a reference */ -+ atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ - - spin_lock_irqsave(&cache_lock, flags); - __cache_add(obj); -@@ -119,7 +101,7 @@ - spin_lock_irqsave(&cache_lock, flags); - obj = __cache_find(id); - if (obj) -- __object_get(obj); -+ object_get(obj); - spin_unlock_irqrestore(&cache_lock, flags); - return obj; - } -</programlisting> -</sect2> -</sect1> - - <sect1 id="examples-lock-per-obj"> - <title>Protecting The Objects Themselves</title> - <para> -In these examples, we assumed that the objects (except the reference -counts) never changed once they are created. If we wanted to allow -the name to change, there are three possibilities: - </para> - <itemizedlist> - <listitem> - <para> -You can make <symbol>cache_lock</symbol> non-static, and tell people -to grab that lock before changing the name in any object. - </para> - </listitem> - <listitem> - <para> -You can provide a <function>cache_obj_rename</function> which grabs -this lock and changes the name for the caller, and tell everyone to -use that function. - </para> - </listitem> - <listitem> - <para> -You can make the <symbol>cache_lock</symbol> protect only the cache -itself, and use another lock to protect the name. - </para> - </listitem> - </itemizedlist> - - <para> -Theoretically, you can make the locks as fine-grained as one lock for -every field, for every object. In practice, the most common variants -are: -</para> - <itemizedlist> - <listitem> - <para> -One lock which protects the infrastructure (the <symbol>cache</symbol> -list in this example) and all the objects. This is what we have done -so far. - </para> - </listitem> - <listitem> - <para> -One lock which protects the infrastructure (including the list -pointers inside the objects), and one lock inside the object which -protects the rest of that object. - </para> - </listitem> - <listitem> - <para> -Multiple locks to protect the infrastructure (eg. one lock per hash -chain), possibly with a separate per-object lock. - </para> - </listitem> - </itemizedlist> - -<para> -Here is the "lock-per-object" implementation: -</para> -<programlisting> ---- cache.c.refcnt-atomic 2003-12-11 15:50:54.000000000 +1100 -+++ cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100 -@@ -6,11 +6,17 @@ - - struct object - { -+ /* These two protected by cache_lock. */ - struct list_head list; -+ int popularity; -+ - atomic_t refcnt; -+ -+ /* Doesn't change once created. */ - int id; -+ -+ spinlock_t lock; /* Protects the name */ - char name[32]; -- int popularity; - }; - - static DEFINE_SPINLOCK(cache_lock); -@@ -77,6 +84,7 @@ - obj->id = id; - obj->popularity = 0; - atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ -+ spin_lock_init(&obj->lock); - - spin_lock_irqsave(&cache_lock, flags); - __cache_add(obj); -</programlisting> - -<para> -Note that I decide that the <structfield>popularity</structfield> -count should be protected by the <symbol>cache_lock</symbol> rather -than the per-object lock: this is because it (like the -<structname>struct list_head</structname> inside the object) is -logically part of the infrastructure. This way, I don't need to grab -the lock of every object in <function>__cache_add</function> when -seeking the least popular. -</para> - -<para> -I also decided that the <structfield>id</structfield> member is -unchangeable, so I don't need to grab each object lock in -<function>__cache_find()</function> to examine the -<structfield>id</structfield>: the object lock is only used by a -caller who wants to read or write the <structfield>name</structfield> -field. -</para> - -<para> -Note also that I added a comment describing what data was protected by -which locks. This is extremely important, as it describes the runtime -behavior of the code, and can be hard to gain from just reading. And -as Alan Cox says, <quote>Lock data, not code</quote>. -</para> -</sect1> -</chapter> - - <chapter id="common-problems"> - <title>Common Problems</title> - <sect1 id="deadlock"> - <title>Deadlock: Simple and Advanced</title> - - <para> - There is a coding bug where a piece of code tries to grab a - spinlock twice: it will spin forever, waiting for the lock to - be released (spinlocks, rwlocks and mutexes are not - recursive in Linux). This is trivial to diagnose: not a - stay-up-five-nights-talk-to-fluffy-code-bunnies kind of - problem. - </para> - - <para> - For a slightly more complex case, imagine you have a region - shared by a softirq and user context. If you use a - <function>spin_lock()</function> call to protect it, it is - possible that the user context will be interrupted by the softirq - while it holds the lock, and the softirq will then spin - forever trying to get the same lock. - </para> - - <para> - Both of these are called deadlock, and as shown above, it can - occur even with a single CPU (although not on UP compiles, - since spinlocks vanish on kernel compiles with - <symbol>CONFIG_SMP</symbol>=n. You'll still get data corruption - in the second example). - </para> - - <para> - This complete lockup is easy to diagnose: on SMP boxes the - watchdog timer or compiling with <symbol>DEBUG_SPINLOCK</symbol> set - (<filename>include/linux/spinlock.h</filename>) will show this up - immediately when it happens. - </para> - - <para> - A more complex problem is the so-called 'deadly embrace', - involving two or more locks. Say you have a hash table: each - entry in the table is a spinlock, and a chain of hashed - objects. Inside a softirq handler, you sometimes want to - alter an object from one place in the hash to another: you - grab the spinlock of the old hash chain and the spinlock of - the new hash chain, and delete the object from the old one, - and insert it in the new one. - </para> - - <para> - There are two problems here. First, if your code ever - tries to move the object to the same chain, it will deadlock - with itself as it tries to lock it twice. Secondly, if the - same softirq on another CPU is trying to move another object - in the reverse direction, the following could happen: - </para> - - <table> - <title>Consequences</title> - - <tgroup cols="2" align="left"> - - <thead> - <row> - <entry>CPU 1</entry> - <entry>CPU 2</entry> - </row> - </thead> - - <tbody> - <row> - <entry>Grab lock A -> OK</entry> - <entry>Grab lock B -> OK</entry> - </row> - <row> - <entry>Grab lock B -> spin</entry> - <entry>Grab lock A -> spin</entry> - </row> - </tbody> - </tgroup> - </table> - - <para> - The two CPUs will spin forever, waiting for the other to give up - their lock. It will look, smell, and feel like a crash. - </para> - </sect1> - - <sect1 id="techs-deadlock-prevent"> - <title>Preventing Deadlock</title> - - <para> - Textbooks will tell you that if you always lock in the same - order, you will never get this kind of deadlock. Practice - will tell you that this approach doesn't scale: when I - create a new lock, I don't understand enough of the kernel - to figure out where in the 5000 lock hierarchy it will fit. - </para> - - <para> - The best locks are encapsulated: they never get exposed in - headers, and are never held around calls to non-trivial - functions outside the same file. You can read through this - code and see that it will never deadlock, because it never - tries to grab another lock while it has that one. People - using your code don't even need to know you are using a - lock. - </para> - - <para> - A classic problem here is when you provide callbacks or - hooks: if you call these with the lock held, you risk simple - deadlock, or a deadly embrace (who knows what the callback - will do?). Remember, the other programmers are out to get - you, so don't do this. - </para> - - <sect2 id="techs-deadlock-overprevent"> - <title>Overzealous Prevention Of Deadlocks</title> - - <para> - Deadlocks are problematic, but not as bad as data - corruption. Code which grabs a read lock, searches a list, - fails to find what it wants, drops the read lock, grabs a - write lock and inserts the object has a race condition. - </para> - - <para> - If you don't see why, please stay the fuck away from my code. - </para> - </sect2> - </sect1> - - <sect1 id="racing-timers"> - <title>Racing Timers: A Kernel Pastime</title> - - <para> - Timers can produce their own special problems with races. - Consider a collection of objects (list, hash, etc) where each - object has a timer which is due to destroy it. - </para> - - <para> - If you want to destroy the entire collection (say on module - removal), you might do the following: - </para> - - <programlisting> - /* THIS CODE BAD BAD BAD BAD: IF IT WAS ANY WORSE IT WOULD USE - HUNGARIAN NOTATION */ - spin_lock_bh(&list_lock); - - while (list) { - struct foo *next = list->next; - del_timer(&list->timer); - kfree(list); - list = next; - } - - spin_unlock_bh(&list_lock); - </programlisting> - - <para> - Sooner or later, this will crash on SMP, because a timer can - have just gone off before the <function>spin_lock_bh()</function>, - and it will only get the lock after we - <function>spin_unlock_bh()</function>, and then try to free - the element (which has already been freed!). - </para> - - <para> - This can be avoided by checking the result of - <function>del_timer()</function>: if it returns - <returnvalue>1</returnvalue>, the timer has been deleted. - If <returnvalue>0</returnvalue>, it means (in this - case) that it is currently running, so we can do: - </para> - - <programlisting> - retry: - spin_lock_bh(&list_lock); - - while (list) { - struct foo *next = list->next; - if (!del_timer(&list->timer)) { - /* Give timer a chance to delete this */ - spin_unlock_bh(&list_lock); - goto retry; - } - kfree(list); - list = next; - } - - spin_unlock_bh(&list_lock); - </programlisting> - - <para> - Another common problem is deleting timers which restart - themselves (by calling <function>add_timer()</function> at the end - of their timer function). Because this is a fairly common case - which is prone to races, you should use <function>del_timer_sync()</function> - (<filename class="headerfile">include/linux/timer.h</filename>) - to handle this case. It returns the number of times the timer - had to be deleted before we finally stopped it from adding itself back - in. - </para> - </sect1> - - </chapter> - - <chapter id="Efficiency"> - <title>Locking Speed</title> - - <para> -There are three main things to worry about when considering speed of -some code which does locking. First is concurrency: how many things -are going to be waiting while someone else is holding a lock. Second -is the time taken to actually acquire and release an uncontended lock. -Third is using fewer, or smarter locks. I'm assuming that the lock is -used fairly often: otherwise, you wouldn't be concerned about -efficiency. -</para> - <para> -Concurrency depends on how long the lock is usually held: you should -hold the lock for as long as needed, but no longer. In the cache -example, we always create the object without the lock held, and then -grab the lock only when we are ready to insert it in the list. -</para> - <para> -Acquisition times depend on how much damage the lock operations do to -the pipeline (pipeline stalls) and how likely it is that this CPU was -the last one to grab the lock (ie. is the lock cache-hot for this -CPU): on a machine with more CPUs, this likelihood drops fast. -Consider a 700MHz Intel Pentium III: an instruction takes about 0.7ns, -an atomic increment takes about 58ns, a lock which is cache-hot on -this CPU takes 160ns, and a cacheline transfer from another CPU takes -an additional 170 to 360ns. (These figures from Paul McKenney's -<ulink url="http://www.linuxjournal.com/article.php?sid=6993"> Linux -Journal RCU article</ulink>). -</para> - <para> -These two aims conflict: holding a lock for a short time might be done -by splitting locks into parts (such as in our final per-object-lock -example), but this increases the number of lock acquisitions, and the -results are often slower than having a single lock. This is another -reason to advocate locking simplicity. -</para> - <para> -The third concern is addressed below: there are some methods to reduce -the amount of locking which needs to be done. -</para> - - <sect1 id="efficiency-rwlocks"> - <title>Read/Write Lock Variants</title> - - <para> - Both spinlocks and mutexes have read/write variants: - <type>rwlock_t</type> and <structname>struct rw_semaphore</structname>. - These divide users into two classes: the readers and the writers. If - you are only reading the data, you can get a read lock, but to write to - the data you need the write lock. Many people can hold a read lock, - but a writer must be sole holder. - </para> - - <para> - If your code divides neatly along reader/writer lines (as our - cache code does), and the lock is held by readers for - significant lengths of time, using these locks can help. They - are slightly slower than the normal locks though, so in practice - <type>rwlock_t</type> is not usually worthwhile. - </para> - </sect1> - - <sect1 id="efficiency-read-copy-update"> - <title>Avoiding Locks: Read Copy Update</title> - - <para> - There is a special method of read/write locking called Read Copy - Update. Using RCU, the readers can avoid taking a lock - altogether: as we expect our cache to be read more often than - updated (otherwise the cache is a waste of time), it is a - candidate for this optimization. - </para> - - <para> - How do we get rid of read locks? Getting rid of read locks - means that writers may be changing the list underneath the - readers. That is actually quite simple: we can read a linked - list while an element is being added if the writer adds the - element very carefully. For example, adding - <symbol>new</symbol> to a single linked list called - <symbol>list</symbol>: - </para> - - <programlisting> - new->next = list->next; - wmb(); - list->next = new; - </programlisting> - - <para> - The <function>wmb()</function> is a write memory barrier. It - ensures that the first operation (setting the new element's - <symbol>next</symbol> pointer) is complete and will be seen by - all CPUs, before the second operation is (putting the new - element into the list). This is important, since modern - compilers and modern CPUs can both reorder instructions unless - told otherwise: we want a reader to either not see the new - element at all, or see the new element with the - <symbol>next</symbol> pointer correctly pointing at the rest of - the list. - </para> - <para> - Fortunately, there is a function to do this for standard - <structname>struct list_head</structname> lists: - <function>list_add_rcu()</function> - (<filename>include/linux/list.h</filename>). - </para> - <para> - Removing an element from the list is even simpler: we replace - the pointer to the old element with a pointer to its successor, - and readers will either see it, or skip over it. - </para> - <programlisting> - list->next = old->next; - </programlisting> - <para> - There is <function>list_del_rcu()</function> - (<filename>include/linux/list.h</filename>) which does this (the - normal version poisons the old object, which we don't want). - </para> - <para> - The reader must also be careful: some CPUs can look through the - <symbol>next</symbol> pointer to start reading the contents of - the next element early, but don't realize that the pre-fetched - contents is wrong when the <symbol>next</symbol> pointer changes - underneath them. Once again, there is a - <function>list_for_each_entry_rcu()</function> - (<filename>include/linux/list.h</filename>) to help you. Of - course, writers can just use - <function>list_for_each_entry()</function>, since there cannot - be two simultaneous writers. - </para> - <para> - Our final dilemma is this: when can we actually destroy the - removed element? Remember, a reader might be stepping through - this element in the list right now: if we free this element and - the <symbol>next</symbol> pointer changes, the reader will jump - off into garbage and crash. We need to wait until we know that - all the readers who were traversing the list when we deleted the - element are finished. We use <function>call_rcu()</function> to - register a callback which will actually destroy the object once - all pre-existing readers are finished. Alternatively, - <function>synchronize_rcu()</function> may be used to block until - all pre-existing are finished. - </para> - <para> - But how does Read Copy Update know when the readers are - finished? The method is this: firstly, the readers always - traverse the list inside - <function>rcu_read_lock()</function>/<function>rcu_read_unlock()</function> - pairs: these simply disable preemption so the reader won't go to - sleep while reading the list. - </para> - <para> - RCU then waits until every other CPU has slept at least once: - since readers cannot sleep, we know that any readers which were - traversing the list during the deletion are finished, and the - callback is triggered. The real Read Copy Update code is a - little more optimized than this, but this is the fundamental - idea. - </para> - -<programlisting> ---- cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100 -+++ cache.c.rcupdate 2003-12-11 17:55:14.000000000 +1100 -@@ -1,15 +1,18 @@ - #include <linux/list.h> - #include <linux/slab.h> - #include <linux/string.h> -+#include <linux/rcupdate.h> - #include <linux/mutex.h> - #include <asm/errno.h> - - struct object - { -- /* These two protected by cache_lock. */ -+ /* This is protected by RCU */ - struct list_head list; - int popularity; - -+ struct rcu_head rcu; -+ - atomic_t refcnt; - - /* Doesn't change once created. */ -@@ -40,7 +43,7 @@ - { - struct object *i; - -- list_for_each_entry(i, &cache, list) { -+ list_for_each_entry_rcu(i, &cache, list) { - if (i->id == id) { - i->popularity++; - return i; -@@ -49,19 +52,25 @@ - return NULL; - } - -+/* Final discard done once we know no readers are looking. */ -+static void cache_delete_rcu(void *arg) -+{ -+ object_put(arg); -+} -+ - /* Must be holding cache_lock */ - static void __cache_delete(struct object *obj) - { - BUG_ON(!obj); -- list_del(&obj->list); -- object_put(obj); -+ list_del_rcu(&obj->list); - cache_num--; -+ call_rcu(&obj->rcu, cache_delete_rcu); - } - - /* Must be holding cache_lock */ - static void __cache_add(struct object *obj) - { -- list_add(&obj->list, &cache); -+ list_add_rcu(&obj->list, &cache); - if (++cache_num > MAX_CACHE_SIZE) { - struct object *i, *outcast = NULL; - list_for_each_entry(i, &cache, list) { -@@ -104,12 +114,11 @@ - struct object *cache_find(int id) - { - struct object *obj; -- unsigned long flags; - -- spin_lock_irqsave(&cache_lock, flags); -+ rcu_read_lock(); - obj = __cache_find(id); - if (obj) - object_get(obj); -- spin_unlock_irqrestore(&cache_lock, flags); -+ rcu_read_unlock(); - return obj; - } -</programlisting> - -<para> -Note that the reader will alter the -<structfield>popularity</structfield> member in -<function>__cache_find()</function>, and now it doesn't hold a lock. -One solution would be to make it an <type>atomic_t</type>, but for -this usage, we don't really care about races: an approximate result is -good enough, so I didn't change it. -</para> - -<para> -The result is that <function>cache_find()</function> requires no -synchronization with any other functions, so is almost as fast on SMP -as it would be on UP. -</para> - -<para> -There is a further optimization possible here: remember our original -cache code, where there were no reference counts and the caller simply -held the lock whenever using the object? This is still possible: if -you hold the lock, no one can delete the object, so you don't need to -get and put the reference count. -</para> - -<para> -Now, because the 'read lock' in RCU is simply disabling preemption, a -caller which always has preemption disabled between calling -<function>cache_find()</function> and -<function>object_put()</function> does not need to actually get and -put the reference count: we could expose -<function>__cache_find()</function> by making it non-static, and -such callers could simply call that. -</para> -<para> -The benefit here is that the reference count is not written to: the -object is not altered in any way, which is much faster on SMP -machines due to caching. -</para> - </sect1> - - <sect1 id="per-cpu"> - <title>Per-CPU Data</title> - - <para> - Another technique for avoiding locking which is used fairly - widely is to duplicate information for each CPU. For example, - if you wanted to keep a count of a common condition, you could - use a spin lock and a single counter. Nice and simple. - </para> - - <para> - If that was too slow (it's usually not, but if you've got a - really big machine to test on and can show that it is), you - could instead use a counter for each CPU, then none of them need - an exclusive lock. See <function>DEFINE_PER_CPU()</function>, - <function>get_cpu_var()</function> and - <function>put_cpu_var()</function> - (<filename class="headerfile">include/linux/percpu.h</filename>). - </para> - - <para> - Of particular use for simple per-cpu counters is the - <type>local_t</type> type, and the - <function>cpu_local_inc()</function> and related functions, - which are more efficient than simple code on some architectures - (<filename class="headerfile">include/asm/local.h</filename>). - </para> - - <para> - Note that there is no simple, reliable way of getting an exact - value of such a counter, without introducing more locks. This - is not a problem for some uses. - </para> - </sect1> - - <sect1 id="mostly-hardirq"> - <title>Data Which Mostly Used By An IRQ Handler</title> - - <para> - If data is always accessed from within the same IRQ handler, you - don't need a lock at all: the kernel already guarantees that the - irq handler will not run simultaneously on multiple CPUs. - </para> - <para> - Manfred Spraul points out that you can still do this, even if - the data is very occasionally accessed in user context or - softirqs/tasklets. The irq handler doesn't use a lock, and - all other accesses are done as so: - </para> - -<programlisting> - spin_lock(&lock); - disable_irq(irq); - ... - enable_irq(irq); - spin_unlock(&lock); -</programlisting> - <para> - The <function>disable_irq()</function> prevents the irq handler - from running (and waits for it to finish if it's currently - running on other CPUs). The spinlock prevents any other - accesses happening at the same time. Naturally, this is slower - than just a <function>spin_lock_irq()</function> call, so it - only makes sense if this type of access happens extremely - rarely. - </para> - </sect1> - </chapter> - - <chapter id="sleeping-things"> - <title>What Functions Are Safe To Call From Interrupts?</title> - - <para> - Many functions in the kernel sleep (ie. call schedule()) - directly or indirectly: you can never call them while holding a - spinlock, or with preemption disabled. This also means you need - to be in user context: calling them from an interrupt is illegal. - </para> - - <sect1 id="sleeping"> - <title>Some Functions Which Sleep</title> - - <para> - The most common ones are listed below, but you usually have to - read the code to find out if other calls are safe. If everyone - else who calls it can sleep, you probably need to be able to - sleep, too. In particular, registration and deregistration - functions usually expect to be called from user context, and can - sleep. - </para> - - <itemizedlist> - <listitem> - <para> - Accesses to - <firstterm linkend="gloss-userspace">userspace</firstterm>: - </para> - <itemizedlist> - <listitem> - <para> - <function>copy_from_user()</function> - </para> - </listitem> - <listitem> - <para> - <function>copy_to_user()</function> - </para> - </listitem> - <listitem> - <para> - <function>get_user()</function> - </para> - </listitem> - <listitem> - <para> - <function>put_user()</function> - </para> - </listitem> - </itemizedlist> - </listitem> - - <listitem> - <para> - <function>kmalloc(GFP_KERNEL)</function> - </para> - </listitem> - - <listitem> - <para> - <function>mutex_lock_interruptible()</function> and - <function>mutex_lock()</function> - </para> - <para> - There is a <function>mutex_trylock()</function> which does not - sleep. Still, it must not be used inside interrupt context since - its implementation is not safe for that. - <function>mutex_unlock()</function> will also never sleep. - It cannot be used in interrupt context either since a mutex - must be released by the same task that acquired it. - </para> - </listitem> - </itemizedlist> - </sect1> - - <sect1 id="dont-sleep"> - <title>Some Functions Which Don't Sleep</title> - - <para> - Some functions are safe to call from any context, or holding - almost any lock. - </para> - - <itemizedlist> - <listitem> - <para> - <function>printk()</function> - </para> - </listitem> - <listitem> - <para> - <function>kfree()</function> - </para> - </listitem> - <listitem> - <para> - <function>add_timer()</function> and <function>del_timer()</function> - </para> - </listitem> - </itemizedlist> - </sect1> - </chapter> - - <chapter id="apiref-mutex"> - <title>Mutex API reference</title> -!Iinclude/linux/mutex.h -!Ekernel/locking/mutex.c - </chapter> - - <chapter id="apiref-futex"> - <title>Futex API reference</title> -!Ikernel/futex.c - </chapter> - - <chapter id="references"> - <title>Further reading</title> - - <itemizedlist> - <listitem> - <para> - <filename>Documentation/locking/spinlocks.txt</filename>: - Linus Torvalds' spinlocking tutorial in the kernel sources. - </para> - </listitem> - - <listitem> - <para> - Unix Systems for Modern Architectures: Symmetric - Multiprocessing and Caching for Kernel Programmers: - </para> - - <para> - Curt Schimmel's very good introduction to kernel level - locking (not written for Linux, but nearly everything - applies). The book is expensive, but really worth every - penny to understand SMP locking. [ISBN: 0201633388] - </para> - </listitem> - </itemizedlist> - </chapter> - - <chapter id="thanks"> - <title>Thanks</title> - - <para> - Thanks to Telsa Gwynne for DocBooking, neatening and adding - style. - </para> - - <para> - Thanks to Martin Pool, Philipp Rumpf, Stephen Rothwell, Paul - Mackerras, Ruedi Aschwanden, Alan Cox, Manfred Spraul, Tim - Waugh, Pete Zaitcev, James Morris, Robert Love, Paul McKenney, - John Ashby for proofreading, correcting, flaming, commenting. - </para> - - <para> - Thanks to the cabal for having no influence on this document. - </para> - </chapter> - - <glossary id="glossary"> - <title>Glossary</title> - - <glossentry id="gloss-preemption"> - <glossterm>preemption</glossterm> - <glossdef> - <para> - Prior to 2.5, or when <symbol>CONFIG_PREEMPT</symbol> is - unset, processes in user context inside the kernel would not - preempt each other (ie. you had that CPU until you gave it up, - except for interrupts). With the addition of - <symbol>CONFIG_PREEMPT</symbol> in 2.5.4, this changed: when - in user context, higher priority tasks can "cut in": spinlocks - were changed to disable preemption, even on UP. - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-bh"> - <glossterm>bh</glossterm> - <glossdef> - <para> - Bottom Half: for historical reasons, functions with - '_bh' in them often now refer to any software interrupt, e.g. - <function>spin_lock_bh()</function> blocks any software interrupt - on the current CPU. Bottom halves are deprecated, and will - eventually be replaced by tasklets. Only one bottom half will be - running at any time. - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-hwinterrupt"> - <glossterm>Hardware Interrupt / Hardware IRQ</glossterm> - <glossdef> - <para> - Hardware interrupt request. <function>in_irq()</function> returns - <returnvalue>true</returnvalue> in a hardware interrupt handler. - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-interruptcontext"> - <glossterm>Interrupt Context</glossterm> - <glossdef> - <para> - Not user context: processing a hardware irq or software irq. - Indicated by the <function>in_interrupt()</function> macro - returning <returnvalue>true</returnvalue>. - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-smp"> - <glossterm><acronym>SMP</acronym></glossterm> - <glossdef> - <para> - Symmetric Multi-Processor: kernels compiled for multiple-CPU - machines. (CONFIG_SMP=y). - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-softirq"> - <glossterm>Software Interrupt / softirq</glossterm> - <glossdef> - <para> - Software interrupt handler. <function>in_irq()</function> returns - <returnvalue>false</returnvalue>; <function>in_softirq()</function> - returns <returnvalue>true</returnvalue>. Tasklets and softirqs - both fall into the category of 'software interrupts'. - </para> - <para> - Strictly speaking a softirq is one of up to 32 enumerated software - interrupts which can run on multiple CPUs at once. - Sometimes used to refer to tasklets as - well (ie. all software interrupts). - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-tasklet"> - <glossterm>tasklet</glossterm> - <glossdef> - <para> - A dynamically-registrable software interrupt, - which is guaranteed to only run on one CPU at a time. - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-timers"> - <glossterm>timer</glossterm> - <glossdef> - <para> - A dynamically-registrable software interrupt, which is run at - (or close to) a given time. When running, it is just like a - tasklet (in fact, they are called from the TIMER_SOFTIRQ). - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-up"> - <glossterm><acronym>UP</acronym></glossterm> - <glossdef> - <para> - Uni-Processor: Non-SMP. (CONFIG_SMP=n). - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-usercontext"> - <glossterm>User Context</glossterm> - <glossdef> - <para> - The kernel executing on behalf of a particular process (ie. a - system call or trap) or kernel thread. You can tell which - process with the <symbol>current</symbol> macro.) Not to - be confused with userspace. Can be interrupted by software or - hardware interrupts. - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-userspace"> - <glossterm>Userspace</glossterm> - <glossdef> - <para> - A process executing its own code outside the kernel. - </para> - </glossdef> - </glossentry> - - </glossary> -</book> - diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl deleted file mode 100644 index 856ac20bf367..000000000000 --- a/Documentation/DocBook/kgdb.tmpl +++ /dev/null @@ -1,918 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="kgdbOnLinux"> - <bookinfo> - <title>Using kgdb, kdb and the kernel debugger internals</title> - - <authorgroup> - <author> - <firstname>Jason</firstname> - <surname>Wessel</surname> - <affiliation> - <address> - <email>jason.wessel@windriver.com</email> - </address> - </affiliation> - </author> - </authorgroup> - <copyright> - <year>2008,2010</year> - <holder>Wind River Systems, Inc.</holder> - </copyright> - <copyright> - <year>2004-2005</year> - <holder>MontaVista Software, Inc.</holder> - </copyright> - <copyright> - <year>2004</year> - <holder>Amit S. Kale</holder> - </copyright> - - <legalnotice> - <para> - This file is licensed under the terms of the GNU General Public License - version 2. This program is licensed "as is" without any warranty of any - kind, whether express or implied. - </para> - - </legalnotice> - </bookinfo> - -<toc></toc> - <chapter id="Introduction"> - <title>Introduction</title> - <para> - The kernel has two different debugger front ends (kdb and kgdb) - which interface to the debug core. It is possible to use either - of the debugger front ends and dynamically transition between them - if you configure the kernel properly at compile and runtime. - </para> - <para> - Kdb is simplistic shell-style interface which you can use on a - system console with a keyboard or serial console. You can use it - to inspect memory, registers, process lists, dmesg, and even set - breakpoints to stop in a certain location. Kdb is not a source - level debugger, although you can set breakpoints and execute some - basic kernel run control. Kdb is mainly aimed at doing some - analysis to aid in development or diagnosing kernel problems. You - can access some symbols by name in kernel built-ins or in kernel - modules if the code was built - with <symbol>CONFIG_KALLSYMS</symbol>. - </para> - <para> - Kgdb is intended to be used as a source level debugger for the - Linux kernel. It is used along with gdb to debug a Linux kernel. - The expectation is that gdb can be used to "break in" to the - kernel to inspect memory, variables and look through call stack - information similar to the way an application developer would use - gdb to debug an application. It is possible to place breakpoints - in kernel code and perform some limited execution stepping. - </para> - <para> - Two machines are required for using kgdb. One of these machines is - a development machine and the other is the target machine. The - kernel to be debugged runs on the target machine. The development - machine runs an instance of gdb against the vmlinux file which - contains the symbols (not a boot image such as bzImage, zImage, - uImage...). In gdb the developer specifies the connection - parameters and connects to kgdb. The type of connection a - developer makes with gdb depends on the availability of kgdb I/O - modules compiled as built-ins or loadable kernel modules in the test - machine's kernel. - </para> - </chapter> - <chapter id="CompilingAKernel"> - <title>Compiling a kernel</title> - <para> - <itemizedlist> - <listitem><para>In order to enable compilation of kdb, you must first enable kgdb.</para></listitem> - <listitem><para>The kgdb test compile options are described in the kgdb test suite chapter.</para></listitem> - </itemizedlist> - </para> - <sect1 id="CompileKGDB"> - <title>Kernel config options for kgdb</title> - <para> - To enable <symbol>CONFIG_KGDB</symbol> you should look under - "Kernel hacking" / "Kernel debugging" and select "KGDB: kernel debugger". - </para> - <para> - While it is not a hard requirement that you have symbols in your - vmlinux file, gdb tends not to be very useful without the symbolic - data, so you will want to turn - on <symbol>CONFIG_DEBUG_INFO</symbol> which is called "Compile the - kernel with debug info" in the config menu. - </para> - <para> - It is advised, but not required, that you turn on the - <symbol>CONFIG_FRAME_POINTER</symbol> kernel option which is called "Compile the - kernel with frame pointers" in the config menu. This option - inserts code to into the compiled executable which saves the frame - information in registers or on the stack at different points which - allows a debugger such as gdb to more accurately construct - stack back traces while debugging the kernel. - </para> - <para> - If the architecture that you are using supports the kernel option - CONFIG_STRICT_KERNEL_RWX, you should consider turning it off. This - option will prevent the use of software breakpoints because it - marks certain regions of the kernel's memory space as read-only. - If kgdb supports it for the architecture you are using, you can - use hardware breakpoints if you desire to run with the - CONFIG_STRICT_KERNEL_RWX option turned on, else you need to turn off - this option. - </para> - <para> - Next you should choose one of more I/O drivers to interconnect - debugging host and debugged target. Early boot debugging requires - a KGDB I/O driver that supports early debugging and the driver - must be built into the kernel directly. Kgdb I/O driver - configuration takes place via kernel or module parameters which - you can learn more about in the in the section that describes the - parameter "kgdboc". - </para> - <para>Here is an example set of .config symbols to enable or - disable for kgdb: - <itemizedlist> - <listitem><para># CONFIG_STRICT_KERNEL_RWX is not set</para></listitem> - <listitem><para>CONFIG_FRAME_POINTER=y</para></listitem> - <listitem><para>CONFIG_KGDB=y</para></listitem> - <listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem> - </itemizedlist> - </para> - </sect1> - <sect1 id="CompileKDB"> - <title>Kernel config options for kdb</title> - <para>Kdb is quite a bit more complex than the simple gdbstub - sitting on top of the kernel's debug core. Kdb must implement a - shell, and also adds some helper functions in other parts of the - kernel, responsible for printing out interesting data such as what - you would see if you ran "lsmod", or "ps". In order to build kdb - into the kernel you follow the same steps as you would for kgdb. - </para> - <para>The main config option for kdb - is <symbol>CONFIG_KGDB_KDB</symbol> which is called "KGDB_KDB: - include kdb frontend for kgdb" in the config menu. In theory you - would have already also selected an I/O driver such as the - CONFIG_KGDB_SERIAL_CONSOLE interface if you plan on using kdb on a - serial port, when you were configuring kgdb. - </para> - <para>If you want to use a PS/2-style keyboard with kdb, you would - select CONFIG_KDB_KEYBOARD which is called "KGDB_KDB: keyboard as - input device" in the config menu. The CONFIG_KDB_KEYBOARD option - is not used for anything in the gdb interface to kgdb. The - CONFIG_KDB_KEYBOARD option only works with kdb. - </para> - <para>Here is an example set of .config symbols to enable/disable kdb: - <itemizedlist> - <listitem><para># CONFIG_STRICT_KERNEL_RWX is not set</para></listitem> - <listitem><para>CONFIG_FRAME_POINTER=y</para></listitem> - <listitem><para>CONFIG_KGDB=y</para></listitem> - <listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem> - <listitem><para>CONFIG_KGDB_KDB=y</para></listitem> - <listitem><para>CONFIG_KDB_KEYBOARD=y</para></listitem> - </itemizedlist> - </para> - </sect1> - </chapter> - <chapter id="kgdbKernelArgs"> - <title>Kernel Debugger Boot Arguments</title> - <para>This section describes the various runtime kernel - parameters that affect the configuration of the kernel debugger. - The following chapter covers using kdb and kgdb as well as - providing some examples of the configuration parameters.</para> - <sect1 id="kgdboc"> - <title>Kernel parameter: kgdboc</title> - <para>The kgdboc driver was originally an abbreviation meant to - stand for "kgdb over console". Today it is the primary mechanism - to configure how to communicate from gdb to kgdb as well as the - devices you want to use to interact with the kdb shell. - </para> - <para>For kgdb/gdb, kgdboc is designed to work with a single serial - port. It is intended to cover the circumstance where you want to - use a serial console as your primary console as well as using it to - perform kernel debugging. It is also possible to use kgdb on a - serial port which is not designated as a system console. Kgdboc - may be configured as a kernel built-in or a kernel loadable module. - You can only make use of <constant>kgdbwait</constant> and early - debugging if you build kgdboc into the kernel as a built-in. - </para> - <para>Optionally you can elect to activate kms (Kernel Mode - Setting) integration. When you use kms with kgdboc and you have a - video driver that has atomic mode setting hooks, it is possible to - enter the debugger on the graphics console. When the kernel - execution is resumed, the previous graphics mode will be restored. - This integration can serve as a useful tool to aid in diagnosing - crashes or doing analysis of memory with kdb while allowing the - full graphics console applications to run. - </para> - <sect2 id="kgdbocArgs"> - <title>kgdboc arguments</title> - <para>Usage: <constant>kgdboc=[kms][[,]kbd][[,]serial_device][,baud]</constant></para> - <para>The order listed above must be observed if you use any of the - optional configurations together. - </para> - <para>Abbreviations: - <itemizedlist> - <listitem><para>kms = Kernel Mode Setting</para></listitem> - <listitem><para>kbd = Keyboard</para></listitem> - </itemizedlist> - </para> - <para>You can configure kgdboc to use the keyboard, and/or a serial - device depending on if you are using kdb and/or kgdb, in one of the - following scenarios. The order listed above must be observed if - you use any of the optional configurations together. Using kms + - only gdb is generally not a useful combination.</para> - <sect3 id="kgdbocArgs1"> - <title>Using loadable module or built-in</title> - <para> - <orderedlist> - <listitem><para>As a kernel built-in:</para> - <para>Use the kernel boot argument: <constant>kgdboc=<tty-device>,[baud]</constant></para></listitem> - <listitem> - <para>As a kernel loadable module:</para> - <para>Use the command: <constant>modprobe kgdboc kgdboc=<tty-device>,[baud]</constant></para> - <para>Here are two examples of how you might format the kgdboc - string. The first is for an x86 target using the first serial port. - The second example is for the ARM Versatile AB using the second - serial port. - <orderedlist> - <listitem><para><constant>kgdboc=ttyS0,115200</constant></para></listitem> - <listitem><para><constant>kgdboc=ttyAMA1,115200</constant></para></listitem> - </orderedlist> - </para> - </listitem> - </orderedlist></para> - </sect3> - <sect3 id="kgdbocArgs2"> - <title>Configure kgdboc at runtime with sysfs</title> - <para>At run time you can enable or disable kgdboc by echoing a - parameters into the sysfs. Here are two examples:</para> - <orderedlist> - <listitem><para>Enable kgdboc on ttyS0</para> - <para><constant>echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem> - <listitem><para>Disable kgdboc</para> - <para><constant>echo "" > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem> - </orderedlist> - <para>NOTE: You do not need to specify the baud if you are - configuring the console on tty which is already configured or - open.</para> - </sect3> - <sect3 id="kgdbocArgs3"> - <title>More examples</title> - <para>You can configure kgdboc to use the keyboard, and/or a serial device - depending on if you are using kdb and/or kgdb, in one of the - following scenarios. - <orderedlist> - <listitem><para>kdb and kgdb over only a serial port</para> - <para><constant>kgdboc=<serial_device>[,baud]</constant></para> - <para>Example: <constant>kgdboc=ttyS0,115200</constant></para> - </listitem> - <listitem><para>kdb and kgdb with keyboard and a serial port</para> - <para><constant>kgdboc=kbd,<serial_device>[,baud]</constant></para> - <para>Example: <constant>kgdboc=kbd,ttyS0,115200</constant></para> - </listitem> - <listitem><para>kdb with a keyboard</para> - <para><constant>kgdboc=kbd</constant></para> - </listitem> - <listitem><para>kdb with kernel mode setting</para> - <para><constant>kgdboc=kms,kbd</constant></para> - </listitem> - <listitem><para>kdb with kernel mode setting and kgdb over a serial port</para> - <para><constant>kgdboc=kms,kbd,ttyS0,115200</constant></para> - </listitem> - </orderedlist> - </para> - <para>NOTE: Kgdboc does not support interrupting the target via the - gdb remote protocol. You must manually send a sysrq-g unless you - have a proxy that splits console output to a terminal program. - A console proxy has a separate TCP port for the debugger and a separate - TCP port for the "human" console. The proxy can take care of sending - the sysrq-g for you. - </para> - <para>When using kgdboc with no debugger proxy, you can end up - connecting the debugger at one of two entry points. If an - exception occurs after you have loaded kgdboc, a message should - print on the console stating it is waiting for the debugger. In - this case you disconnect your terminal program and then connect the - debugger in its place. If you want to interrupt the target system - and forcibly enter a debug session you have to issue a Sysrq - sequence and then type the letter <constant>g</constant>. Then - you disconnect the terminal session and connect gdb. Your options - if you don't like this are to hack gdb to send the sysrq-g for you - as well as on the initial connect, or to use a debugger proxy that - allows an unmodified gdb to do the debugging. - </para> - </sect3> - </sect2> - </sect1> - <sect1 id="kgdbwait"> - <title>Kernel parameter: kgdbwait</title> - <para> - The Kernel command line option <constant>kgdbwait</constant> makes - kgdb wait for a debugger connection during booting of a kernel. You - can only use this option if you compiled a kgdb I/O driver into the - kernel and you specified the I/O driver configuration as a kernel - command line option. The kgdbwait parameter should always follow the - configuration parameter for the kgdb I/O driver in the kernel - command line else the I/O driver will not be configured prior to - asking the kernel to use it to wait. - </para> - <para> - The kernel will stop and wait as early as the I/O driver and - architecture allows when you use this option. If you build the - kgdb I/O driver as a loadable kernel module kgdbwait will not do - anything. - </para> - </sect1> - <sect1 id="kgdbcon"> - <title>Kernel parameter: kgdbcon</title> - <para> The kgdbcon feature allows you to see printk() messages - inside gdb while gdb is connected to the kernel. Kdb does not make - use of the kgdbcon feature. - </para> - <para>Kgdb supports using the gdb serial protocol to send console - messages to the debugger when the debugger is connected and running. - There are two ways to activate this feature. - <orderedlist> - <listitem><para>Activate with the kernel command line option:</para> - <para><constant>kgdbcon</constant></para> - </listitem> - <listitem><para>Use sysfs before configuring an I/O driver</para> - <para> - <constant>echo 1 > /sys/module/kgdb/parameters/kgdb_use_con</constant> - </para> - <para> - NOTE: If you do this after you configure the kgdb I/O driver, the - setting will not take effect until the next point the I/O is - reconfigured. - </para> - </listitem> - </orderedlist> - </para> - <para>IMPORTANT NOTE: You cannot use kgdboc + kgdbcon on a tty that is an - active system console. An example of incorrect usage is <constant>console=ttyS0,115200 kgdboc=ttyS0 kgdbcon</constant> - </para> - <para>It is possible to use this option with kgdboc on a tty that is not a system console. - </para> - </sect1> - <sect1 id="kgdbreboot"> - <title>Run time parameter: kgdbreboot</title> - <para> The kgdbreboot feature allows you to change how the debugger - deals with the reboot notification. You have 3 choices for the - behavior. The default behavior is always set to 0.</para> - <orderedlist> - <listitem><para>echo -1 > /sys/module/debug_core/parameters/kgdbreboot</para> - <para>Ignore the reboot notification entirely.</para> - </listitem> - <listitem><para>echo 0 > /sys/module/debug_core/parameters/kgdbreboot</para> - <para>Send the detach message to any attached debugger client.</para> - </listitem> - <listitem><para>echo 1 > /sys/module/debug_core/parameters/kgdbreboot</para> - <para>Enter the debugger on reboot notify.</para> - </listitem> - </orderedlist> - </sect1> - </chapter> - <chapter id="usingKDB"> - <title>Using kdb</title> - <para> - </para> - <sect1 id="quickKDBserial"> - <title>Quick start for kdb on a serial port</title> - <para>This is a quick example of how to use kdb.</para> - <para><orderedlist> - <listitem><para>Configure kgdboc at boot using kernel parameters: - <itemizedlist> - <listitem><para><constant>console=ttyS0,115200 kgdboc=ttyS0,115200</constant></para></listitem> - </itemizedlist></para> - <para>OR</para> - <para>Configure kgdboc after the kernel has booted; assuming you are using a serial port console: - <itemizedlist> - <listitem><para><constant>echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem> - </itemizedlist> - </para> - </listitem> - <listitem><para>Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.</para> - <itemizedlist> - <listitem><para>When logged in as root or with a super user session you can run:</para> - <para><constant>echo g > /proc/sysrq-trigger</constant></para></listitem> - <listitem><para>Example using minicom 2.2</para> - <para>Press: <constant>Control-a</constant></para> - <para>Press: <constant>f</constant></para> - <para>Press: <constant>g</constant></para> - </listitem> - <listitem><para>When you have telneted to a terminal server that supports sending a remote break</para> - <para>Press: <constant>Control-]</constant></para> - <para>Type in:<constant>send break</constant></para> - <para>Press: <constant>Enter</constant></para> - <para>Press: <constant>g</constant></para> - </listitem> - </itemizedlist> - </listitem> - <listitem><para>From the kdb prompt you can run the "help" command to see a complete list of the commands that are available.</para> - <para>Some useful commands in kdb include: - <itemizedlist> - <listitem><para>lsmod -- Shows where kernel modules are loaded</para></listitem> - <listitem><para>ps -- Displays only the active processes</para></listitem> - <listitem><para>ps A -- Shows all the processes</para></listitem> - <listitem><para>summary -- Shows kernel version info and memory usage</para></listitem> - <listitem><para>bt -- Get a backtrace of the current process using dump_stack()</para></listitem> - <listitem><para>dmesg -- View the kernel syslog buffer</para></listitem> - <listitem><para>go -- Continue the system</para></listitem> - </itemizedlist> - </para> - </listitem> - <listitem> - <para>When you are done using kdb you need to consider rebooting the - system or using the "go" command to resuming normal kernel - execution. If you have paused the kernel for a lengthy period of - time, applications that rely on timely networking or anything to do - with real wall clock time could be adversely affected, so you - should take this into consideration when using the kernel - debugger.</para> - </listitem> - </orderedlist></para> - </sect1> - <sect1 id="quickKDBkeyboard"> - <title>Quick start for kdb using a keyboard connected console</title> - <para>This is a quick example of how to use kdb with a keyboard.</para> - <para><orderedlist> - <listitem><para>Configure kgdboc at boot using kernel parameters: - <itemizedlist> - <listitem><para><constant>kgdboc=kbd</constant></para></listitem> - </itemizedlist></para> - <para>OR</para> - <para>Configure kgdboc after the kernel has booted: - <itemizedlist> - <listitem><para><constant>echo kbd > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem> - </itemizedlist> - </para> - </listitem> - <listitem><para>Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.</para> - <itemizedlist> - <listitem><para>When logged in as root or with a super user session you can run:</para> - <para><constant>echo g > /proc/sysrq-trigger</constant></para></listitem> - <listitem><para>Example using a laptop keyboard</para> - <para>Press and hold down: <constant>Alt</constant></para> - <para>Press and hold down: <constant>Fn</constant></para> - <para>Press and release the key with the label: <constant>SysRq</constant></para> - <para>Release: <constant>Fn</constant></para> - <para>Press and release: <constant>g</constant></para> - <para>Release: <constant>Alt</constant></para> - </listitem> - <listitem><para>Example using a PS/2 101-key keyboard</para> - <para>Press and hold down: <constant>Alt</constant></para> - <para>Press and release the key with the label: <constant>SysRq</constant></para> - <para>Press and release: <constant>g</constant></para> - <para>Release: <constant>Alt</constant></para> - </listitem> - </itemizedlist> - </listitem> - <listitem> - <para>Now type in a kdb command such as "help", "dmesg", "bt" or "go" to continue kernel execution.</para> - </listitem> - </orderedlist></para> - </sect1> - </chapter> - <chapter id="EnableKGDB"> - <title>Using kgdb / gdb</title> - <para>In order to use kgdb you must activate it by passing - configuration information to one of the kgdb I/O drivers. If you - do not pass any configuration information kgdb will not do anything - at all. Kgdb will only actively hook up to the kernel trap hooks - if a kgdb I/O driver is loaded and configured. If you unconfigure - a kgdb I/O driver, kgdb will unregister all the kernel hook points. - </para> - <para> All kgdb I/O drivers can be reconfigured at run time, if - <symbol>CONFIG_SYSFS</symbol> and <symbol>CONFIG_MODULES</symbol> - are enabled, by echo'ing a new config string to - <constant>/sys/module/<driver>/parameter/<option></constant>. - The driver can be unconfigured by passing an empty string. You cannot - change the configuration while the debugger is attached. Make sure - to detach the debugger with the <constant>detach</constant> command - prior to trying to unconfigure a kgdb I/O driver. - </para> - <sect1 id="ConnectingGDB"> - <title>Connecting with gdb to a serial port</title> - <orderedlist> - <listitem><para>Configure kgdboc</para> - <para>Configure kgdboc at boot using kernel parameters: - <itemizedlist> - <listitem><para><constant>kgdboc=ttyS0,115200</constant></para></listitem> - </itemizedlist></para> - <para>OR</para> - <para>Configure kgdboc after the kernel has booted: - <itemizedlist> - <listitem><para><constant>echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem> - </itemizedlist></para> - </listitem> - <listitem> - <para>Stop kernel execution (break into the debugger)</para> - <para>In order to connect to gdb via kgdboc, the kernel must - first be stopped. There are several ways to stop the kernel which - include using kgdbwait as a boot argument, via a sysrq-g, or running - the kernel until it takes an exception where it waits for the - debugger to attach. - <itemizedlist> - <listitem><para>When logged in as root or with a super user session you can run:</para> - <para><constant>echo g > /proc/sysrq-trigger</constant></para></listitem> - <listitem><para>Example using minicom 2.2</para> - <para>Press: <constant>Control-a</constant></para> - <para>Press: <constant>f</constant></para> - <para>Press: <constant>g</constant></para> - </listitem> - <listitem><para>When you have telneted to a terminal server that supports sending a remote break</para> - <para>Press: <constant>Control-]</constant></para> - <para>Type in:<constant>send break</constant></para> - <para>Press: <constant>Enter</constant></para> - <para>Press: <constant>g</constant></para> - </listitem> - </itemizedlist> - </para> - </listitem> - <listitem> - <para>Connect from gdb</para> - <para> - Example (using a directly connected port): - </para> - <programlisting> - % gdb ./vmlinux - (gdb) set remotebaud 115200 - (gdb) target remote /dev/ttyS0 - </programlisting> - <para> - Example (kgdb to a terminal server on TCP port 2012): - </para> - <programlisting> - % gdb ./vmlinux - (gdb) target remote 192.168.2.2:2012 - </programlisting> - <para> - Once connected, you can debug a kernel the way you would debug an - application program. - </para> - <para> - If you are having problems connecting or something is going - seriously wrong while debugging, it will most often be the case - that you want to enable gdb to be verbose about its target - communications. You do this prior to issuing the <constant>target - remote</constant> command by typing in: <constant>set debug remote 1</constant> - </para> - </listitem> - </orderedlist> - <para>Remember if you continue in gdb, and need to "break in" again, - you need to issue an other sysrq-g. It is easy to create a simple - entry point by putting a breakpoint at <constant>sys_sync</constant> - and then you can run "sync" from a shell or script to break into the - debugger.</para> - </sect1> - </chapter> - <chapter id="switchKdbKgdb"> - <title>kgdb and kdb interoperability</title> - <para>It is possible to transition between kdb and kgdb dynamically. - The debug core will remember which you used the last time and - automatically start in the same mode.</para> - <sect1> - <title>Switching between kdb and kgdb</title> - <sect2> - <title>Switching from kgdb to kdb</title> - <para> - There are two ways to switch from kgdb to kdb: you can use gdb to - issue a maintenance packet, or you can blindly type the command $3#33. - Whenever the kernel debugger stops in kgdb mode it will print the - message <constant>KGDB or $3#33 for KDB</constant>. It is important - to note that you have to type the sequence correctly in one pass. - You cannot type a backspace or delete because kgdb will interpret - that as part of the debug stream. - <orderedlist> - <listitem><para>Change from kgdb to kdb by blindly typing:</para> - <para><constant>$3#33</constant></para></listitem> - <listitem><para>Change from kgdb to kdb with gdb</para> - <para><constant>maintenance packet 3</constant></para> - <para>NOTE: Now you must kill gdb. Typically you press control-z and - issue the command: kill -9 %</para></listitem> - </orderedlist> - </para> - </sect2> - <sect2> - <title>Change from kdb to kgdb</title> - <para>There are two ways you can change from kdb to kgdb. You can - manually enter kgdb mode by issuing the kgdb command from the kdb - shell prompt, or you can connect gdb while the kdb shell prompt is - active. The kdb shell looks for the typical first commands that gdb - would issue with the gdb remote protocol and if it sees one of those - commands it automatically changes into kgdb mode.</para> - <orderedlist> - <listitem><para>From kdb issue the command:</para> - <para><constant>kgdb</constant></para> - <para>Now disconnect your terminal program and connect gdb in its place</para></listitem> - <listitem><para>At the kdb prompt, disconnect the terminal program and connect gdb in its place.</para></listitem> - </orderedlist> - </sect2> - </sect1> - <sect1> - <title>Running kdb commands from gdb</title> - <para>It is possible to run a limited set of kdb commands from gdb, - using the gdb monitor command. You don't want to execute any of the - run control or breakpoint operations, because it can disrupt the - state of the kernel debugger. You should be using gdb for - breakpoints and run control operations if you have gdb connected. - The more useful commands to run are things like lsmod, dmesg, ps or - possibly some of the memory information commands. To see all the kdb - commands you can run <constant>monitor help</constant>.</para> - <para>Example: - <informalexample><programlisting> -(gdb) monitor ps -1 idle process (state I) and -27 sleeping system daemon (state M) processes suppressed, -use 'ps A' to see all. -Task Addr Pid Parent [*] cpu State Thread Command - -0xc78291d0 1 0 0 0 S 0xc7829404 init -0xc7954150 942 1 0 0 S 0xc7954384 dropbear -0xc78789c0 944 1 0 0 S 0xc7878bf4 sh -(gdb) - </programlisting></informalexample> - </para> - </sect1> - </chapter> - <chapter id="KGDBTestSuite"> - <title>kgdb Test Suite</title> - <para> - When kgdb is enabled in the kernel config you can also elect to - enable the config parameter KGDB_TESTS. Turning this on will - enable a special kgdb I/O module which is designed to test the - kgdb internal functions. - </para> - <para> - The kgdb tests are mainly intended for developers to test the kgdb - internals as well as a tool for developing a new kgdb architecture - specific implementation. These tests are not really for end users - of the Linux kernel. The primary source of documentation would be - to look in the drivers/misc/kgdbts.c file. - </para> - <para> - The kgdb test suite can also be configured at compile time to run - the core set of tests by setting the kernel config parameter - KGDB_TESTS_ON_BOOT. This particular option is aimed at automated - regression testing and does not require modifying the kernel boot - config arguments. If this is turned on, the kgdb test suite can - be disabled by specifying "kgdbts=" as a kernel boot argument. - </para> - </chapter> - <chapter id="CommonBackEndReq"> - <title>Kernel Debugger Internals</title> - <sect1 id="kgdbArchitecture"> - <title>Architecture Specifics</title> - <para> - The kernel debugger is organized into a number of components: - <orderedlist> - <listitem><para>The debug core</para> - <para> - The debug core is found in kernel/debugger/debug_core.c. It contains: - <itemizedlist> - <listitem><para>A generic OS exception handler which includes - sync'ing the processors into a stopped state on an multi-CPU - system.</para></listitem> - <listitem><para>The API to talk to the kgdb I/O drivers</para></listitem> - <listitem><para>The API to make calls to the arch-specific kgdb implementation</para></listitem> - <listitem><para>The logic to perform safe memory reads and writes to memory while using the debugger</para></listitem> - <listitem><para>A full implementation for software breakpoints unless overridden by the arch</para></listitem> - <listitem><para>The API to invoke either the kdb or kgdb frontend to the debug core.</para></listitem> - <listitem><para>The structures and callback API for atomic kernel mode setting.</para> - <para>NOTE: kgdboc is where the kms callbacks are invoked.</para></listitem> - </itemizedlist> - </para> - </listitem> - <listitem><para>kgdb arch-specific implementation</para> - <para> - This implementation is generally found in arch/*/kernel/kgdb.c. - As an example, arch/x86/kernel/kgdb.c contains the specifics to - implement HW breakpoint as well as the initialization to - dynamically register and unregister for the trap handlers on - this architecture. The arch-specific portion implements: - <itemizedlist> - <listitem><para>contains an arch-specific trap catcher which - invokes kgdb_handle_exception() to start kgdb about doing its - work</para></listitem> - <listitem><para>translation to and from gdb specific packet format to pt_regs</para></listitem> - <listitem><para>Registration and unregistration of architecture specific trap hooks</para></listitem> - <listitem><para>Any special exception handling and cleanup</para></listitem> - <listitem><para>NMI exception handling and cleanup</para></listitem> - <listitem><para>(optional) HW breakpoints</para></listitem> - </itemizedlist> - </para> - </listitem> - <listitem><para>gdbstub frontend (aka kgdb)</para> - <para>The gdbstub is located in kernel/debug/gdbstub.c. It contains:</para> - <itemizedlist> - <listitem><para>All the logic to implement the gdb serial protocol</para></listitem> - </itemizedlist> - </listitem> - <listitem><para>kdb frontend</para> - <para>The kdb debugger shell is broken down into a number of - components. The kdb core is located in kernel/debug/kdb. There - are a number of helper functions in some of the other kernel - components to make it possible for kdb to examine and report - information about the kernel without taking locks that could - cause a kernel deadlock. The kdb core contains implements the following functionality.</para> - <itemizedlist> - <listitem><para>A simple shell</para></listitem> - <listitem><para>The kdb core command set</para></listitem> - <listitem><para>A registration API to register additional kdb shell commands.</para> - <itemizedlist> - <listitem><para>A good example of a self-contained kdb module - is the "ftdump" command for dumping the ftrace buffer. See: - kernel/trace/trace_kdb.c</para></listitem> - <listitem><para>For an example of how to dynamically register - a new kdb command you can build the kdb_hello.ko kernel module - from samples/kdb/kdb_hello.c. To build this example you can - set CONFIG_SAMPLES=y and CONFIG_SAMPLE_KDB=m in your kernel - config. Later run "modprobe kdb_hello" and the next time you - enter the kdb shell, you can run the "hello" - command.</para></listitem> - </itemizedlist></listitem> - <listitem><para>The implementation for kdb_printf() which - emits messages directly to I/O drivers, bypassing the kernel - log.</para></listitem> - <listitem><para>SW / HW breakpoint management for the kdb shell</para></listitem> - </itemizedlist> - </listitem> - <listitem><para>kgdb I/O driver</para> - <para> - Each kgdb I/O driver has to provide an implementation for the following: - <itemizedlist> - <listitem><para>configuration via built-in or module</para></listitem> - <listitem><para>dynamic configuration and kgdb hook registration calls</para></listitem> - <listitem><para>read and write character interface</para></listitem> - <listitem><para>A cleanup handler for unconfiguring from the kgdb core</para></listitem> - <listitem><para>(optional) Early debug methodology</para></listitem> - </itemizedlist> - Any given kgdb I/O driver has to operate very closely with the - hardware and must do it in such a way that does not enable - interrupts or change other parts of the system context without - completely restoring them. The kgdb core will repeatedly "poll" - a kgdb I/O driver for characters when it needs input. The I/O - driver is expected to return immediately if there is no data - available. Doing so allows for the future possibility to touch - watchdog hardware in such a way as to have a target system not - reset when these are enabled. - </para> - </listitem> - </orderedlist> - </para> - <para> - If you are intent on adding kgdb architecture specific support - for a new architecture, the architecture should define - <constant>HAVE_ARCH_KGDB</constant> in the architecture specific - Kconfig file. This will enable kgdb for the architecture, and - at that point you must create an architecture specific kgdb - implementation. - </para> - <para> - There are a few flags which must be set on every architecture in - their <asm/kgdb.h> file. These are: - <itemizedlist> - <listitem> - <para> - NUMREGBYTES: The size in bytes of all of the registers, so - that we can ensure they will all fit into a packet. - </para> - </listitem> - <listitem> - <para> - BUFMAX: The size in bytes of the buffer GDB will read into. - This must be larger than NUMREGBYTES. - </para> - </listitem> - <listitem> - <para> - CACHE_FLUSH_IS_SAFE: Set to 1 if it is always safe to call - flush_cache_range or flush_icache_range. On some architectures, - these functions may not be safe to call on SMP since we keep other - CPUs in a holding pattern. - </para> - </listitem> - </itemizedlist> - </para> - <para> - There are also the following functions for the common backend, - found in kernel/kgdb.c, that must be supplied by the - architecture-specific backend unless marked as (optional), in - which case a default function maybe used if the architecture - does not need to provide a specific implementation. - </para> -!Iinclude/linux/kgdb.h - </sect1> - <sect1 id="kgdbocDesign"> - <title>kgdboc internals</title> - <sect2> - <title>kgdboc and uarts</title> - <para> - The kgdboc driver is actually a very thin driver that relies on the - underlying low level to the hardware driver having "polling hooks" - to which the tty driver is attached. In the initial - implementation of kgdboc the serial_core was changed to expose a - low level UART hook for doing polled mode reading and writing of a - single character while in an atomic context. When kgdb makes an I/O - request to the debugger, kgdboc invokes a callback in the serial - core which in turn uses the callback in the UART driver.</para> - <para> - When using kgdboc with a UART, the UART driver must implement two callbacks in the <constant>struct uart_ops</constant>. Example from drivers/8250.c:<programlisting> -#ifdef CONFIG_CONSOLE_POLL - .poll_get_char = serial8250_get_poll_char, - .poll_put_char = serial8250_put_poll_char, -#endif - </programlisting> - Any implementation specifics around creating a polling driver use the - <constant>#ifdef CONFIG_CONSOLE_POLL</constant>, as shown above. - Keep in mind that polling hooks have to be implemented in such a way - that they can be called from an atomic context and have to restore - the state of the UART chip on return such that the system can return - to normal when the debugger detaches. You need to be very careful - with any kind of lock you consider, because failing here is most likely - going to mean pressing the reset button. - </para> - </sect2> - <sect2 id="kgdbocKbd"> - <title>kgdboc and keyboards</title> - <para>The kgdboc driver contains logic to configure communications - with an attached keyboard. The keyboard infrastructure is only - compiled into the kernel when CONFIG_KDB_KEYBOARD=y is set in the - kernel configuration.</para> - <para>The core polled keyboard driver driver for PS/2 type keyboards - is in drivers/char/kdb_keyboard.c. This driver is hooked into the - debug core when kgdboc populates the callback in the array - called <constant>kdb_poll_funcs[]</constant>. The - kdb_get_kbd_char() is the top-level function which polls hardware - for single character input. - </para> - </sect2> - <sect2 id="kgdbocKms"> - <title>kgdboc and kms</title> - <para>The kgdboc driver contains logic to request the graphics - display to switch to a text context when you are using - "kgdboc=kms,kbd", provided that you have a video driver which has a - frame buffer console and atomic kernel mode setting support.</para> - <para> - Every time the kernel - debugger is entered it calls kgdboc_pre_exp_handler() which in turn - calls con_debug_enter() in the virtual console layer. On resuming kernel - execution, the kernel debugger calls kgdboc_post_exp_handler() which - in turn calls con_debug_leave().</para> - <para>Any video driver that wants to be compatible with the kernel - debugger and the atomic kms callbacks must implement the - mode_set_base_atomic, fb_debug_enter and fb_debug_leave operations. - For the fb_debug_enter and fb_debug_leave the option exists to use - the generic drm fb helper functions or implement something custom for - the hardware. The following example shows the initialization of the - .mode_set_base_atomic operation in - drivers/gpu/drm/i915/intel_display.c: - <informalexample> - <programlisting> -static const struct drm_crtc_helper_funcs intel_helper_funcs = { -[...] - .mode_set_base_atomic = intel_pipe_set_base_atomic, -[...] -}; - </programlisting> - </informalexample> - </para> - <para>Here is an example of how the i915 driver initializes the fb_debug_enter and fb_debug_leave functions to use the generic drm helpers in - drivers/gpu/drm/i915/intel_fb.c: - <informalexample> - <programlisting> -static struct fb_ops intelfb_ops = { -[...] - .fb_debug_enter = drm_fb_helper_debug_enter, - .fb_debug_leave = drm_fb_helper_debug_leave, -[...] -}; - </programlisting> - </informalexample> - </para> - </sect2> - </sect1> - </chapter> - <chapter id="credits"> - <title>Credits</title> - <para> - The following people have contributed to this document: - <orderedlist> - <listitem><para>Amit Kale<email>amitkale@linsyssoft.com</email></para></listitem> - <listitem><para>Tom Rini<email>trini@kernel.crashing.org</email></para></listitem> - </orderedlist> - In March 2008 this document was completely rewritten by: - <itemizedlist> - <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem> - </itemizedlist> - In Jan 2010 this document was updated to include kdb. - <itemizedlist> - <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem> - </itemizedlist> - </para> - </chapter> -</book> - diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl deleted file mode 100644 index 0320910b866d..000000000000 --- a/Documentation/DocBook/libata.tmpl +++ /dev/null @@ -1,1625 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="libataDevGuide"> - <bookinfo> - <title>libATA Developer's Guide</title> - - <authorgroup> - <author> - <firstname>Jeff</firstname> - <surname>Garzik</surname> - </author> - </authorgroup> - - <copyright> - <year>2003-2006</year> - <holder>Jeff Garzik</holder> - </copyright> - - <legalnotice> - <para> - The contents of this file are subject to the Open - Software License version 1.1 that can be found at - <ulink url="http://fedoraproject.org/wiki/Licensing:OSL1.1">http://fedoraproject.org/wiki/Licensing:OSL1.1</ulink> - and is included herein by reference. - </para> - - <para> - Alternatively, the contents of this file may be used under the terms - of the GNU General Public License version 2 (the "GPL") as distributed - in the kernel source COPYING file, in which case the provisions of - the GPL are applicable instead of the above. If you wish to allow - the use of your version of this file only under the terms of the - GPL and not to allow others to use your version of this file under - the OSL, indicate your decision by deleting the provisions above and - replace them with the notice and other provisions required by the GPL. - If you do not delete the provisions above, a recipient may use your - version of this file under either the OSL or the GPL. - </para> - - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="libataIntroduction"> - <title>Introduction</title> - <para> - libATA is a library used inside the Linux kernel to support ATA host - controllers and devices. libATA provides an ATA driver API, class - transports for ATA and ATAPI devices, and SCSI<->ATA translation - for ATA devices according to the T10 SAT specification. - </para> - <para> - This Guide documents the libATA driver API, library functions, library - internals, and a couple sample ATA low-level drivers. - </para> - </chapter> - - <chapter id="libataDriverApi"> - <title>libata Driver API</title> - <para> - struct ata_port_operations is defined for every low-level libata - hardware driver, and it controls how the low-level driver - interfaces with the ATA and SCSI layers. - </para> - <para> - FIS-based drivers will hook into the system with ->qc_prep() and - ->qc_issue() high-level hooks. Hardware which behaves in a manner - similar to PCI IDE hardware may utilize several generic helpers, - defining at a bare minimum the bus I/O addresses of the ATA shadow - register blocks. - </para> - <sect1> - <title>struct ata_port_operations</title> - - <sect2><title>Disable ATA port</title> - <programlisting> -void (*port_disable) (struct ata_port *); - </programlisting> - - <para> - Called from ata_bus_probe() error path, as well as when - unregistering from the SCSI module (rmmod, hot unplug). - This function should do whatever needs to be done to take the - port out of use. In most cases, ata_port_disable() can be used - as this hook. - </para> - <para> - Called from ata_bus_probe() on a failed probe. - Called from ata_scsi_release(). - </para> - - </sect2> - - <sect2><title>Post-IDENTIFY device configuration</title> - <programlisting> -void (*dev_config) (struct ata_port *, struct ata_device *); - </programlisting> - - <para> - Called after IDENTIFY [PACKET] DEVICE is issued to each device - found. Typically used to apply device-specific fixups prior to - issue of SET FEATURES - XFER MODE, and prior to operation. - </para> - <para> - This entry may be specified as NULL in ata_port_operations. - </para> - - </sect2> - - <sect2><title>Set PIO/DMA mode</title> - <programlisting> -void (*set_piomode) (struct ata_port *, struct ata_device *); -void (*set_dmamode) (struct ata_port *, struct ata_device *); -void (*post_set_mode) (struct ata_port *); -unsigned int (*mode_filter) (struct ata_port *, struct ata_device *, unsigned int); - </programlisting> - - <para> - Hooks called prior to the issue of SET FEATURES - XFER MODE - command. The optional ->mode_filter() hook is called when libata - has built a mask of the possible modes. This is passed to the - ->mode_filter() function which should return a mask of valid modes - after filtering those unsuitable due to hardware limits. It is not - valid to use this interface to add modes. - </para> - <para> - dev->pio_mode and dev->dma_mode are guaranteed to be valid when - ->set_piomode() and when ->set_dmamode() is called. The timings for - any other drive sharing the cable will also be valid at this point. - That is the library records the decisions for the modes of each - drive on a channel before it attempts to set any of them. - </para> - <para> - ->post_set_mode() is - called unconditionally, after the SET FEATURES - XFER MODE - command completes successfully. - </para> - - <para> - ->set_piomode() is always called (if present), but - ->set_dma_mode() is only called if DMA is possible. - </para> - - </sect2> - - <sect2><title>Taskfile read/write</title> - <programlisting> -void (*sff_tf_load) (struct ata_port *ap, struct ata_taskfile *tf); -void (*sff_tf_read) (struct ata_port *ap, struct ata_taskfile *tf); - </programlisting> - - <para> - ->tf_load() is called to load the given taskfile into hardware - registers / DMA buffers. ->tf_read() is called to read the - hardware registers / DMA buffers, to obtain the current set of - taskfile register values. - Most drivers for taskfile-based hardware (PIO or MMIO) use - ata_sff_tf_load() and ata_sff_tf_read() for these hooks. - </para> - - </sect2> - - <sect2><title>PIO data read/write</title> - <programlisting> -void (*sff_data_xfer) (struct ata_device *, unsigned char *, unsigned int, int); - </programlisting> - - <para> -All bmdma-style drivers must implement this hook. This is the low-level -operation that actually copies the data bytes during a PIO data -transfer. -Typically the driver will choose one of ata_sff_data_xfer_noirq(), -ata_sff_data_xfer(), or ata_sff_data_xfer32(). - </para> - - </sect2> - - <sect2><title>ATA command execute</title> - <programlisting> -void (*sff_exec_command)(struct ata_port *ap, struct ata_taskfile *tf); - </programlisting> - - <para> - causes an ATA command, previously loaded with - ->tf_load(), to be initiated in hardware. - Most drivers for taskfile-based hardware use ata_sff_exec_command() - for this hook. - </para> - - </sect2> - - <sect2><title>Per-cmd ATAPI DMA capabilities filter</title> - <programlisting> -int (*check_atapi_dma) (struct ata_queued_cmd *qc); - </programlisting> - - <para> -Allow low-level driver to filter ATA PACKET commands, returning a status -indicating whether or not it is OK to use DMA for the supplied PACKET -command. - </para> - <para> - This hook may be specified as NULL, in which case libata will - assume that atapi dma can be supported. - </para> - - </sect2> - - <sect2><title>Read specific ATA shadow registers</title> - <programlisting> -u8 (*sff_check_status)(struct ata_port *ap); -u8 (*sff_check_altstatus)(struct ata_port *ap); - </programlisting> - - <para> - Reads the Status/AltStatus ATA shadow register from - hardware. On some hardware, reading the Status register has - the side effect of clearing the interrupt condition. - Most drivers for taskfile-based hardware use - ata_sff_check_status() for this hook. - </para> - - </sect2> - - <sect2><title>Write specific ATA shadow register</title> - <programlisting> -void (*sff_set_devctl)(struct ata_port *ap, u8 ctl); - </programlisting> - - <para> - Write the device control ATA shadow register to the hardware. - Most drivers don't need to define this. - </para> - - </sect2> - - <sect2><title>Select ATA device on bus</title> - <programlisting> -void (*sff_dev_select)(struct ata_port *ap, unsigned int device); - </programlisting> - - <para> - Issues the low-level hardware command(s) that causes one of N - hardware devices to be considered 'selected' (active and - available for use) on the ATA bus. This generally has no - meaning on FIS-based devices. - </para> - <para> - Most drivers for taskfile-based hardware use - ata_sff_dev_select() for this hook. - </para> - - </sect2> - - <sect2><title>Private tuning method</title> - <programlisting> -void (*set_mode) (struct ata_port *ap); - </programlisting> - - <para> - By default libata performs drive and controller tuning in - accordance with the ATA timing rules and also applies blacklists - and cable limits. Some controllers need special handling and have - custom tuning rules, typically raid controllers that use ATA - commands but do not actually do drive timing. - </para> - - <warning> - <para> - This hook should not be used to replace the standard controller - tuning logic when a controller has quirks. Replacing the default - tuning logic in that case would bypass handling for drive and - bridge quirks that may be important to data reliability. If a - controller needs to filter the mode selection it should use the - mode_filter hook instead. - </para> - </warning> - - </sect2> - - <sect2><title>Control PCI IDE BMDMA engine</title> - <programlisting> -void (*bmdma_setup) (struct ata_queued_cmd *qc); -void (*bmdma_start) (struct ata_queued_cmd *qc); -void (*bmdma_stop) (struct ata_port *ap); -u8 (*bmdma_status) (struct ata_port *ap); - </programlisting> - - <para> -When setting up an IDE BMDMA transaction, these hooks arm -(->bmdma_setup), fire (->bmdma_start), and halt (->bmdma_stop) -the hardware's DMA engine. ->bmdma_status is used to read the standard -PCI IDE DMA Status register. - </para> - - <para> -These hooks are typically either no-ops, or simply not implemented, in -FIS-based drivers. - </para> - <para> -Most legacy IDE drivers use ata_bmdma_setup() for the bmdma_setup() -hook. ata_bmdma_setup() will write the pointer to the PRD table to -the IDE PRD Table Address register, enable DMA in the DMA Command -register, and call exec_command() to begin the transfer. - </para> - <para> -Most legacy IDE drivers use ata_bmdma_start() for the bmdma_start() -hook. ata_bmdma_start() will write the ATA_DMA_START flag to the DMA -Command register. - </para> - <para> -Many legacy IDE drivers use ata_bmdma_stop() for the bmdma_stop() -hook. ata_bmdma_stop() clears the ATA_DMA_START flag in the DMA -command register. - </para> - <para> -Many legacy IDE drivers use ata_bmdma_status() as the bmdma_status() hook. - </para> - - </sect2> - - <sect2><title>High-level taskfile hooks</title> - <programlisting> -void (*qc_prep) (struct ata_queued_cmd *qc); -int (*qc_issue) (struct ata_queued_cmd *qc); - </programlisting> - - <para> - Higher-level hooks, these two hooks can potentially supercede - several of the above taskfile/DMA engine hooks. ->qc_prep is - called after the buffers have been DMA-mapped, and is typically - used to populate the hardware's DMA scatter-gather table. - Most drivers use the standard ata_qc_prep() helper function, but - more advanced drivers roll their own. - </para> - <para> - ->qc_issue is used to make a command active, once the hardware - and S/G tables have been prepared. IDE BMDMA drivers use the - helper function ata_qc_issue_prot() for taskfile protocol-based - dispatch. More advanced drivers implement their own ->qc_issue. - </para> - <para> - ata_qc_issue_prot() calls ->tf_load(), ->bmdma_setup(), and - ->bmdma_start() as necessary to initiate a transfer. - </para> - - </sect2> - - <sect2><title>Exception and probe handling (EH)</title> - <programlisting> -void (*eng_timeout) (struct ata_port *ap); -void (*phy_reset) (struct ata_port *ap); - </programlisting> - - <para> -Deprecated. Use ->error_handler() instead. - </para> - - <programlisting> -void (*freeze) (struct ata_port *ap); -void (*thaw) (struct ata_port *ap); - </programlisting> - - <para> -ata_port_freeze() is called when HSM violations or some other -condition disrupts normal operation of the port. A frozen port -is not allowed to perform any operation until the port is -thawed, which usually follows a successful reset. - </para> - - <para> -The optional ->freeze() callback can be used for freezing the port -hardware-wise (e.g. mask interrupt and stop DMA engine). If a -port cannot be frozen hardware-wise, the interrupt handler -must ack and clear interrupts unconditionally while the port -is frozen. - </para> - <para> -The optional ->thaw() callback is called to perform the opposite of ->freeze(): -prepare the port for normal operation once again. Unmask interrupts, -start DMA engine, etc. - </para> - - <programlisting> -void (*error_handler) (struct ata_port *ap); - </programlisting> - - <para> -->error_handler() is a driver's hook into probe, hotplug, and recovery -and other exceptional conditions. The primary responsibility of an -implementation is to call ata_do_eh() or ata_bmdma_drive_eh() with a set -of EH hooks as arguments: - </para> - - <para> -'prereset' hook (may be NULL) is called during an EH reset, before any other actions -are taken. - </para> - - <para> -'postreset' hook (may be NULL) is called after the EH reset is performed. Based on -existing conditions, severity of the problem, and hardware capabilities, - </para> - - <para> -Either 'softreset' (may be NULL) or 'hardreset' (may be NULL) will be -called to perform the low-level EH reset. - </para> - - <programlisting> -void (*post_internal_cmd) (struct ata_queued_cmd *qc); - </programlisting> - - <para> -Perform any hardware-specific actions necessary to finish processing -after executing a probe-time or EH-time command via ata_exec_internal(). - </para> - - </sect2> - - <sect2><title>Hardware interrupt handling</title> - <programlisting> -irqreturn_t (*irq_handler)(int, void *, struct pt_regs *); -void (*irq_clear) (struct ata_port *); - </programlisting> - - <para> - ->irq_handler is the interrupt handling routine registered with - the system, by libata. ->irq_clear is called during probe just - before the interrupt handler is registered, to be sure hardware - is quiet. - </para> - <para> - The second argument, dev_instance, should be cast to a pointer - to struct ata_host_set. - </para> - <para> - Most legacy IDE drivers use ata_sff_interrupt() for the - irq_handler hook, which scans all ports in the host_set, - determines which queued command was active (if any), and calls - ata_sff_host_intr(ap,qc). - </para> - <para> - Most legacy IDE drivers use ata_sff_irq_clear() for the - irq_clear() hook, which simply clears the interrupt and error - flags in the DMA status register. - </para> - - </sect2> - - <sect2><title>SATA phy read/write</title> - <programlisting> -int (*scr_read) (struct ata_port *ap, unsigned int sc_reg, - u32 *val); -int (*scr_write) (struct ata_port *ap, unsigned int sc_reg, - u32 val); - </programlisting> - - <para> - Read and write standard SATA phy registers. Currently only used - if ->phy_reset hook called the sata_phy_reset() helper function. - sc_reg is one of SCR_STATUS, SCR_CONTROL, SCR_ERROR, or SCR_ACTIVE. - </para> - - </sect2> - - <sect2><title>Init and shutdown</title> - <programlisting> -int (*port_start) (struct ata_port *ap); -void (*port_stop) (struct ata_port *ap); -void (*host_stop) (struct ata_host_set *host_set); - </programlisting> - - <para> - ->port_start() is called just after the data structures for each - port are initialized. Typically this is used to alloc per-port - DMA buffers / tables / rings, enable DMA engines, and similar - tasks. Some drivers also use this entry point as a chance to - allocate driver-private memory for ap->private_data. - </para> - <para> - Many drivers use ata_port_start() as this hook or call - it from their own port_start() hooks. ata_port_start() - allocates space for a legacy IDE PRD table and returns. - </para> - <para> - ->port_stop() is called after ->host_stop(). Its sole function - is to release DMA/memory resources, now that they are no longer - actively being used. Many drivers also free driver-private - data from port at this time. - </para> - <para> - ->host_stop() is called after all ->port_stop() calls -have completed. The hook must finalize hardware shutdown, release DMA -and other resources, etc. - This hook may be specified as NULL, in which case it is not called. - </para> - - </sect2> - - </sect1> - </chapter> - - <chapter id="libataEH"> - <title>Error handling</title> - - <para> - This chapter describes how errors are handled under libata. - Readers are advised to read SCSI EH - (Documentation/scsi/scsi_eh.txt) and ATA exceptions doc first. - </para> - - <sect1><title>Origins of commands</title> - <para> - In libata, a command is represented with struct ata_queued_cmd - or qc. qc's are preallocated during port initialization and - repetitively used for command executions. Currently only one - qc is allocated per port but yet-to-be-merged NCQ branch - allocates one for each tag and maps each qc to NCQ tag 1-to-1. - </para> - <para> - libata commands can originate from two sources - libata itself - and SCSI midlayer. libata internal commands are used for - initialization and error handling. All normal blk requests - and commands for SCSI emulation are passed as SCSI commands - through queuecommand callback of SCSI host template. - </para> - </sect1> - - <sect1><title>How commands are issued</title> - - <variablelist> - - <varlistentry><term>Internal commands</term> - <listitem> - <para> - First, qc is allocated and initialized using - ata_qc_new_init(). Although ata_qc_new_init() doesn't - implement any wait or retry mechanism when qc is not - available, internal commands are currently issued only during - initialization and error recovery, so no other command is - active and allocation is guaranteed to succeed. - </para> - <para> - Once allocated qc's taskfile is initialized for the command to - be executed. qc currently has two mechanisms to notify - completion. One is via qc->complete_fn() callback and the - other is completion qc->waiting. qc->complete_fn() callback - is the asynchronous path used by normal SCSI translated - commands and qc->waiting is the synchronous (issuer sleeps in - process context) path used by internal commands. - </para> - <para> - Once initialization is complete, host_set lock is acquired - and the qc is issued. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>SCSI commands</term> - <listitem> - <para> - All libata drivers use ata_scsi_queuecmd() as - hostt->queuecommand callback. scmds can either be simulated - or translated. No qc is involved in processing a simulated - scmd. The result is computed right away and the scmd is - completed. - </para> - <para> - For a translated scmd, ata_qc_new_init() is invoked to - allocate a qc and the scmd is translated into the qc. SCSI - midlayer's completion notification function pointer is stored - into qc->scsidone. - </para> - <para> - qc->complete_fn() callback is used for completion - notification. ATA commands use ata_scsi_qc_complete() while - ATAPI commands use atapi_qc_complete(). Both functions end up - calling qc->scsidone to notify upper layer when the qc is - finished. After translation is completed, the qc is issued - with ata_qc_issue(). - </para> - <para> - Note that SCSI midlayer invokes hostt->queuecommand while - holding host_set lock, so all above occur while holding - host_set lock. - </para> - </listitem> - </varlistentry> - - </variablelist> - </sect1> - - <sect1><title>How commands are processed</title> - <para> - Depending on which protocol and which controller are used, - commands are processed differently. For the purpose of - discussion, a controller which uses taskfile interface and all - standard callbacks is assumed. - </para> - <para> - Currently 6 ATA command protocols are used. They can be - sorted into the following four categories according to how - they are processed. - </para> - - <variablelist> - <varlistentry><term>ATA NO DATA or DMA</term> - <listitem> - <para> - ATA_PROT_NODATA and ATA_PROT_DMA fall into this category. - These types of commands don't require any software - intervention once issued. Device will raise interrupt on - completion. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>ATA PIO</term> - <listitem> - <para> - ATA_PROT_PIO is in this category. libata currently - implements PIO with polling. ATA_NIEN bit is set to turn - off interrupt and pio_task on ata_wq performs polling and - IO. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>ATAPI NODATA or DMA</term> - <listitem> - <para> - ATA_PROT_ATAPI_NODATA and ATA_PROT_ATAPI_DMA are in this - category. packet_task is used to poll BSY bit after - issuing PACKET command. Once BSY is turned off by the - device, packet_task transfers CDB and hands off processing - to interrupt handler. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>ATAPI PIO</term> - <listitem> - <para> - ATA_PROT_ATAPI is in this category. ATA_NIEN bit is set - and, as in ATAPI NODATA or DMA, packet_task submits cdb. - However, after submitting cdb, further processing (data - transfer) is handed off to pio_task. - </para> - </listitem> - </varlistentry> - </variablelist> - </sect1> - - <sect1><title>How commands are completed</title> - <para> - Once issued, all qc's are either completed with - ata_qc_complete() or time out. For commands which are handled - by interrupts, ata_host_intr() invokes ata_qc_complete(), and, - for PIO tasks, pio_task invokes ata_qc_complete(). In error - cases, packet_task may also complete commands. - </para> - <para> - ata_qc_complete() does the following. - </para> - - <orderedlist> - - <listitem> - <para> - DMA memory is unmapped. - </para> - </listitem> - - <listitem> - <para> - ATA_QCFLAG_ACTIVE is cleared from qc->flags. - </para> - </listitem> - - <listitem> - <para> - qc->complete_fn() callback is invoked. If the return value of - the callback is not zero. Completion is short circuited and - ata_qc_complete() returns. - </para> - </listitem> - - <listitem> - <para> - __ata_qc_complete() is called, which does - <orderedlist> - - <listitem> - <para> - qc->flags is cleared to zero. - </para> - </listitem> - - <listitem> - <para> - ap->active_tag and qc->tag are poisoned. - </para> - </listitem> - - <listitem> - <para> - qc->waiting is cleared & completed (in that order). - </para> - </listitem> - - <listitem> - <para> - qc is deallocated by clearing appropriate bit in ap->qactive. - </para> - </listitem> - - </orderedlist> - </para> - </listitem> - - </orderedlist> - - <para> - So, it basically notifies upper layer and deallocates qc. One - exception is short-circuit path in #3 which is used by - atapi_qc_complete(). - </para> - <para> - For all non-ATAPI commands, whether it fails or not, almost - the same code path is taken and very little error handling - takes place. A qc is completed with success status if it - succeeded, with failed status otherwise. - </para> - <para> - However, failed ATAPI commands require more handling as - REQUEST SENSE is needed to acquire sense data. If an ATAPI - command fails, ata_qc_complete() is invoked with error status, - which in turn invokes atapi_qc_complete() via - qc->complete_fn() callback. - </para> - <para> - This makes atapi_qc_complete() set scmd->result to - SAM_STAT_CHECK_CONDITION, complete the scmd and return 1. As - the sense data is empty but scmd->result is CHECK CONDITION, - SCSI midlayer will invoke EH for the scmd, and returning 1 - makes ata_qc_complete() to return without deallocating the qc. - This leads us to ata_scsi_error() with partially completed qc. - </para> - - </sect1> - - <sect1><title>ata_scsi_error()</title> - <para> - ata_scsi_error() is the current transportt->eh_strategy_handler() - for libata. As discussed above, this will be entered in two - cases - timeout and ATAPI error completion. This function - calls low level libata driver's eng_timeout() callback, the - standard callback for which is ata_eng_timeout(). It checks - if a qc is active and calls ata_qc_timeout() on the qc if so. - Actual error handling occurs in ata_qc_timeout(). - </para> - <para> - If EH is invoked for timeout, ata_qc_timeout() stops BMDMA and - completes the qc. Note that as we're currently in EH, we - cannot call scsi_done. As described in SCSI EH doc, a - recovered scmd should be either retried with - scsi_queue_insert() or finished with scsi_finish_command(). - Here, we override qc->scsidone with scsi_finish_command() and - calls ata_qc_complete(). - </para> - <para> - If EH is invoked due to a failed ATAPI qc, the qc here is - completed but not deallocated. The purpose of this - half-completion is to use the qc as place holder to make EH - code reach this place. This is a bit hackish, but it works. - </para> - <para> - Once control reaches here, the qc is deallocated by invoking - __ata_qc_complete() explicitly. Then, internal qc for REQUEST - SENSE is issued. Once sense data is acquired, scmd is - finished by directly invoking scsi_finish_command() on the - scmd. Note that as we already have completed and deallocated - the qc which was associated with the scmd, we don't need - to/cannot call ata_qc_complete() again. - </para> - - </sect1> - - <sect1><title>Problems with the current EH</title> - - <itemizedlist> - - <listitem> - <para> - Error representation is too crude. Currently any and all - error conditions are represented with ATA STATUS and ERROR - registers. Errors which aren't ATA device errors are treated - as ATA device errors by setting ATA_ERR bit. Better error - descriptor which can properly represent ATA and other - errors/exceptions is needed. - </para> - </listitem> - - <listitem> - <para> - When handling timeouts, no action is taken to make device - forget about the timed out command and ready for new commands. - </para> - </listitem> - - <listitem> - <para> - EH handling via ata_scsi_error() is not properly protected - from usual command processing. On EH entrance, the device is - not in quiescent state. Timed out commands may succeed or - fail any time. pio_task and atapi_task may still be running. - </para> - </listitem> - - <listitem> - <para> - Too weak error recovery. Devices / controllers causing HSM - mismatch errors and other errors quite often require reset to - return to known state. Also, advanced error handling is - necessary to support features like NCQ and hotplug. - </para> - </listitem> - - <listitem> - <para> - ATA errors are directly handled in the interrupt handler and - PIO errors in pio_task. This is problematic for advanced - error handling for the following reasons. - </para> - <para> - First, advanced error handling often requires context and - internal qc execution. - </para> - <para> - Second, even a simple failure (say, CRC error) needs - information gathering and could trigger complex error handling - (say, resetting & reconfiguring). Having multiple code - paths to gather information, enter EH and trigger actions - makes life painful. - </para> - <para> - Third, scattered EH code makes implementing low level drivers - difficult. Low level drivers override libata callbacks. If - EH is scattered over several places, each affected callbacks - should perform its part of error handling. This can be error - prone and painful. - </para> - </listitem> - - </itemizedlist> - </sect1> - </chapter> - - <chapter id="libataExt"> - <title>libata Library</title> -!Edrivers/ata/libata-core.c - </chapter> - - <chapter id="libataInt"> - <title>libata Core Internals</title> -!Idrivers/ata/libata-core.c - </chapter> - - <chapter id="libataScsiInt"> - <title>libata SCSI translation/emulation</title> -!Edrivers/ata/libata-scsi.c -!Idrivers/ata/libata-scsi.c - </chapter> - - <chapter id="ataExceptions"> - <title>ATA errors and exceptions</title> - - <para> - This chapter tries to identify what error/exception conditions exist - for ATA/ATAPI devices and describe how they should be handled in - implementation-neutral way. - </para> - - <para> - The term 'error' is used to describe conditions where either an - explicit error condition is reported from device or a command has - timed out. - </para> - - <para> - The term 'exception' is either used to describe exceptional - conditions which are not errors (say, power or hotplug events), or - to describe both errors and non-error exceptional conditions. Where - explicit distinction between error and exception is necessary, the - term 'non-error exception' is used. - </para> - - <sect1 id="excat"> - <title>Exception categories</title> - <para> - Exceptions are described primarily with respect to legacy - taskfile + bus master IDE interface. If a controller provides - other better mechanism for error reporting, mapping those into - categories described below shouldn't be difficult. - </para> - - <para> - In the following sections, two recovery actions - reset and - reconfiguring transport - are mentioned. These are described - further in <xref linkend="exrec"/>. - </para> - - <sect2 id="excatHSMviolation"> - <title>HSM violation</title> - <para> - This error is indicated when STATUS value doesn't match HSM - requirement during issuing or execution any ATA/ATAPI command. - </para> - - <itemizedlist> - <title>Examples</title> - - <listitem> - <para> - ATA_STATUS doesn't contain !BSY && DRDY && !DRQ while trying - to issue a command. - </para> - </listitem> - - <listitem> - <para> - !BSY && !DRQ during PIO data transfer. - </para> - </listitem> - - <listitem> - <para> - DRQ on command completion. - </para> - </listitem> - - <listitem> - <para> - !BSY && ERR after CDB transfer starts but before the - last byte of CDB is transferred. ATA/ATAPI standard states - that "The device shall not terminate the PACKET command - with an error before the last byte of the command packet has - been written" in the error outputs description of PACKET - command and the state diagram doesn't include such - transitions. - </para> - </listitem> - - </itemizedlist> - - <para> - In these cases, HSM is violated and not much information - regarding the error can be acquired from STATUS or ERROR - register. IOW, this error can be anything - driver bug, - faulty device, controller and/or cable. - </para> - - <para> - As HSM is violated, reset is necessary to restore known state. - Reconfiguring transport for lower speed might be helpful too - as transmission errors sometimes cause this kind of errors. - </para> - </sect2> - - <sect2 id="excatDevErr"> - <title>ATA/ATAPI device error (non-NCQ / non-CHECK CONDITION)</title> - - <para> - These are errors detected and reported by ATA/ATAPI devices - indicating device problems. For this type of errors, STATUS - and ERROR register values are valid and describe error - condition. Note that some of ATA bus errors are detected by - ATA/ATAPI devices and reported using the same mechanism as - device errors. Those cases are described later in this - section. - </para> - - <para> - For ATA commands, this type of errors are indicated by !BSY - && ERR during command execution and on completion. - </para> - - <para>For ATAPI commands,</para> - - <itemizedlist> - - <listitem> - <para> - !BSY && ERR && ABRT right after issuing PACKET - indicates that PACKET command is not supported and falls in - this category. - </para> - </listitem> - - <listitem> - <para> - !BSY && ERR(==CHK) && !ABRT after the last - byte of CDB is transferred indicates CHECK CONDITION and - doesn't fall in this category. - </para> - </listitem> - - <listitem> - <para> - !BSY && ERR(==CHK) && ABRT after the last byte - of CDB is transferred *probably* indicates CHECK CONDITION and - doesn't fall in this category. - </para> - </listitem> - - </itemizedlist> - - <para> - Of errors detected as above, the following are not ATA/ATAPI - device errors but ATA bus errors and should be handled - according to <xref linkend="excatATAbusErr"/>. - </para> - - <variablelist> - - <varlistentry> - <term>CRC error during data transfer</term> - <listitem> - <para> - This is indicated by ICRC bit in the ERROR register and - means that corruption occurred during data transfer. Up to - ATA/ATAPI-7, the standard specifies that this bit is only - applicable to UDMA transfers but ATA/ATAPI-8 draft revision - 1f says that the bit may be applicable to multiword DMA and - PIO. - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term>ABRT error during data transfer or on completion</term> - <listitem> - <para> - Up to ATA/ATAPI-7, the standard specifies that ABRT could be - set on ICRC errors and on cases where a device is not able - to complete a command. Combined with the fact that MWDMA - and PIO transfer errors aren't allowed to use ICRC bit up to - ATA/ATAPI-7, it seems to imply that ABRT bit alone could - indicate transfer errors. - </para> - <para> - However, ATA/ATAPI-8 draft revision 1f removes the part - that ICRC errors can turn on ABRT. So, this is kind of - gray area. Some heuristics are needed here. - </para> - </listitem> - </varlistentry> - - </variablelist> - - <para> - ATA/ATAPI device errors can be further categorized as follows. - </para> - - <variablelist> - - <varlistentry> - <term>Media errors</term> - <listitem> - <para> - This is indicated by UNC bit in the ERROR register. ATA - devices reports UNC error only after certain number of - retries cannot recover the data, so there's nothing much - else to do other than notifying upper layer. - </para> - <para> - READ and WRITE commands report CHS or LBA of the first - failed sector but ATA/ATAPI standard specifies that the - amount of transferred data on error completion is - indeterminate, so we cannot assume that sectors preceding - the failed sector have been transferred and thus cannot - complete those sectors successfully as SCSI does. - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term>Media changed / media change requested error</term> - <listitem> - <para> - <<TODO: fill here>> - </para> - </listitem> - </varlistentry> - - <varlistentry><term>Address error</term> - <listitem> - <para> - This is indicated by IDNF bit in the ERROR register. - Report to upper layer. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>Other errors</term> - <listitem> - <para> - This can be invalid command or parameter indicated by ABRT - ERROR bit or some other error condition. Note that ABRT - bit can indicate a lot of things including ICRC and Address - errors. Heuristics needed. - </para> - </listitem> - </varlistentry> - - </variablelist> - - <para> - Depending on commands, not all STATUS/ERROR bits are - applicable. These non-applicable bits are marked with - "na" in the output descriptions but up to ATA/ATAPI-7 - no definition of "na" can be found. However, - ATA/ATAPI-8 draft revision 1f describes "N/A" as - follows. - </para> - - <blockquote> - <variablelist> - <varlistentry><term>3.2.3.3a N/A</term> - <listitem> - <para> - A keyword the indicates a field has no defined value in - this standard and should not be checked by the host or - device. N/A fields should be cleared to zero. - </para> - </listitem> - </varlistentry> - </variablelist> - </blockquote> - - <para> - So, it seems reasonable to assume that "na" bits are - cleared to zero by devices and thus need no explicit masking. - </para> - - </sect2> - - <sect2 id="excatATAPIcc"> - <title>ATAPI device CHECK CONDITION</title> - - <para> - ATAPI device CHECK CONDITION error is indicated by set CHK bit - (ERR bit) in the STATUS register after the last byte of CDB is - transferred for a PACKET command. For this kind of errors, - sense data should be acquired to gather information regarding - the errors. REQUEST SENSE packet command should be used to - acquire sense data. - </para> - - <para> - Once sense data is acquired, this type of errors can be - handled similarly to other SCSI errors. Note that sense data - may indicate ATA bus error (e.g. Sense Key 04h HARDWARE ERROR - && ASC/ASCQ 47h/00h SCSI PARITY ERROR). In such - cases, the error should be considered as an ATA bus error and - handled according to <xref linkend="excatATAbusErr"/>. - </para> - - </sect2> - - <sect2 id="excatNCQerr"> - <title>ATA device error (NCQ)</title> - - <para> - NCQ command error is indicated by cleared BSY and set ERR bit - during NCQ command phase (one or more NCQ commands - outstanding). Although STATUS and ERROR registers will - contain valid values describing the error, READ LOG EXT is - required to clear the error condition, determine which command - has failed and acquire more information. - </para> - - <para> - READ LOG EXT Log Page 10h reports which tag has failed and - taskfile register values describing the error. With this - information the failed command can be handled as a normal ATA - command error as in <xref linkend="excatDevErr"/> and all - other in-flight commands must be retried. Note that this - retry should not be counted - it's likely that commands - retried this way would have completed normally if it were not - for the failed command. - </para> - - <para> - Note that ATA bus errors can be reported as ATA device NCQ - errors. This should be handled as described in <xref - linkend="excatATAbusErr"/>. - </para> - - <para> - If READ LOG EXT Log Page 10h fails or reports NQ, we're - thoroughly screwed. This condition should be treated - according to <xref linkend="excatHSMviolation"/>. - </para> - - </sect2> - - <sect2 id="excatATAbusErr"> - <title>ATA bus error</title> - - <para> - ATA bus error means that data corruption occurred during - transmission over ATA bus (SATA or PATA). This type of errors - can be indicated by - </para> - - <itemizedlist> - - <listitem> - <para> - ICRC or ABRT error as described in <xref linkend="excatDevErr"/>. - </para> - </listitem> - - <listitem> - <para> - Controller-specific error completion with error information - indicating transmission error. - </para> - </listitem> - - <listitem> - <para> - On some controllers, command timeout. In this case, there may - be a mechanism to determine that the timeout is due to - transmission error. - </para> - </listitem> - - <listitem> - <para> - Unknown/random errors, timeouts and all sorts of weirdities. - </para> - </listitem> - - </itemizedlist> - - <para> - As described above, transmission errors can cause wide variety - of symptoms ranging from device ICRC error to random device - lockup, and, for many cases, there is no way to tell if an - error condition is due to transmission error or not; - therefore, it's necessary to employ some kind of heuristic - when dealing with errors and timeouts. For example, - encountering repetitive ABRT errors for known supported - command is likely to indicate ATA bus error. - </para> - - <para> - Once it's determined that ATA bus errors have possibly - occurred, lowering ATA bus transmission speed is one of - actions which may alleviate the problem. See <xref - linkend="exrecReconf"/> for more information. - </para> - - </sect2> - - <sect2 id="excatPCIbusErr"> - <title>PCI bus error</title> - - <para> - Data corruption or other failures during transmission over PCI - (or other system bus). For standard BMDMA, this is indicated - by Error bit in the BMDMA Status register. This type of - errors must be logged as it indicates something is very wrong - with the system. Resetting host controller is recommended. - </para> - - </sect2> - - <sect2 id="excatLateCompletion"> - <title>Late completion</title> - - <para> - This occurs when timeout occurs and the timeout handler finds - out that the timed out command has completed successfully or - with error. This is usually caused by lost interrupts. This - type of errors must be logged. Resetting host controller is - recommended. - </para> - - </sect2> - - <sect2 id="excatUnknown"> - <title>Unknown error (timeout)</title> - - <para> - This is when timeout occurs and the command is still - processing or the host and device are in unknown state. When - this occurs, HSM could be in any valid or invalid state. To - bring the device to known state and make it forget about the - timed out command, resetting is necessary. The timed out - command may be retried. - </para> - - <para> - Timeouts can also be caused by transmission errors. Refer to - <xref linkend="excatATAbusErr"/> for more details. - </para> - - </sect2> - - <sect2 id="excatHoplugPM"> - <title>Hotplug and power management exceptions</title> - - <para> - <<TODO: fill here>> - </para> - - </sect2> - - </sect1> - - <sect1 id="exrec"> - <title>EH recovery actions</title> - - <para> - This section discusses several important recovery actions. - </para> - - <sect2 id="exrecClr"> - <title>Clearing error condition</title> - - <para> - Many controllers require its error registers to be cleared by - error handler. Different controllers may have different - requirements. - </para> - - <para> - For SATA, it's strongly recommended to clear at least SError - register during error handling. - </para> - </sect2> - - <sect2 id="exrecRst"> - <title>Reset</title> - - <para> - During EH, resetting is necessary in the following cases. - </para> - - <itemizedlist> - - <listitem> - <para> - HSM is in unknown or invalid state - </para> - </listitem> - - <listitem> - <para> - HBA is in unknown or invalid state - </para> - </listitem> - - <listitem> - <para> - EH needs to make HBA/device forget about in-flight commands - </para> - </listitem> - - <listitem> - <para> - HBA/device behaves weirdly - </para> - </listitem> - - </itemizedlist> - - <para> - Resetting during EH might be a good idea regardless of error - condition to improve EH robustness. Whether to reset both or - either one of HBA and device depends on situation but the - following scheme is recommended. - </para> - - <itemizedlist> - - <listitem> - <para> - When it's known that HBA is in ready state but ATA/ATAPI - device is in unknown state, reset only device. - </para> - </listitem> - - <listitem> - <para> - If HBA is in unknown state, reset both HBA and device. - </para> - </listitem> - - </itemizedlist> - - <para> - HBA resetting is implementation specific. For a controller - complying to taskfile/BMDMA PCI IDE, stopping active DMA - transaction may be sufficient iff BMDMA state is the only HBA - context. But even mostly taskfile/BMDMA PCI IDE complying - controllers may have implementation specific requirements and - mechanism to reset themselves. This must be addressed by - specific drivers. - </para> - - <para> - OTOH, ATA/ATAPI standard describes in detail ways to reset - ATA/ATAPI devices. - </para> - - <variablelist> - - <varlistentry><term>PATA hardware reset</term> - <listitem> - <para> - This is hardware initiated device reset signalled with - asserted PATA RESET- signal. There is no standard way to - initiate hardware reset from software although some - hardware provides registers that allow driver to directly - tweak the RESET- signal. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>Software reset</term> - <listitem> - <para> - This is achieved by turning CONTROL SRST bit on for at - least 5us. Both PATA and SATA support it but, in case of - SATA, this may require controller-specific support as the - second Register FIS to clear SRST should be transmitted - while BSY bit is still set. Note that on PATA, this resets - both master and slave devices on a channel. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>EXECUTE DEVICE DIAGNOSTIC command</term> - <listitem> - <para> - Although ATA/ATAPI standard doesn't describe exactly, EDD - implies some level of resetting, possibly similar level - with software reset. Host-side EDD protocol can be handled - with normal command processing and most SATA controllers - should be able to handle EDD's just like other commands. - As in software reset, EDD affects both devices on a PATA - bus. - </para> - <para> - Although EDD does reset devices, this doesn't suit error - handling as EDD cannot be issued while BSY is set and it's - unclear how it will act when device is in unknown/weird - state. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>ATAPI DEVICE RESET command</term> - <listitem> - <para> - This is very similar to software reset except that reset - can be restricted to the selected device without affecting - the other device sharing the cable. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>SATA phy reset</term> - <listitem> - <para> - This is the preferred way of resetting a SATA device. In - effect, it's identical to PATA hardware reset. Note that - this can be done with the standard SCR Control register. - As such, it's usually easier to implement than software - reset. - </para> - </listitem> - </varlistentry> - - </variablelist> - - <para> - One more thing to consider when resetting devices is that - resetting clears certain configuration parameters and they - need to be set to their previous or newly adjusted values - after reset. - </para> - - <para> - Parameters affected are. - </para> - - <itemizedlist> - - <listitem> - <para> - CHS set up with INITIALIZE DEVICE PARAMETERS (seldom used) - </para> - </listitem> - - <listitem> - <para> - Parameters set with SET FEATURES including transfer mode setting - </para> - </listitem> - - <listitem> - <para> - Block count set with SET MULTIPLE MODE - </para> - </listitem> - - <listitem> - <para> - Other parameters (SET MAX, MEDIA LOCK...) - </para> - </listitem> - - </itemizedlist> - - <para> - ATA/ATAPI standard specifies that some parameters must be - maintained across hardware or software reset, but doesn't - strictly specify all of them. Always reconfiguring needed - parameters after reset is required for robustness. Note that - this also applies when resuming from deep sleep (power-off). - </para> - - <para> - Also, ATA/ATAPI standard requires that IDENTIFY DEVICE / - IDENTIFY PACKET DEVICE is issued after any configuration - parameter is updated or a hardware reset and the result used - for further operation. OS driver is required to implement - revalidation mechanism to support this. - </para> - - </sect2> - - <sect2 id="exrecReconf"> - <title>Reconfigure transport</title> - - <para> - For both PATA and SATA, a lot of corners are cut for cheap - connectors, cables or controllers and it's quite common to see - high transmission error rate. This can be mitigated by - lowering transmission speed. - </para> - - <para> - The following is a possible scheme Jeff Garzik suggested. - </para> - - <blockquote> - <para> - If more than $N (3?) transmission errors happen in 15 minutes, - </para> - <itemizedlist> - <listitem> - <para> - if SATA, decrease SATA PHY speed. if speed cannot be decreased, - </para> - </listitem> - <listitem> - <para> - decrease UDMA xfer speed. if at UDMA0, switch to PIO4, - </para> - </listitem> - <listitem> - <para> - decrease PIO xfer speed. if at PIO3, complain, but continue - </para> - </listitem> - </itemizedlist> - </blockquote> - - </sect2> - - </sect1> - - </chapter> - - <chapter id="PiixInt"> - <title>ata_piix Internals</title> -!Idrivers/ata/ata_piix.c - </chapter> - - <chapter id="SILInt"> - <title>sata_sil Internals</title> -!Idrivers/ata/sata_sil.c - </chapter> - - <chapter id="libataThanks"> - <title>Thanks</title> - <para> - The bulk of the ATA knowledge comes thanks to long conversations with - Andre Hedrick (www.linux-ide.org), and long hours pondering the ATA - and SCSI specifications. - </para> - <para> - Thanks to Alan Cox for pointing out similarities - between SATA and SCSI, and in general for motivation to hack on - libata. - </para> - <para> - libata's device detection - method, ata_pio_devchk, and in general all the early probing was - based on extensive study of Hale Landis's probe/reset code in his - ATADRVR driver (www.ata-atapi.com). - </para> - </chapter> - -</book> diff --git a/Documentation/DocBook/librs.tmpl b/Documentation/DocBook/librs.tmpl deleted file mode 100644 index 94f21361e0ed..000000000000 --- a/Documentation/DocBook/librs.tmpl +++ /dev/null @@ -1,289 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="Reed-Solomon-Library-Guide"> - <bookinfo> - <title>Reed-Solomon Library Programming Interface</title> - - <authorgroup> - <author> - <firstname>Thomas</firstname> - <surname>Gleixner</surname> - <affiliation> - <address> - <email>tglx@linutronix.de</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2004</year> - <holder>Thomas Gleixner</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2 as published by the Free Software Foundation. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="intro"> - <title>Introduction</title> - <para> - The generic Reed-Solomon Library provides encoding, decoding - and error correction functions. - </para> - <para> - Reed-Solomon codes are used in communication and storage - applications to ensure data integrity. - </para> - <para> - This documentation is provided for developers who want to utilize - the functions provided by the library. - </para> - </chapter> - - <chapter id="bugs"> - <title>Known Bugs And Assumptions</title> - <para> - None. - </para> - </chapter> - - <chapter id="usage"> - <title>Usage</title> - <para> - This chapter provides examples of how to use the library. - </para> - <sect1> - <title>Initializing</title> - <para> - The init function init_rs returns a pointer to an - rs decoder structure, which holds the necessary - information for encoding, decoding and error correction - with the given polynomial. It either uses an existing - matching decoder or creates a new one. On creation all - the lookup tables for fast en/decoding are created. - The function may take a while, so make sure not to - call it in critical code paths. - </para> - <programlisting> -/* the Reed Solomon control structure */ -static struct rs_control *rs_decoder; - -/* Symbolsize is 10 (bits) - * Primitive polynomial is x^10+x^3+1 - * first consecutive root is 0 - * primitive element to generate roots = 1 - * generator polynomial degree (number of roots) = 6 - */ -rs_decoder = init_rs (10, 0x409, 0, 1, 6); - </programlisting> - </sect1> - <sect1> - <title>Encoding</title> - <para> - The encoder calculates the Reed-Solomon code over - the given data length and stores the result in - the parity buffer. Note that the parity buffer must - be initialized before calling the encoder. - </para> - <para> - The expanded data can be inverted on the fly by - providing a non-zero inversion mask. The expanded data is - XOR'ed with the mask. This is used e.g. for FLASH - ECC, where the all 0xFF is inverted to an all 0x00. - The Reed-Solomon code for all 0x00 is all 0x00. The - code is inverted before storing to FLASH so it is 0xFF - too. This prevents that reading from an erased FLASH - results in ECC errors. - </para> - <para> - The databytes are expanded to the given symbol size - on the fly. There is no support for encoding continuous - bitstreams with a symbol size != 8 at the moment. If - it is necessary it should be not a big deal to implement - such functionality. - </para> - <programlisting> -/* Parity buffer. Size = number of roots */ -uint16_t par[6]; -/* Initialize the parity buffer */ -memset(par, 0, sizeof(par)); -/* Encode 512 byte in data8. Store parity in buffer par */ -encode_rs8 (rs_decoder, data8, 512, par, 0); - </programlisting> - </sect1> - <sect1> - <title>Decoding</title> - <para> - The decoder calculates the syndrome over - the given data length and the received parity symbols - and corrects errors in the data. - </para> - <para> - If a syndrome is available from a hardware decoder - then the syndrome calculation is skipped. - </para> - <para> - The correction of the data buffer can be suppressed - by providing a correction pattern buffer and an error - location buffer to the decoder. The decoder stores the - calculated error location and the correction bitmask - in the given buffers. This is useful for hardware - decoders which use a weird bit ordering scheme. - </para> - <para> - The databytes are expanded to the given symbol size - on the fly. There is no support for decoding continuous - bitstreams with a symbolsize != 8 at the moment. If - it is necessary it should be not a big deal to implement - such functionality. - </para> - - <sect2> - <title> - Decoding with syndrome calculation, direct data correction - </title> - <programlisting> -/* Parity buffer. Size = number of roots */ -uint16_t par[6]; -uint8_t data[512]; -int numerr; -/* Receive data */ -..... -/* Receive parity */ -..... -/* Decode 512 byte in data8.*/ -numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL); - </programlisting> - </sect2> - - <sect2> - <title> - Decoding with syndrome given by hardware decoder, direct data correction - </title> - <programlisting> -/* Parity buffer. Size = number of roots */ -uint16_t par[6], syn[6]; -uint8_t data[512]; -int numerr; -/* Receive data */ -..... -/* Receive parity */ -..... -/* Get syndrome from hardware decoder */ -..... -/* Decode 512 byte in data8.*/ -numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL); - </programlisting> - </sect2> - - <sect2> - <title> - Decoding with syndrome given by hardware decoder, no direct data correction. - </title> - <para> - Note: It's not necessary to give data and received parity to the decoder. - </para> - <programlisting> -/* Parity buffer. Size = number of roots */ -uint16_t par[6], syn[6], corr[8]; -uint8_t data[512]; -int numerr, errpos[8]; -/* Receive data */ -..... -/* Receive parity */ -..... -/* Get syndrome from hardware decoder */ -..... -/* Decode 512 byte in data8.*/ -numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr); -for (i = 0; i < numerr; i++) { - do_error_correction_in_your_buffer(errpos[i], corr[i]); -} - </programlisting> - </sect2> - </sect1> - <sect1> - <title>Cleanup</title> - <para> - The function free_rs frees the allocated resources, - if the caller is the last user of the decoder. - </para> - <programlisting> -/* Release resources */ -free_rs(rs_decoder); - </programlisting> - </sect1> - - </chapter> - - <chapter id="structs"> - <title>Structures</title> - <para> - This chapter contains the autogenerated documentation of the structures which are - used in the Reed-Solomon Library and are relevant for a developer. - </para> -!Iinclude/linux/rslib.h - </chapter> - - <chapter id="pubfunctions"> - <title>Public Functions Provided</title> - <para> - This chapter contains the autogenerated documentation of the Reed-Solomon functions - which are exported. - </para> -!Elib/reed_solomon/reed_solomon.c - </chapter> - - <chapter id="credits"> - <title>Credits</title> - <para> - The library code for encoding and decoding was written by Phil Karn. - </para> - <programlisting> - Copyright 2002, Phil Karn, KA9Q - May be used under the terms of the GNU General Public License (GPL) - </programlisting> - <para> - The wrapper functions and interfaces are written by Thomas Gleixner. - </para> - <para> - Many users have provided bugfixes, improvements and helping hands for testing. - Thanks a lot. - </para> - <para> - The following people have contributed to this document: - </para> - <para> - Thomas Gleixner<email>tglx@linutronix.de</email> - </para> - </chapter> -</book> diff --git a/Documentation/DocBook/lsm.tmpl b/Documentation/DocBook/lsm.tmpl deleted file mode 100644 index fe7664ce9667..000000000000 --- a/Documentation/DocBook/lsm.tmpl +++ /dev/null @@ -1,265 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<article class="whitepaper" id="LinuxSecurityModule" lang="en"> - <articleinfo> - <title>Linux Security Modules: General Security Hooks for Linux</title> - <authorgroup> - <author> - <firstname>Stephen</firstname> - <surname>Smalley</surname> - <affiliation> - <orgname>NAI Labs</orgname> - <address><email>ssmalley@nai.com</email></address> - </affiliation> - </author> - <author> - <firstname>Timothy</firstname> - <surname>Fraser</surname> - <affiliation> - <orgname>NAI Labs</orgname> - <address><email>tfraser@nai.com</email></address> - </affiliation> - </author> - <author> - <firstname>Chris</firstname> - <surname>Vance</surname> - <affiliation> - <orgname>NAI Labs</orgname> - <address><email>cvance@nai.com</email></address> - </affiliation> - </author> - </authorgroup> - </articleinfo> - -<sect1 id="Introduction"><title>Introduction</title> - -<para> -In March 2001, the National Security Agency (NSA) gave a presentation -about Security-Enhanced Linux (SELinux) at the 2.5 Linux Kernel -Summit. SELinux is an implementation of flexible and fine-grained -nondiscretionary access controls in the Linux kernel, originally -implemented as its own particular kernel patch. Several other -security projects (e.g. RSBAC, Medusa) have also developed flexible -access control architectures for the Linux kernel, and various -projects have developed particular access control models for Linux -(e.g. LIDS, DTE, SubDomain). Each project has developed and -maintained its own kernel patch to support its security needs. -</para> - -<para> -In response to the NSA presentation, Linus Torvalds made a set of -remarks that described a security framework he would be willing to -consider for inclusion in the mainstream Linux kernel. He described a -general framework that would provide a set of security hooks to -control operations on kernel objects and a set of opaque security -fields in kernel data structures for maintaining security attributes. -This framework could then be used by loadable kernel modules to -implement any desired model of security. Linus also suggested the -possibility of migrating the Linux capabilities code into such a -module. -</para> - -<para> -The Linux Security Modules (LSM) project was started by WireX to -develop such a framework. LSM is a joint development effort by -several security projects, including Immunix, SELinux, SGI and Janus, -and several individuals, including Greg Kroah-Hartman and James -Morris, to develop a Linux kernel patch that implements this -framework. The patch is currently tracking the 2.4 series and is -targeted for integration into the 2.5 development series. This -technical report provides an overview of the framework and the example -capabilities security module provided by the LSM kernel patch. -</para> - -</sect1> - -<sect1 id="framework"><title>LSM Framework</title> - -<para> -The LSM kernel patch provides a general kernel framework to support -security modules. In particular, the LSM framework is primarily -focused on supporting access control modules, although future -development is likely to address other security needs such as -auditing. By itself, the framework does not provide any additional -security; it merely provides the infrastructure to support security -modules. The LSM kernel patch also moves most of the capabilities -logic into an optional security module, with the system defaulting -to the traditional superuser logic. This capabilities module -is discussed further in <xref linkend="cap"/>. -</para> - -<para> -The LSM kernel patch adds security fields to kernel data structures -and inserts calls to hook functions at critical points in the kernel -code to manage the security fields and to perform access control. It -also adds functions for registering and unregistering security -modules, and adds a general <function>security</function> system call -to support new system calls for security-aware applications. -</para> - -<para> -The LSM security fields are simply <type>void*</type> pointers. For -process and program execution security information, security fields -were added to <structname>struct task_struct</structname> and -<structname>struct linux_binprm</structname>. For filesystem security -information, a security field was added to -<structname>struct super_block</structname>. For pipe, file, and socket -security information, security fields were added to -<structname>struct inode</structname> and -<structname>struct file</structname>. For packet and network device security -information, security fields were added to -<structname>struct sk_buff</structname> and -<structname>struct net_device</structname>. For System V IPC security -information, security fields were added to -<structname>struct kern_ipc_perm</structname> and -<structname>struct msg_msg</structname>; additionally, the definitions -for <structname>struct msg_msg</structname>, <structname>struct -msg_queue</structname>, and <structname>struct -shmid_kernel</structname> were moved to header files -(<filename>include/linux/msg.h</filename> and -<filename>include/linux/shm.h</filename> as appropriate) to allow -the security modules to use these definitions. -</para> - -<para> -Each LSM hook is a function pointer in a global table, -security_ops. This table is a -<structname>security_operations</structname> structure as defined by -<filename>include/linux/security.h</filename>. Detailed documentation -for each hook is included in this header file. At present, this -structure consists of a collection of substructures that group related -hooks based on the kernel object (e.g. task, inode, file, sk_buff, -etc) as well as some top-level hook function pointers for system -operations. This structure is likely to be flattened in the future -for performance. The placement of the hook calls in the kernel code -is described by the "called:" lines in the per-hook documentation in -the header file. The hook calls can also be easily found in the -kernel code by looking for the string "security_ops->". - -</para> - -<para> -Linus mentioned per-process security hooks in his original remarks as a -possible alternative to global security hooks. However, if LSM were -to start from the perspective of per-process hooks, then the base -framework would have to deal with how to handle operations that -involve multiple processes (e.g. kill), since each process might have -its own hook for controlling the operation. This would require a -general mechanism for composing hooks in the base framework. -Additionally, LSM would still need global hooks for operations that -have no process context (e.g. network input operations). -Consequently, LSM provides global security hooks, but a security -module is free to implement per-process hooks (where that makes sense) -by storing a security_ops table in each process' security field and -then invoking these per-process hooks from the global hooks. -The problem of composition is thus deferred to the module. -</para> - -<para> -The global security_ops table is initialized to a set of hook -functions provided by a dummy security module that provides -traditional superuser logic. A <function>register_security</function> -function (in <filename>security/security.c</filename>) is provided to -allow a security module to set security_ops to refer to its own hook -functions, and an <function>unregister_security</function> function is -provided to revert security_ops to the dummy module hooks. This -mechanism is used to set the primary security module, which is -responsible for making the final decision for each hook. -</para> - -<para> -LSM also provides a simple mechanism for stacking additional security -modules with the primary security module. It defines -<function>register_security</function> and -<function>unregister_security</function> hooks in the -<structname>security_operations</structname> structure and provides -<function>mod_reg_security</function> and -<function>mod_unreg_security</function> functions that invoke these -hooks after performing some sanity checking. A security module can -call these functions in order to stack with other modules. However, -the actual details of how this stacking is handled are deferred to the -module, which can implement these hooks in any way it wishes -(including always returning an error if it does not wish to support -stacking). In this manner, LSM again defers the problem of -composition to the module. -</para> - -<para> -Although the LSM hooks are organized into substructures based on -kernel object, all of the hooks can be viewed as falling into two -major categories: hooks that are used to manage the security fields -and hooks that are used to perform access control. Examples of the -first category of hooks include the -<function>alloc_security</function> and -<function>free_security</function> hooks defined for each kernel data -structure that has a security field. These hooks are used to allocate -and free security structures for kernel objects. The first category -of hooks also includes hooks that set information in the security -field after allocation, such as the <function>post_lookup</function> -hook in <structname>struct inode_security_ops</structname>. This hook -is used to set security information for inodes after successful lookup -operations. An example of the second category of hooks is the -<function>permission</function> hook in -<structname>struct inode_security_ops</structname>. This hook checks -permission when accessing an inode. -</para> - -</sect1> - -<sect1 id="cap"><title>LSM Capabilities Module</title> - -<para> -The LSM kernel patch moves most of the existing POSIX.1e capabilities -logic into an optional security module stored in the file -<filename>security/capability.c</filename>. This change allows -users who do not want to use capabilities to omit this code entirely -from their kernel, instead using the dummy module for traditional -superuser logic or any other module that they desire. This change -also allows the developers of the capabilities logic to maintain and -enhance their code more freely, without needing to integrate patches -back into the base kernel. -</para> - -<para> -In addition to moving the capabilities logic, the LSM kernel patch -could move the capability-related fields from the kernel data -structures into the new security fields managed by the security -modules. However, at present, the LSM kernel patch leaves the -capability fields in the kernel data structures. In his original -remarks, Linus suggested that this might be preferable so that other -security modules can be easily stacked with the capabilities module -without needing to chain multiple security structures on the security field. -It also avoids imposing extra overhead on the capabilities module -to manage the security fields. However, the LSM framework could -certainly support such a move if it is determined to be desirable, -with only a few additional changes described below. -</para> - -<para> -At present, the capabilities logic for computing process capabilities -on <function>execve</function> and <function>set*uid</function>, -checking capabilities for a particular process, saving and checking -capabilities for netlink messages, and handling the -<function>capget</function> and <function>capset</function> system -calls have been moved into the capabilities module. There are still a -few locations in the base kernel where capability-related fields are -directly examined or modified, but the current version of the LSM -patch does allow a security module to completely replace the -assignment and testing of capabilities. These few locations would -need to be changed if the capability-related fields were moved into -the security field. The following is a list of known locations that -still perform such direct examination or modification of -capability-related fields: -<itemizedlist> -<listitem><para><filename>fs/open.c</filename>:<function>sys_access</function></para></listitem> -<listitem><para><filename>fs/lockd/host.c</filename>:<function>nlm_bind_host</function></para></listitem> -<listitem><para><filename>fs/nfsd/auth.c</filename>:<function>nfsd_setuser</function></para></listitem> -<listitem><para><filename>fs/proc/array.c</filename>:<function>task_cap</function></para></listitem> -</itemizedlist> -</para> - -</sect1> - -</article> diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl deleted file mode 100644 index b442921bca54..000000000000 --- a/Documentation/DocBook/mtdnand.tmpl +++ /dev/null @@ -1,1291 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="MTD-NAND-Guide"> - <bookinfo> - <title>MTD NAND Driver Programming Interface</title> - - <authorgroup> - <author> - <firstname>Thomas</firstname> - <surname>Gleixner</surname> - <affiliation> - <address> - <email>tglx@linutronix.de</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2004</year> - <holder>Thomas Gleixner</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2 as published by the Free Software Foundation. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="intro"> - <title>Introduction</title> - <para> - The generic NAND driver supports almost all NAND and AG-AND based - chips and connects them to the Memory Technology Devices (MTD) - subsystem of the Linux Kernel. - </para> - <para> - This documentation is provided for developers who want to implement - board drivers or filesystem drivers suitable for NAND devices. - </para> - </chapter> - - <chapter id="bugs"> - <title>Known Bugs And Assumptions</title> - <para> - None. - </para> - </chapter> - - <chapter id="dochints"> - <title>Documentation hints</title> - <para> - The function and structure docs are autogenerated. Each function and - struct member has a short description which is marked with an [XXX] identifier. - The following chapters explain the meaning of those identifiers. - </para> - <sect1 id="Function_identifiers_XXX"> - <title>Function identifiers [XXX]</title> - <para> - The functions are marked with [XXX] identifiers in the short - comment. The identifiers explain the usage and scope of the - functions. Following identifiers are used: - </para> - <itemizedlist> - <listitem><para> - [MTD Interface]</para><para> - These functions provide the interface to the MTD kernel API. - They are not replaceable and provide functionality - which is complete hardware independent. - </para></listitem> - <listitem><para> - [NAND Interface]</para><para> - These functions are exported and provide the interface to the NAND kernel API. - </para></listitem> - <listitem><para> - [GENERIC]</para><para> - Generic functions are not replaceable and provide functionality - which is complete hardware independent. - </para></listitem> - <listitem><para> - [DEFAULT]</para><para> - Default functions provide hardware related functionality which is suitable - for most of the implementations. These functions can be replaced by the - board driver if necessary. Those functions are called via pointers in the - NAND chip description structure. The board driver can set the functions which - should be replaced by board dependent functions before calling nand_scan(). - If the function pointer is NULL on entry to nand_scan() then the pointer - is set to the default function which is suitable for the detected chip type. - </para></listitem> - </itemizedlist> - </sect1> - <sect1 id="Struct_member_identifiers_XXX"> - <title>Struct member identifiers [XXX]</title> - <para> - The struct members are marked with [XXX] identifiers in the - comment. The identifiers explain the usage and scope of the - members. Following identifiers are used: - </para> - <itemizedlist> - <listitem><para> - [INTERN]</para><para> - These members are for NAND driver internal use only and must not be - modified. Most of these values are calculated from the chip geometry - information which is evaluated during nand_scan(). - </para></listitem> - <listitem><para> - [REPLACEABLE]</para><para> - Replaceable members hold hardware related functions which can be - provided by the board driver. The board driver can set the functions which - should be replaced by board dependent functions before calling nand_scan(). - If the function pointer is NULL on entry to nand_scan() then the pointer - is set to the default function which is suitable for the detected chip type. - </para></listitem> - <listitem><para> - [BOARDSPECIFIC]</para><para> - Board specific members hold hardware related information which must - be provided by the board driver. The board driver must set the function - pointers and datafields before calling nand_scan(). - </para></listitem> - <listitem><para> - [OPTIONAL]</para><para> - Optional members can hold information relevant for the board driver. The - generic NAND driver code does not use this information. - </para></listitem> - </itemizedlist> - </sect1> - </chapter> - - <chapter id="basicboarddriver"> - <title>Basic board driver</title> - <para> - For most boards it will be sufficient to provide just the - basic functions and fill out some really board dependent - members in the nand chip description structure. - </para> - <sect1 id="Basic_defines"> - <title>Basic defines</title> - <para> - At least you have to provide a nand_chip structure - and a storage for the ioremap'ed chip address. - You can allocate the nand_chip structure using - kmalloc or you can allocate it statically. - The NAND chip structure embeds an mtd structure - which will be registered to the MTD subsystem. - You can extract a pointer to the mtd structure - from a nand_chip pointer using the nand_to_mtd() - helper. - </para> - <para> - Kmalloc based example - </para> - <programlisting> -static struct mtd_info *board_mtd; -static void __iomem *baseaddr; - </programlisting> - <para> - Static example - </para> - <programlisting> -static struct nand_chip board_chip; -static void __iomem *baseaddr; - </programlisting> - </sect1> - <sect1 id="Partition_defines"> - <title>Partition defines</title> - <para> - If you want to divide your device into partitions, then - define a partitioning scheme suitable to your board. - </para> - <programlisting> -#define NUM_PARTITIONS 2 -static struct mtd_partition partition_info[] = { - { .name = "Flash partition 1", - .offset = 0, - .size = 8 * 1024 * 1024 }, - { .name = "Flash partition 2", - .offset = MTDPART_OFS_NEXT, - .size = MTDPART_SIZ_FULL }, -}; - </programlisting> - </sect1> - <sect1 id="Hardware_control_functions"> - <title>Hardware control function</title> - <para> - The hardware control function provides access to the - control pins of the NAND chip(s). - The access can be done by GPIO pins or by address lines. - If you use address lines, make sure that the timing - requirements are met. - </para> - <para> - <emphasis>GPIO based example</emphasis> - </para> - <programlisting> -static void board_hwcontrol(struct mtd_info *mtd, int cmd) -{ - switch(cmd){ - case NAND_CTL_SETCLE: /* Set CLE pin high */ break; - case NAND_CTL_CLRCLE: /* Set CLE pin low */ break; - case NAND_CTL_SETALE: /* Set ALE pin high */ break; - case NAND_CTL_CLRALE: /* Set ALE pin low */ break; - case NAND_CTL_SETNCE: /* Set nCE pin low */ break; - case NAND_CTL_CLRNCE: /* Set nCE pin high */ break; - } -} - </programlisting> - <para> - <emphasis>Address lines based example.</emphasis> It's assumed that the - nCE pin is driven by a chip select decoder. - </para> - <programlisting> -static void board_hwcontrol(struct mtd_info *mtd, int cmd) -{ - struct nand_chip *this = mtd_to_nand(mtd); - switch(cmd){ - case NAND_CTL_SETCLE: this->IO_ADDR_W |= CLE_ADRR_BIT; break; - case NAND_CTL_CLRCLE: this->IO_ADDR_W &= ~CLE_ADRR_BIT; break; - case NAND_CTL_SETALE: this->IO_ADDR_W |= ALE_ADRR_BIT; break; - case NAND_CTL_CLRALE: this->IO_ADDR_W &= ~ALE_ADRR_BIT; break; - } -} - </programlisting> - </sect1> - <sect1 id="Device_ready_function"> - <title>Device ready function</title> - <para> - If the hardware interface has the ready busy pin of the NAND chip connected to a - GPIO or other accessible I/O pin, this function is used to read back the state of the - pin. The function has no arguments and should return 0, if the device is busy (R/B pin - is low) and 1, if the device is ready (R/B pin is high). - If the hardware interface does not give access to the ready busy pin, then - the function must not be defined and the function pointer this->dev_ready is set to NULL. - </para> - </sect1> - <sect1 id="Init_function"> - <title>Init function</title> - <para> - The init function allocates memory and sets up all the board - specific parameters and function pointers. When everything - is set up nand_scan() is called. This function tries to - detect and identify then chip. If a chip is found all the - internal data fields are initialized accordingly. - The structure(s) have to be zeroed out first and then filled with the necessary - information about the device. - </para> - <programlisting> -static int __init board_init (void) -{ - struct nand_chip *this; - int err = 0; - - /* Allocate memory for MTD device structure and private data */ - this = kzalloc(sizeof(struct nand_chip), GFP_KERNEL); - if (!this) { - printk ("Unable to allocate NAND MTD device structure.\n"); - err = -ENOMEM; - goto out; - } - - board_mtd = nand_to_mtd(this); - - /* map physical address */ - baseaddr = ioremap(CHIP_PHYSICAL_ADDRESS, 1024); - if (!baseaddr) { - printk("Ioremap to access NAND chip failed\n"); - err = -EIO; - goto out_mtd; - } - - /* Set address of NAND IO lines */ - this->IO_ADDR_R = baseaddr; - this->IO_ADDR_W = baseaddr; - /* Reference hardware control function */ - this->hwcontrol = board_hwcontrol; - /* Set command delay time, see datasheet for correct value */ - this->chip_delay = CHIP_DEPENDEND_COMMAND_DELAY; - /* Assign the device ready function, if available */ - this->dev_ready = board_dev_ready; - this->eccmode = NAND_ECC_SOFT; - - /* Scan to find existence of the device */ - if (nand_scan (board_mtd, 1)) { - err = -ENXIO; - goto out_ior; - } - - add_mtd_partitions(board_mtd, partition_info, NUM_PARTITIONS); - goto out; - -out_ior: - iounmap(baseaddr); -out_mtd: - kfree (this); -out: - return err; -} -module_init(board_init); - </programlisting> - </sect1> - <sect1 id="Exit_function"> - <title>Exit function</title> - <para> - The exit function is only necessary if the driver is - compiled as a module. It releases all resources which - are held by the chip driver and unregisters the partitions - in the MTD layer. - </para> - <programlisting> -#ifdef MODULE -static void __exit board_cleanup (void) -{ - /* Release resources, unregister device */ - nand_release (board_mtd); - - /* unmap physical address */ - iounmap(baseaddr); - - /* Free the MTD device structure */ - kfree (mtd_to_nand(board_mtd)); -} -module_exit(board_cleanup); -#endif - </programlisting> - </sect1> - </chapter> - - <chapter id="boarddriversadvanced"> - <title>Advanced board driver functions</title> - <para> - This chapter describes the advanced functionality of the NAND - driver. For a list of functions which can be overridden by the board - driver see the documentation of the nand_chip structure. - </para> - <sect1 id="Multiple_chip_control"> - <title>Multiple chip control</title> - <para> - The nand driver can control chip arrays. Therefore the - board driver must provide an own select_chip function. This - function must (de)select the requested chip. - The function pointer in the nand_chip structure must - be set before calling nand_scan(). The maxchip parameter - of nand_scan() defines the maximum number of chips to - scan for. Make sure that the select_chip function can - handle the requested number of chips. - </para> - <para> - The nand driver concatenates the chips to one virtual - chip and provides this virtual chip to the MTD layer. - </para> - <para> - <emphasis>Note: The driver can only handle linear chip arrays - of equally sized chips. There is no support for - parallel arrays which extend the buswidth.</emphasis> - </para> - <para> - <emphasis>GPIO based example</emphasis> - </para> - <programlisting> -static void board_select_chip (struct mtd_info *mtd, int chip) -{ - /* Deselect all chips, set all nCE pins high */ - GPIO(BOARD_NAND_NCE) |= 0xff; - if (chip >= 0) - GPIO(BOARD_NAND_NCE) &= ~ (1 << chip); -} - </programlisting> - <para> - <emphasis>Address lines based example.</emphasis> - Its assumed that the nCE pins are connected to an - address decoder. - </para> - <programlisting> -static void board_select_chip (struct mtd_info *mtd, int chip) -{ - struct nand_chip *this = mtd_to_nand(mtd); - - /* Deselect all chips */ - this->IO_ADDR_R &= ~BOARD_NAND_ADDR_MASK; - this->IO_ADDR_W &= ~BOARD_NAND_ADDR_MASK; - switch (chip) { - case 0: - this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIP0; - this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIP0; - break; - .... - case n: - this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIPn; - this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIPn; - break; - } -} - </programlisting> - </sect1> - <sect1 id="Hardware_ECC_support"> - <title>Hardware ECC support</title> - <sect2 id="Functions_and_constants"> - <title>Functions and constants</title> - <para> - The nand driver supports three different types of - hardware ECC. - <itemizedlist> - <listitem><para>NAND_ECC_HW3_256</para><para> - Hardware ECC generator providing 3 bytes ECC per - 256 byte. - </para> </listitem> - <listitem><para>NAND_ECC_HW3_512</para><para> - Hardware ECC generator providing 3 bytes ECC per - 512 byte. - </para> </listitem> - <listitem><para>NAND_ECC_HW6_512</para><para> - Hardware ECC generator providing 6 bytes ECC per - 512 byte. - </para> </listitem> - <listitem><para>NAND_ECC_HW8_512</para><para> - Hardware ECC generator providing 6 bytes ECC per - 512 byte. - </para> </listitem> - </itemizedlist> - If your hardware generator has a different functionality - add it at the appropriate place in nand_base.c - </para> - <para> - The board driver must provide following functions: - <itemizedlist> - <listitem><para>enable_hwecc</para><para> - This function is called before reading / writing to - the chip. Reset or initialize the hardware generator - in this function. The function is called with an - argument which let you distinguish between read - and write operations. - </para> </listitem> - <listitem><para>calculate_ecc</para><para> - This function is called after read / write from / to - the chip. Transfer the ECC from the hardware to - the buffer. If the option NAND_HWECC_SYNDROME is set - then the function is only called on write. See below. - </para> </listitem> - <listitem><para>correct_data</para><para> - In case of an ECC error this function is called for - error detection and correction. Return 1 respectively 2 - in case the error can be corrected. If the error is - not correctable return -1. If your hardware generator - matches the default algorithm of the nand_ecc software - generator then use the correction function provided - by nand_ecc instead of implementing duplicated code. - </para> </listitem> - </itemizedlist> - </para> - </sect2> - <sect2 id="Hardware_ECC_with_syndrome_calculation"> - <title>Hardware ECC with syndrome calculation</title> - <para> - Many hardware ECC implementations provide Reed-Solomon - codes and calculate an error syndrome on read. The syndrome - must be converted to a standard Reed-Solomon syndrome - before calling the error correction code in the generic - Reed-Solomon library. - </para> - <para> - The ECC bytes must be placed immediately after the data - bytes in order to make the syndrome generator work. This - is contrary to the usual layout used by software ECC. The - separation of data and out of band area is not longer - possible. The nand driver code handles this layout and - the remaining free bytes in the oob area are managed by - the autoplacement code. Provide a matching oob-layout - in this case. See rts_from4.c and diskonchip.c for - implementation reference. In those cases we must also - use bad block tables on FLASH, because the ECC layout is - interfering with the bad block marker positions. - See bad block table support for details. - </para> - </sect2> - </sect1> - <sect1 id="Bad_Block_table_support"> - <title>Bad block table support</title> - <para> - Most NAND chips mark the bad blocks at a defined - position in the spare area. Those blocks must - not be erased under any circumstances as the bad - block information would be lost. - It is possible to check the bad block mark each - time when the blocks are accessed by reading the - spare area of the first page in the block. This - is time consuming so a bad block table is used. - </para> - <para> - The nand driver supports various types of bad block - tables. - <itemizedlist> - <listitem><para>Per device</para><para> - The bad block table contains all bad block information - of the device which can consist of multiple chips. - </para> </listitem> - <listitem><para>Per chip</para><para> - A bad block table is used per chip and contains the - bad block information for this particular chip. - </para> </listitem> - <listitem><para>Fixed offset</para><para> - The bad block table is located at a fixed offset - in the chip (device). This applies to various - DiskOnChip devices. - </para> </listitem> - <listitem><para>Automatic placed</para><para> - The bad block table is automatically placed and - detected either at the end or at the beginning - of a chip (device) - </para> </listitem> - <listitem><para>Mirrored tables</para><para> - The bad block table is mirrored on the chip (device) to - allow updates of the bad block table without data loss. - </para> </listitem> - </itemizedlist> - </para> - <para> - nand_scan() calls the function nand_default_bbt(). - nand_default_bbt() selects appropriate default - bad block table descriptors depending on the chip information - which was retrieved by nand_scan(). - </para> - <para> - The standard policy is scanning the device for bad - blocks and build a ram based bad block table which - allows faster access than always checking the - bad block information on the flash chip itself. - </para> - <sect2 id="Flash_based_tables"> - <title>Flash based tables</title> - <para> - It may be desired or necessary to keep a bad block table in FLASH. - For AG-AND chips this is mandatory, as they have no factory marked - bad blocks. They have factory marked good blocks. The marker pattern - is erased when the block is erased to be reused. So in case of - powerloss before writing the pattern back to the chip this block - would be lost and added to the bad blocks. Therefore we scan the - chip(s) when we detect them the first time for good blocks and - store this information in a bad block table before erasing any - of the blocks. - </para> - <para> - The blocks in which the tables are stored are protected against - accidental access by marking them bad in the memory bad block - table. The bad block table management functions are allowed - to circumvent this protection. - </para> - <para> - The simplest way to activate the FLASH based bad block table support - is to set the option NAND_BBT_USE_FLASH in the bbt_option field of - the nand chip structure before calling nand_scan(). For AG-AND - chips is this done by default. - This activates the default FLASH based bad block table functionality - of the NAND driver. The default bad block table options are - <itemizedlist> - <listitem><para>Store bad block table per chip</para></listitem> - <listitem><para>Use 2 bits per block</para></listitem> - <listitem><para>Automatic placement at the end of the chip</para></listitem> - <listitem><para>Use mirrored tables with version numbers</para></listitem> - <listitem><para>Reserve 4 blocks at the end of the chip</para></listitem> - </itemizedlist> - </para> - </sect2> - <sect2 id="User_defined_tables"> - <title>User defined tables</title> - <para> - User defined tables are created by filling out a - nand_bbt_descr structure and storing the pointer in the - nand_chip structure member bbt_td before calling nand_scan(). - If a mirror table is necessary a second structure must be - created and a pointer to this structure must be stored - in bbt_md inside the nand_chip structure. If the bbt_md - member is set to NULL then only the main table is used - and no scan for the mirrored table is performed. - </para> - <para> - The most important field in the nand_bbt_descr structure - is the options field. The options define most of the - table properties. Use the predefined constants from - nand.h to define the options. - <itemizedlist> - <listitem><para>Number of bits per block</para> - <para>The supported number of bits is 1, 2, 4, 8.</para></listitem> - <listitem><para>Table per chip</para> - <para>Setting the constant NAND_BBT_PERCHIP selects that - a bad block table is managed for each chip in a chip array. - If this option is not set then a per device bad block table - is used.</para></listitem> - <listitem><para>Table location is absolute</para> - <para>Use the option constant NAND_BBT_ABSPAGE and - define the absolute page number where the bad block - table starts in the field pages. If you have selected bad block - tables per chip and you have a multi chip array then the start page - must be given for each chip in the chip array. Note: there is no scan - for a table ident pattern performed, so the fields - pattern, veroffs, offs, len can be left uninitialized</para></listitem> - <listitem><para>Table location is automatically detected</para> - <para>The table can either be located in the first or the last good - blocks of the chip (device). Set NAND_BBT_LASTBLOCK to place - the bad block table at the end of the chip (device). The - bad block tables are marked and identified by a pattern which - is stored in the spare area of the first page in the block which - holds the bad block table. Store a pointer to the pattern - in the pattern field. Further the length of the pattern has to be - stored in len and the offset in the spare area must be given - in the offs member of the nand_bbt_descr structure. For mirrored - bad block tables different patterns are mandatory.</para></listitem> - <listitem><para>Table creation</para> - <para>Set the option NAND_BBT_CREATE to enable the table creation - if no table can be found during the scan. Usually this is done only - once if a new chip is found. </para></listitem> - <listitem><para>Table write support</para> - <para>Set the option NAND_BBT_WRITE to enable the table write support. - This allows the update of the bad block table(s) in case a block has - to be marked bad due to wear. The MTD interface function block_markbad - is calling the update function of the bad block table. If the write - support is enabled then the table is updated on FLASH.</para> - <para> - Note: Write support should only be enabled for mirrored tables with - version control. - </para></listitem> - <listitem><para>Table version control</para> - <para>Set the option NAND_BBT_VERSION to enable the table version control. - It's highly recommended to enable this for mirrored tables with write - support. It makes sure that the risk of losing the bad block - table information is reduced to the loss of the information about the - one worn out block which should be marked bad. The version is stored in - 4 consecutive bytes in the spare area of the device. The position of - the version number is defined by the member veroffs in the bad block table - descriptor.</para></listitem> - <listitem><para>Save block contents on write</para> - <para> - In case that the block which holds the bad block table does contain - other useful information, set the option NAND_BBT_SAVECONTENT. When - the bad block table is written then the whole block is read the bad - block table is updated and the block is erased and everything is - written back. If this option is not set only the bad block table - is written and everything else in the block is ignored and erased. - </para></listitem> - <listitem><para>Number of reserved blocks</para> - <para> - For automatic placement some blocks must be reserved for - bad block table storage. The number of reserved blocks is defined - in the maxblocks member of the bad block table description structure. - Reserving 4 blocks for mirrored tables should be a reasonable number. - This also limits the number of blocks which are scanned for the bad - block table ident pattern. - </para></listitem> - </itemizedlist> - </para> - </sect2> - </sect1> - <sect1 id="Spare_area_placement"> - <title>Spare area (auto)placement</title> - <para> - The nand driver implements different possibilities for - placement of filesystem data in the spare area, - <itemizedlist> - <listitem><para>Placement defined by fs driver</para></listitem> - <listitem><para>Automatic placement</para></listitem> - </itemizedlist> - The default placement function is automatic placement. The - nand driver has built in default placement schemes for the - various chiptypes. If due to hardware ECC functionality the - default placement does not fit then the board driver can - provide a own placement scheme. - </para> - <para> - File system drivers can provide a own placement scheme which - is used instead of the default placement scheme. - </para> - <para> - Placement schemes are defined by a nand_oobinfo structure - <programlisting> -struct nand_oobinfo { - int useecc; - int eccbytes; - int eccpos[24]; - int oobfree[8][2]; -}; - </programlisting> - <itemizedlist> - <listitem><para>useecc</para><para> - The useecc member controls the ecc and placement function. The header - file include/mtd/mtd-abi.h contains constants to select ecc and - placement. MTD_NANDECC_OFF switches off the ecc complete. This is - not recommended and available for testing and diagnosis only. - MTD_NANDECC_PLACE selects caller defined placement, MTD_NANDECC_AUTOPLACE - selects automatic placement. - </para></listitem> - <listitem><para>eccbytes</para><para> - The eccbytes member defines the number of ecc bytes per page. - </para></listitem> - <listitem><para>eccpos</para><para> - The eccpos array holds the byte offsets in the spare area where - the ecc codes are placed. - </para></listitem> - <listitem><para>oobfree</para><para> - The oobfree array defines the areas in the spare area which can be - used for automatic placement. The information is given in the format - {offset, size}. offset defines the start of the usable area, size the - length in bytes. More than one area can be defined. The list is terminated - by an {0, 0} entry. - </para></listitem> - </itemizedlist> - </para> - <sect2 id="Placement_defined_by_fs_driver"> - <title>Placement defined by fs driver</title> - <para> - The calling function provides a pointer to a nand_oobinfo - structure which defines the ecc placement. For writes the - caller must provide a spare area buffer along with the - data buffer. The spare area buffer size is (number of pages) * - (size of spare area). For reads the buffer size is - (number of pages) * ((size of spare area) + (number of ecc - steps per page) * sizeof (int)). The driver stores the - result of the ecc check for each tuple in the spare buffer. - The storage sequence is - </para> - <para> - <spare data page 0><ecc result 0>...<ecc result n> - </para> - <para> - ... - </para> - <para> - <spare data page n><ecc result 0>...<ecc result n> - </para> - <para> - This is a legacy mode used by YAFFS1. - </para> - <para> - If the spare area buffer is NULL then only the ECC placement is - done according to the given scheme in the nand_oobinfo structure. - </para> - </sect2> - <sect2 id="Automatic_placement"> - <title>Automatic placement</title> - <para> - Automatic placement uses the built in defaults to place the - ecc bytes in the spare area. If filesystem data have to be stored / - read into the spare area then the calling function must provide a - buffer. The buffer size per page is determined by the oobfree array in - the nand_oobinfo structure. - </para> - <para> - If the spare area buffer is NULL then only the ECC placement is - done according to the default builtin scheme. - </para> - </sect2> - </sect1> - <sect1 id="Spare_area_autoplacement_default"> - <title>Spare area autoplacement default schemes</title> - <sect2 id="pagesize_256"> - <title>256 byte pagesize</title> -<informaltable><tgroup cols="3"><tbody> -<row> -<entry>Offset</entry> -<entry>Content</entry> -<entry>Comment</entry> -</row> -<row> -<entry>0x00</entry> -<entry>ECC byte 0</entry> -<entry>Error correction code byte 0</entry> -</row> -<row> -<entry>0x01</entry> -<entry>ECC byte 1</entry> -<entry>Error correction code byte 1</entry> -</row> -<row> -<entry>0x02</entry> -<entry>ECC byte 2</entry> -<entry>Error correction code byte 2</entry> -</row> -<row> -<entry>0x03</entry> -<entry>Autoplace 0</entry> -<entry></entry> -</row> -<row> -<entry>0x04</entry> -<entry>Autoplace 1</entry> -<entry></entry> -</row> -<row> -<entry>0x05</entry> -<entry>Bad block marker</entry> -<entry>If any bit in this byte is zero, then this block is bad. -This applies only to the first page in a block. In the remaining -pages this byte is reserved</entry> -</row> -<row> -<entry>0x06</entry> -<entry>Autoplace 2</entry> -<entry></entry> -</row> -<row> -<entry>0x07</entry> -<entry>Autoplace 3</entry> -<entry></entry> -</row> -</tbody></tgroup></informaltable> - </sect2> - <sect2 id="pagesize_512"> - <title>512 byte pagesize</title> -<informaltable><tgroup cols="3"><tbody> -<row> -<entry>Offset</entry> -<entry>Content</entry> -<entry>Comment</entry> -</row> -<row> -<entry>0x00</entry> -<entry>ECC byte 0</entry> -<entry>Error correction code byte 0 of the lower 256 Byte data in -this page</entry> -</row> -<row> -<entry>0x01</entry> -<entry>ECC byte 1</entry> -<entry>Error correction code byte 1 of the lower 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x02</entry> -<entry>ECC byte 2</entry> -<entry>Error correction code byte 2 of the lower 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x03</entry> -<entry>ECC byte 3</entry> -<entry>Error correction code byte 0 of the upper 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x04</entry> -<entry>reserved</entry> -<entry>reserved</entry> -</row> -<row> -<entry>0x05</entry> -<entry>Bad block marker</entry> -<entry>If any bit in this byte is zero, then this block is bad. -This applies only to the first page in a block. In the remaining -pages this byte is reserved</entry> -</row> -<row> -<entry>0x06</entry> -<entry>ECC byte 4</entry> -<entry>Error correction code byte 1 of the upper 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x07</entry> -<entry>ECC byte 5</entry> -<entry>Error correction code byte 2 of the upper 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x08 - 0x0F</entry> -<entry>Autoplace 0 - 7</entry> -<entry></entry> -</row> -</tbody></tgroup></informaltable> - </sect2> - <sect2 id="pagesize_2048"> - <title>2048 byte pagesize</title> -<informaltable><tgroup cols="3"><tbody> -<row> -<entry>Offset</entry> -<entry>Content</entry> -<entry>Comment</entry> -</row> -<row> -<entry>0x00</entry> -<entry>Bad block marker</entry> -<entry>If any bit in this byte is zero, then this block is bad. -This applies only to the first page in a block. In the remaining -pages this byte is reserved</entry> -</row> -<row> -<entry>0x01</entry> -<entry>Reserved</entry> -<entry>Reserved</entry> -</row> -<row> -<entry>0x02-0x27</entry> -<entry>Autoplace 0 - 37</entry> -<entry></entry> -</row> -<row> -<entry>0x28</entry> -<entry>ECC byte 0</entry> -<entry>Error correction code byte 0 of the first 256 Byte data in -this page</entry> -</row> -<row> -<entry>0x29</entry> -<entry>ECC byte 1</entry> -<entry>Error correction code byte 1 of the first 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x2A</entry> -<entry>ECC byte 2</entry> -<entry>Error correction code byte 2 of the first 256 Bytes data in -this page</entry> -</row> -<row> -<entry>0x2B</entry> -<entry>ECC byte 3</entry> -<entry>Error correction code byte 0 of the second 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x2C</entry> -<entry>ECC byte 4</entry> -<entry>Error correction code byte 1 of the second 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x2D</entry> -<entry>ECC byte 5</entry> -<entry>Error correction code byte 2 of the second 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x2E</entry> -<entry>ECC byte 6</entry> -<entry>Error correction code byte 0 of the third 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x2F</entry> -<entry>ECC byte 7</entry> -<entry>Error correction code byte 1 of the third 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x30</entry> -<entry>ECC byte 8</entry> -<entry>Error correction code byte 2 of the third 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x31</entry> -<entry>ECC byte 9</entry> -<entry>Error correction code byte 0 of the fourth 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x32</entry> -<entry>ECC byte 10</entry> -<entry>Error correction code byte 1 of the fourth 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x33</entry> -<entry>ECC byte 11</entry> -<entry>Error correction code byte 2 of the fourth 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x34</entry> -<entry>ECC byte 12</entry> -<entry>Error correction code byte 0 of the fifth 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x35</entry> -<entry>ECC byte 13</entry> -<entry>Error correction code byte 1 of the fifth 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x36</entry> -<entry>ECC byte 14</entry> -<entry>Error correction code byte 2 of the fifth 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x37</entry> -<entry>ECC byte 15</entry> -<entry>Error correction code byte 0 of the sixt 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x38</entry> -<entry>ECC byte 16</entry> -<entry>Error correction code byte 1 of the sixt 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x39</entry> -<entry>ECC byte 17</entry> -<entry>Error correction code byte 2 of the sixt 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x3A</entry> -<entry>ECC byte 18</entry> -<entry>Error correction code byte 0 of the seventh 256 Bytes of -data in this page</entry> -</row> -<row> -<entry>0x3B</entry> -<entry>ECC byte 19</entry> -<entry>Error correction code byte 1 of the seventh 256 Bytes of -data in this page</entry> -</row> -<row> -<entry>0x3C</entry> -<entry>ECC byte 20</entry> -<entry>Error correction code byte 2 of the seventh 256 Bytes of -data in this page</entry> -</row> -<row> -<entry>0x3D</entry> -<entry>ECC byte 21</entry> -<entry>Error correction code byte 0 of the eighth 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x3E</entry> -<entry>ECC byte 22</entry> -<entry>Error correction code byte 1 of the eighth 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x3F</entry> -<entry>ECC byte 23</entry> -<entry>Error correction code byte 2 of the eighth 256 Bytes of data -in this page</entry> -</row> -</tbody></tgroup></informaltable> - </sect2> - </sect1> - </chapter> - - <chapter id="filesystems"> - <title>Filesystem support</title> - <para> - The NAND driver provides all necessary functions for a - filesystem via the MTD interface. - </para> - <para> - Filesystems must be aware of the NAND peculiarities and - restrictions. One major restrictions of NAND Flash is, that you cannot - write as often as you want to a page. The consecutive writes to a page, - before erasing it again, are restricted to 1-3 writes, depending on the - manufacturers specifications. This applies similar to the spare area. - </para> - <para> - Therefore NAND aware filesystems must either write in page size chunks - or hold a writebuffer to collect smaller writes until they sum up to - pagesize. Available NAND aware filesystems: JFFS2, YAFFS. - </para> - <para> - The spare area usage to store filesystem data is controlled by - the spare area placement functionality which is described in one - of the earlier chapters. - </para> - </chapter> - <chapter id="tools"> - <title>Tools</title> - <para> - The MTD project provides a couple of helpful tools to handle NAND Flash. - <itemizedlist> - <listitem><para>flasherase, flasheraseall: Erase and format FLASH partitions</para></listitem> - <listitem><para>nandwrite: write filesystem images to NAND FLASH</para></listitem> - <listitem><para>nanddump: dump the contents of a NAND FLASH partitions</para></listitem> - </itemizedlist> - </para> - <para> - These tools are aware of the NAND restrictions. Please use those tools - instead of complaining about errors which are caused by non NAND aware - access methods. - </para> - </chapter> - - <chapter id="defines"> - <title>Constants</title> - <para> - This chapter describes the constants which might be relevant for a driver developer. - </para> - <sect1 id="Chip_option_constants"> - <title>Chip option constants</title> - <sect2 id="Constants_for_chip_id_table"> - <title>Constants for chip id table</title> - <para> - These constants are defined in nand.h. They are ored together to describe - the chip functionality. - <programlisting> -/* Buswitdh is 16 bit */ -#define NAND_BUSWIDTH_16 0x00000002 -/* Device supports partial programming without padding */ -#define NAND_NO_PADDING 0x00000004 -/* Chip has cache program function */ -#define NAND_CACHEPRG 0x00000008 -/* Chip has copy back function */ -#define NAND_COPYBACK 0x00000010 -/* AND Chip which has 4 banks and a confusing page / block - * assignment. See Renesas datasheet for further information */ -#define NAND_IS_AND 0x00000020 -/* Chip has a array of 4 pages which can be read without - * additional ready /busy waits */ -#define NAND_4PAGE_ARRAY 0x00000040 - </programlisting> - </para> - </sect2> - <sect2 id="Constants_for_runtime_options"> - <title>Constants for runtime options</title> - <para> - These constants are defined in nand.h. They are ored together to describe - the functionality. - <programlisting> -/* The hw ecc generator provides a syndrome instead a ecc value on read - * This can only work if we have the ecc bytes directly behind the - * data bytes. Applies for DOC and AG-AND Renesas HW Reed Solomon generators */ -#define NAND_HWECC_SYNDROME 0x00020000 - </programlisting> - </para> - </sect2> - </sect1> - - <sect1 id="EEC_selection_constants"> - <title>ECC selection constants</title> - <para> - Use these constants to select the ECC algorithm. - <programlisting> -/* No ECC. Usage is not recommended ! */ -#define NAND_ECC_NONE 0 -/* Software ECC 3 byte ECC per 256 Byte data */ -#define NAND_ECC_SOFT 1 -/* Hardware ECC 3 byte ECC per 256 Byte data */ -#define NAND_ECC_HW3_256 2 -/* Hardware ECC 3 byte ECC per 512 Byte data */ -#define NAND_ECC_HW3_512 3 -/* Hardware ECC 6 byte ECC per 512 Byte data */ -#define NAND_ECC_HW6_512 4 -/* Hardware ECC 6 byte ECC per 512 Byte data */ -#define NAND_ECC_HW8_512 6 - </programlisting> - </para> - </sect1> - - <sect1 id="Hardware_control_related_constants"> - <title>Hardware control related constants</title> - <para> - These constants describe the requested hardware access function when - the boardspecific hardware control function is called - <programlisting> -/* Select the chip by setting nCE to low */ -#define NAND_CTL_SETNCE 1 -/* Deselect the chip by setting nCE to high */ -#define NAND_CTL_CLRNCE 2 -/* Select the command latch by setting CLE to high */ -#define NAND_CTL_SETCLE 3 -/* Deselect the command latch by setting CLE to low */ -#define NAND_CTL_CLRCLE 4 -/* Select the address latch by setting ALE to high */ -#define NAND_CTL_SETALE 5 -/* Deselect the address latch by setting ALE to low */ -#define NAND_CTL_CLRALE 6 -/* Set write protection by setting WP to high. Not used! */ -#define NAND_CTL_SETWP 7 -/* Clear write protection by setting WP to low. Not used! */ -#define NAND_CTL_CLRWP 8 - </programlisting> - </para> - </sect1> - - <sect1 id="Bad_block_table_constants"> - <title>Bad block table related constants</title> - <para> - These constants describe the options used for bad block - table descriptors. - <programlisting> -/* Options for the bad block table descriptors */ - -/* The number of bits used per block in the bbt on the device */ -#define NAND_BBT_NRBITS_MSK 0x0000000F -#define NAND_BBT_1BIT 0x00000001 -#define NAND_BBT_2BIT 0x00000002 -#define NAND_BBT_4BIT 0x00000004 -#define NAND_BBT_8BIT 0x00000008 -/* The bad block table is in the last good block of the device */ -#define NAND_BBT_LASTBLOCK 0x00000010 -/* The bbt is at the given page, else we must scan for the bbt */ -#define NAND_BBT_ABSPAGE 0x00000020 -/* bbt is stored per chip on multichip devices */ -#define NAND_BBT_PERCHIP 0x00000080 -/* bbt has a version counter at offset veroffs */ -#define NAND_BBT_VERSION 0x00000100 -/* Create a bbt if none axists */ -#define NAND_BBT_CREATE 0x00000200 -/* Write bbt if necessary */ -#define NAND_BBT_WRITE 0x00001000 -/* Read and write back block contents when writing bbt */ -#define NAND_BBT_SAVECONTENT 0x00002000 - </programlisting> - </para> - </sect1> - - </chapter> - - <chapter id="structs"> - <title>Structures</title> - <para> - This chapter contains the autogenerated documentation of the structures which are - used in the NAND driver and might be relevant for a driver developer. Each - struct member has a short description which is marked with an [XXX] identifier. - See the chapter "Documentation hints" for an explanation. - </para> -!Iinclude/linux/mtd/nand.h - </chapter> - - <chapter id="pubfunctions"> - <title>Public Functions Provided</title> - <para> - This chapter contains the autogenerated documentation of the NAND kernel API functions - which are exported. Each function has a short description which is marked with an [XXX] identifier. - See the chapter "Documentation hints" for an explanation. - </para> -!Edrivers/mtd/nand/nand_base.c -!Edrivers/mtd/nand/nand_bbt.c -!Edrivers/mtd/nand/nand_ecc.c - </chapter> - - <chapter id="intfunctions"> - <title>Internal Functions Provided</title> - <para> - This chapter contains the autogenerated documentation of the NAND driver internal functions. - Each function has a short description which is marked with an [XXX] identifier. - See the chapter "Documentation hints" for an explanation. - The functions marked with [DEFAULT] might be relevant for a board driver developer. - </para> -!Idrivers/mtd/nand/nand_base.c -!Idrivers/mtd/nand/nand_bbt.c -<!-- No internal functions for kernel-doc: -X!Idrivers/mtd/nand/nand_ecc.c ---> - </chapter> - - <chapter id="credits"> - <title>Credits</title> - <para> - The following people have contributed to the NAND driver: - <orderedlist> - <listitem><para>Steven J. Hill<email>sjhill@realitydiluted.com</email></para></listitem> - <listitem><para>David Woodhouse<email>dwmw2@infradead.org</email></para></listitem> - <listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem> - </orderedlist> - A lot of users have provided bugfixes, improvements and helping hands for testing. - Thanks a lot. - </para> - <para> - The following people have contributed to this document: - <orderedlist> - <listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem> - </orderedlist> - </para> - </chapter> -</book> diff --git a/Documentation/DocBook/networking.tmpl b/Documentation/DocBook/networking.tmpl deleted file mode 100644 index 29df25016c7c..000000000000 --- a/Documentation/DocBook/networking.tmpl +++ /dev/null @@ -1,111 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="LinuxNetworking"> - <bookinfo> - <title>Linux Networking and Network Devices APIs</title> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="netcore"> - <title>Linux Networking</title> - <sect1><title>Networking Base Types</title> -!Iinclude/linux/net.h - </sect1> - <sect1><title>Socket Buffer Functions</title> -!Iinclude/linux/skbuff.h -!Iinclude/net/sock.h -!Enet/socket.c -!Enet/core/skbuff.c -!Enet/core/sock.c -!Enet/core/datagram.c -!Enet/core/stream.c - </sect1> - <sect1><title>Socket Filter</title> -!Enet/core/filter.c - </sect1> - <sect1><title>Generic Network Statistics</title> -!Iinclude/uapi/linux/gen_stats.h -!Enet/core/gen_stats.c -!Enet/core/gen_estimator.c - </sect1> - <sect1><title>SUN RPC subsystem</title> -<!-- The !D functionality is not perfect, garbage has to be protected by comments -!Dnet/sunrpc/sunrpc_syms.c ---> -!Enet/sunrpc/xdr.c -!Enet/sunrpc/svc_xprt.c -!Enet/sunrpc/xprt.c -!Enet/sunrpc/sched.c -!Enet/sunrpc/socklib.c -!Enet/sunrpc/stats.c -!Enet/sunrpc/rpc_pipe.c -!Enet/sunrpc/rpcb_clnt.c -!Enet/sunrpc/clnt.c - </sect1> - <sect1><title>WiMAX</title> -!Enet/wimax/op-msg.c -!Enet/wimax/op-reset.c -!Enet/wimax/op-rfkill.c -!Enet/wimax/stack.c -!Iinclude/net/wimax.h -!Iinclude/uapi/linux/wimax.h - </sect1> - </chapter> - - <chapter id="netdev"> - <title>Network device support</title> - <sect1><title>Driver Support</title> -!Enet/core/dev.c -!Enet/ethernet/eth.c -!Enet/sched/sch_generic.c -!Iinclude/linux/etherdevice.h -!Iinclude/linux/netdevice.h - </sect1> - <sect1><title>PHY Support</title> -!Edrivers/net/phy/phy.c -!Idrivers/net/phy/phy.c -!Edrivers/net/phy/phy_device.c -!Idrivers/net/phy/phy_device.c -!Edrivers/net/phy/mdio_bus.c -!Idrivers/net/phy/mdio_bus.c - </sect1> -<!-- FIXME: Removed for now since no structured comments in source - <sect1><title>Wireless</title> -X!Enet/core/wireless.c - </sect1> ---> - </chapter> - -</book> diff --git a/Documentation/DocBook/rapidio.tmpl b/Documentation/DocBook/rapidio.tmpl deleted file mode 100644 index ac3cca3399a1..000000000000 --- a/Documentation/DocBook/rapidio.tmpl +++ /dev/null @@ -1,155 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" [ - <!ENTITY rapidio SYSTEM "rapidio.xml"> - ]> - -<book id="RapidIO-Guide"> - <bookinfo> - <title>RapidIO Subsystem Guide</title> - - <authorgroup> - <author> - <firstname>Matt</firstname> - <surname>Porter</surname> - <affiliation> - <address> - <email>mporter@kernel.crashing.org</email> - <email>mporter@mvista.com</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2005</year> - <holder>MontaVista Software, Inc.</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2 as published by the Free Software Foundation. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="intro"> - <title>Introduction</title> - <para> - RapidIO is a high speed switched fabric interconnect with - features aimed at the embedded market. RapidIO provides - support for memory-mapped I/O as well as message-based - transactions over the switched fabric network. RapidIO has - a standardized discovery mechanism not unlike the PCI bus - standard that allows simple detection of devices in a - network. - </para> - <para> - This documentation is provided for developers intending - to support RapidIO on new architectures, write new drivers, - or to understand the subsystem internals. - </para> - </chapter> - - <chapter id="bugs"> - <title>Known Bugs and Limitations</title> - - <sect1 id="known_bugs"> - <title>Bugs</title> - <para>None. ;)</para> - </sect1> - <sect1 id="Limitations"> - <title>Limitations</title> - <para> - <orderedlist> - <listitem><para>Access/management of RapidIO memory regions is not supported</para></listitem> - <listitem><para>Multiple host enumeration is not supported</para></listitem> - </orderedlist> - </para> - </sect1> - </chapter> - - <chapter id="drivers"> - <title>RapidIO driver interface</title> - <para> - Drivers are provided a set of calls in order - to interface with the subsystem to gather info - on devices, request/map memory region resources, - and manage mailboxes/doorbells. - </para> - <sect1 id="Functions"> - <title>Functions</title> -!Iinclude/linux/rio_drv.h -!Edrivers/rapidio/rio-driver.c -!Edrivers/rapidio/rio.c - </sect1> - </chapter> - - <chapter id="internals"> - <title>Internals</title> - - <para> - This chapter contains the autogenerated documentation of the RapidIO - subsystem. - </para> - - <sect1 id="Structures"><title>Structures</title> -!Iinclude/linux/rio.h - </sect1> - <sect1 id="Enumeration_and_Discovery"><title>Enumeration and Discovery</title> -!Idrivers/rapidio/rio-scan.c - </sect1> - <sect1 id="Driver_functionality"><title>Driver functionality</title> -!Idrivers/rapidio/rio.c -!Idrivers/rapidio/rio-access.c - </sect1> - <sect1 id="Device_model_support"><title>Device model support</title> -!Idrivers/rapidio/rio-driver.c - </sect1> - <sect1 id="PPC32_support"><title>PPC32 support</title> -!Iarch/powerpc/sysdev/fsl_rio.c - </sect1> - </chapter> - - <chapter id="credits"> - <title>Credits</title> - <para> - The following people have contributed to the RapidIO - subsystem directly or indirectly: - <orderedlist> - <listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem> - <listitem><para>Randy Vinson<email>rvinson@mvista.com</email></para></listitem> - <listitem><para>Dan Malek<email>dan@embeddedalley.com</email></para></listitem> - </orderedlist> - </para> - <para> - The following people have contributed to this document: - <orderedlist> - <listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem> - </orderedlist> - </para> - </chapter> -</book> diff --git a/Documentation/DocBook/s390-drivers.tmpl b/Documentation/DocBook/s390-drivers.tmpl deleted file mode 100644 index 95bfc12e5439..000000000000 --- a/Documentation/DocBook/s390-drivers.tmpl +++ /dev/null @@ -1,161 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="s390drivers"> - <bookinfo> - <title>Writing s390 channel device drivers</title> - - <authorgroup> - <author> - <firstname>Cornelia</firstname> - <surname>Huck</surname> - <affiliation> - <address> - <email>cornelia.huck@de.ibm.com</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2007</year> - <holder>IBM Corp.</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="intro"> - <title>Introduction</title> - <para> - This document describes the interfaces available for device drivers that - drive s390 based channel attached I/O devices. This includes interfaces for - interaction with the hardware and interfaces for interacting with the - common driver core. Those interfaces are provided by the s390 common I/O - layer. - </para> - <para> - The document assumes a familarity with the technical terms associated - with the s390 channel I/O architecture. For a description of this - architecture, please refer to the "z/Architecture: Principles of - Operation", IBM publication no. SA22-7832. - </para> - <para> - While most I/O devices on a s390 system are typically driven through the - channel I/O mechanism described here, there are various other methods - (like the diag interface). These are out of the scope of this document. - </para> - <para> - Some additional information can also be found in the kernel source - under Documentation/s390/driver-model.txt. - </para> - </chapter> - <chapter id="ccw"> - <title>The ccw bus</title> - <para> - The ccw bus typically contains the majority of devices available to - a s390 system. Named after the channel command word (ccw), the basic - command structure used to address its devices, the ccw bus contains - so-called channel attached devices. They are addressed via I/O - subchannels, visible on the css bus. A device driver for - channel-attached devices, however, will never interact with the - subchannel directly, but only via the I/O device on the ccw bus, - the ccw device. - </para> - <sect1 id="channelIO"> - <title>I/O functions for channel-attached devices</title> - <para> - Some hardware structures have been translated into C structures for use - by the common I/O layer and device drivers. For more information on - the hardware structures represented here, please consult the Principles - of Operation. - </para> -!Iarch/s390/include/asm/cio.h - </sect1> - <sect1 id="ccwdev"> - <title>ccw devices</title> - <para> - Devices that want to initiate channel I/O need to attach to the ccw bus. - Interaction with the driver core is done via the common I/O layer, which - provides the abstractions of ccw devices and ccw device drivers. - </para> - <para> - The functions that initiate or terminate channel I/O all act upon a - ccw device structure. Device drivers must not bypass those functions - or strange side effects may happen. - </para> -!Iarch/s390/include/asm/ccwdev.h -!Edrivers/s390/cio/device.c -!Edrivers/s390/cio/device_ops.c - </sect1> - <sect1 id="cmf"> - <title>The channel-measurement facility</title> - <para> - The channel-measurement facility provides a means to collect - measurement data which is made available by the channel subsystem - for each channel attached device. - </para> -!Iarch/s390/include/asm/cmb.h -!Edrivers/s390/cio/cmf.c - </sect1> - </chapter> - - <chapter id="ccwgroup"> - <title>The ccwgroup bus</title> - <para> - The ccwgroup bus only contains artificial devices, created by the user. - Many networking devices (e.g. qeth) are in fact composed of several - ccw devices (like read, write and data channel for qeth). The - ccwgroup bus provides a mechanism to create a meta-device which - contains those ccw devices as slave devices and can be associated - with the netdevice. - </para> - <sect1 id="ccwgroupdevices"> - <title>ccw group devices</title> -!Iarch/s390/include/asm/ccwgroup.h -!Edrivers/s390/cio/ccwgroup.c - </sect1> - </chapter> - - <chapter id="genericinterfaces"> - <title>Generic interfaces</title> - <para> - Some interfaces are available to other drivers that do not necessarily - have anything to do with the busses described above, but still are - indirectly using basic infrastructure in the common I/O layer. - One example is the support for adapter interrupts. - </para> -!Edrivers/s390/cio/airq.c - </chapter> - -</book> diff --git a/Documentation/DocBook/scsi.tmpl b/Documentation/DocBook/scsi.tmpl deleted file mode 100644 index 4b9b9b286cea..000000000000 --- a/Documentation/DocBook/scsi.tmpl +++ /dev/null @@ -1,409 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="scsimid"> - <bookinfo> - <title>SCSI Interfaces Guide</title> - - <authorgroup> - <author> - <firstname>James</firstname> - <surname>Bottomley</surname> - <affiliation> - <address> - <email>James.Bottomley@hansenpartnership.com</email> - </address> - </affiliation> - </author> - - <author> - <firstname>Rob</firstname> - <surname>Landley</surname> - <affiliation> - <address> - <email>rob@landley.net</email> - </address> - </affiliation> - </author> - - </authorgroup> - - <copyright> - <year>2007</year> - <holder>Linux Foundation</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - - <toc></toc> - - <chapter id="intro"> - <title>Introduction</title> - <sect1 id="protocol_vs_bus"> - <title>Protocol vs bus</title> - <para> - Once upon a time, the Small Computer Systems Interface defined both - a parallel I/O bus and a data protocol to connect a wide variety of - peripherals (disk drives, tape drives, modems, printers, scanners, - optical drives, test equipment, and medical devices) to a host - computer. - </para> - <para> - Although the old parallel (fast/wide/ultra) SCSI bus has largely - fallen out of use, the SCSI command set is more widely used than ever - to communicate with devices over a number of different busses. - </para> - <para> - The <ulink url='http://www.t10.org/scsi-3.htm'>SCSI protocol</ulink> - is a big-endian peer-to-peer packet based protocol. SCSI commands - are 6, 10, 12, or 16 bytes long, often followed by an associated data - payload. - </para> - <para> - SCSI commands can be transported over just about any kind of bus, and - are the default protocol for storage devices attached to USB, SATA, - SAS, Fibre Channel, FireWire, and ATAPI devices. SCSI packets are - also commonly exchanged over Infiniband, - <ulink url='http://i2o.shadowconnect.com/faq.php'>I20</ulink>, TCP/IP - (<ulink url='https://en.wikipedia.org/wiki/ISCSI'>iSCSI</ulink>), even - <ulink url='http://cyberelk.net/tim/parport/parscsi.html'>Parallel - ports</ulink>. - </para> - </sect1> - <sect1 id="subsystem_design"> - <title>Design of the Linux SCSI subsystem</title> - <para> - The SCSI subsystem uses a three layer design, with upper, mid, and low - layers. Every operation involving the SCSI subsystem (such as reading - a sector from a disk) uses one driver at each of the 3 levels: one - upper layer driver, one lower layer driver, and the SCSI midlayer. - </para> - <para> - The SCSI upper layer provides the interface between userspace and the - kernel, in the form of block and char device nodes for I/O and - ioctl(). The SCSI lower layer contains drivers for specific hardware - devices. - </para> - <para> - In between is the SCSI mid-layer, analogous to a network routing - layer such as the IPv4 stack. The SCSI mid-layer routes a packet - based data protocol between the upper layer's /dev nodes and the - corresponding devices in the lower layer. It manages command queues, - provides error handling and power management functions, and responds - to ioctl() requests. - </para> - </sect1> - </chapter> - - <chapter id="upper_layer"> - <title>SCSI upper layer</title> - <para> - The upper layer supports the user-kernel interface by providing - device nodes. - </para> - <sect1 id="sd"> - <title>sd (SCSI Disk)</title> - <para>sd (sd_mod.o)</para> -<!-- !Idrivers/scsi/sd.c --> - </sect1> - <sect1 id="sr"> - <title>sr (SCSI CD-ROM)</title> - <para>sr (sr_mod.o)</para> - </sect1> - <sect1 id="st"> - <title>st (SCSI Tape)</title> - <para>st (st.o)</para> - </sect1> - <sect1 id="sg"> - <title>sg (SCSI Generic)</title> - <para>sg (sg.o)</para> - </sect1> - <sect1 id="ch"> - <title>ch (SCSI Media Changer)</title> - <para>ch (ch.c)</para> - </sect1> - </chapter> - - <chapter id="mid_layer"> - <title>SCSI mid layer</title> - - <sect1 id="midlayer_implementation"> - <title>SCSI midlayer implementation</title> - <sect2 id="scsi_device.h"> - <title>include/scsi/scsi_device.h</title> - <para> - </para> -!Iinclude/scsi/scsi_device.h - </sect2> - - <sect2 id="scsi.c"> - <title>drivers/scsi/scsi.c</title> - <para>Main file for the SCSI midlayer.</para> -!Edrivers/scsi/scsi.c - </sect2> - <sect2 id="scsicam.c"> - <title>drivers/scsi/scsicam.c</title> - <para> - <ulink url='http://www.t10.org/ftp/t10/drafts/cam/cam-r12b.pdf'>SCSI - Common Access Method</ulink> support functions, for use with - HDIO_GETGEO, etc. - </para> -!Edrivers/scsi/scsicam.c - </sect2> - <sect2 id="scsi_error.c"> - <title>drivers/scsi/scsi_error.c</title> - <para>Common SCSI error/timeout handling routines.</para> -!Edrivers/scsi/scsi_error.c - </sect2> - <sect2 id="scsi_devinfo.c"> - <title>drivers/scsi/scsi_devinfo.c</title> - <para> - Manage scsi_dev_info_list, which tracks blacklisted and whitelisted - devices. - </para> -!Idrivers/scsi/scsi_devinfo.c - </sect2> - <sect2 id="scsi_ioctl.c"> - <title>drivers/scsi/scsi_ioctl.c</title> - <para> - Handle ioctl() calls for SCSI devices. - </para> -!Edrivers/scsi/scsi_ioctl.c - </sect2> - <sect2 id="scsi_lib.c"> - <title>drivers/scsi/scsi_lib.c</title> - <para> - SCSI queuing library. - </para> -!Edrivers/scsi/scsi_lib.c - </sect2> - <sect2 id="scsi_lib_dma.c"> - <title>drivers/scsi/scsi_lib_dma.c</title> - <para> - SCSI library functions depending on DMA - (map and unmap scatter-gather lists). - </para> -!Edrivers/scsi/scsi_lib_dma.c - </sect2> - <sect2 id="scsi_module.c"> - <title>drivers/scsi/scsi_module.c</title> - <para> - The file drivers/scsi/scsi_module.c contains legacy support for - old-style host templates. It should never be used by any new driver. - </para> - </sect2> - <sect2 id="scsi_proc.c"> - <title>drivers/scsi/scsi_proc.c</title> - <para> - The functions in this file provide an interface between - the PROC file system and the SCSI device drivers - It is mainly used for debugging, statistics and to pass - information directly to the lowlevel driver. - - I.E. plumbing to manage /proc/scsi/* - </para> -!Idrivers/scsi/scsi_proc.c - </sect2> - <sect2 id="scsi_netlink.c"> - <title>drivers/scsi/scsi_netlink.c</title> - <para> - Infrastructure to provide async events from transports to userspace - via netlink, using a single NETLINK_SCSITRANSPORT protocol for all - transports. - - See <ulink url='http://marc.info/?l=linux-scsi&m=115507374832500&w=2'>the - original patch submission</ulink> for more details. - </para> -!Idrivers/scsi/scsi_netlink.c - </sect2> - <sect2 id="scsi_scan.c"> - <title>drivers/scsi/scsi_scan.c</title> - <para> - Scan a host to determine which (if any) devices are attached. - - The general scanning/probing algorithm is as follows, exceptions are - made to it depending on device specific flags, compilation options, - and global variable (boot or module load time) settings. - - A specific LUN is scanned via an INQUIRY command; if the LUN has a - device attached, a scsi_device is allocated and setup for it. - - For every id of every channel on the given host, start by scanning - LUN 0. Skip hosts that don't respond at all to a scan of LUN 0. - Otherwise, if LUN 0 has a device attached, allocate and setup a - scsi_device for it. If target is SCSI-3 or up, issue a REPORT LUN, - and scan all of the LUNs returned by the REPORT LUN; else, - sequentially scan LUNs up until some maximum is reached, or a LUN is - seen that cannot have a device attached to it. - </para> -!Idrivers/scsi/scsi_scan.c - </sect2> - <sect2 id="scsi_sysctl.c"> - <title>drivers/scsi/scsi_sysctl.c</title> - <para> - Set up the sysctl entry: "/dev/scsi/logging_level" - (DEV_SCSI_LOGGING_LEVEL) which sets/returns scsi_logging_level. - </para> - </sect2> - <sect2 id="scsi_sysfs.c"> - <title>drivers/scsi/scsi_sysfs.c</title> - <para> - SCSI sysfs interface routines. - </para> -!Edrivers/scsi/scsi_sysfs.c - </sect2> - <sect2 id="hosts.c"> - <title>drivers/scsi/hosts.c</title> - <para> - mid to lowlevel SCSI driver interface - </para> -!Edrivers/scsi/hosts.c - </sect2> - <sect2 id="constants.c"> - <title>drivers/scsi/constants.c</title> - <para> - mid to lowlevel SCSI driver interface - </para> -!Edrivers/scsi/constants.c - </sect2> - </sect1> - - <sect1 id="Transport_classes"> - <title>Transport classes</title> - <para> - Transport classes are service libraries for drivers in the SCSI - lower layer, which expose transport attributes in sysfs. - </para> - <sect2 id="Fibre_Channel_transport"> - <title>Fibre Channel transport</title> - <para> - The file drivers/scsi/scsi_transport_fc.c defines transport attributes - for Fibre Channel. - </para> -!Edrivers/scsi/scsi_transport_fc.c - </sect2> - <sect2 id="iSCSI_transport"> - <title>iSCSI transport class</title> - <para> - The file drivers/scsi/scsi_transport_iscsi.c defines transport - attributes for the iSCSI class, which sends SCSI packets over TCP/IP - connections. - </para> -!Edrivers/scsi/scsi_transport_iscsi.c - </sect2> - <sect2 id="SAS_transport"> - <title>Serial Attached SCSI (SAS) transport class</title> - <para> - The file drivers/scsi/scsi_transport_sas.c defines transport - attributes for Serial Attached SCSI, a variant of SATA aimed at - large high-end systems. - </para> - <para> - The SAS transport class contains common code to deal with SAS HBAs, - an aproximated representation of SAS topologies in the driver model, - and various sysfs attributes to expose these topologies and management - interfaces to userspace. - </para> - <para> - In addition to the basic SCSI core objects this transport class - introduces two additional intermediate objects: The SAS PHY - as represented by struct sas_phy defines an "outgoing" PHY on - a SAS HBA or Expander, and the SAS remote PHY represented by - struct sas_rphy defines an "incoming" PHY on a SAS Expander or - end device. Note that this is purely a software concept, the - underlying hardware for a PHY and a remote PHY is the exactly - the same. - </para> - <para> - There is no concept of a SAS port in this code, users can see - what PHYs form a wide port based on the port_identifier attribute, - which is the same for all PHYs in a port. - </para> -!Edrivers/scsi/scsi_transport_sas.c - </sect2> - <sect2 id="SATA_transport"> - <title>SATA transport class</title> - <para> - The SATA transport is handled by libata, which has its own book of - documentation in this directory. - </para> - </sect2> - <sect2 id="SPI_transport"> - <title>Parallel SCSI (SPI) transport class</title> - <para> - The file drivers/scsi/scsi_transport_spi.c defines transport - attributes for traditional (fast/wide/ultra) SCSI busses. - </para> -!Edrivers/scsi/scsi_transport_spi.c - </sect2> - <sect2 id="SRP_transport"> - <title>SCSI RDMA (SRP) transport class</title> - <para> - The file drivers/scsi/scsi_transport_srp.c defines transport - attributes for SCSI over Remote Direct Memory Access. - </para> -!Edrivers/scsi/scsi_transport_srp.c - </sect2> - </sect1> - - </chapter> - - <chapter id="lower_layer"> - <title>SCSI lower layer</title> - <sect1 id="hba_drivers"> - <title>Host Bus Adapter transport types</title> - <para> - Many modern device controllers use the SCSI command set as a protocol to - communicate with their devices through many different types of physical - connections. - </para> - <para> - In SCSI language a bus capable of carrying SCSI commands is - called a "transport", and a controller connecting to such a bus is - called a "host bus adapter" (HBA). - </para> - <sect2 id="scsi_debug.c"> - <title>Debug transport</title> - <para> - The file drivers/scsi/scsi_debug.c simulates a host adapter with a - variable number of disks (or disk like devices) attached, sharing a - common amount of RAM. Does a lot of checking to make sure that we are - not getting blocks mixed up, and panics the kernel if anything out of - the ordinary is seen. - </para> - <para> - To be more realistic, the simulated devices have the transport - attributes of SAS disks. - </para> - <para> - For documentation see - <ulink url='http://sg.danny.cz/sg/sdebug26.html'>http://sg.danny.cz/sg/sdebug26.html</ulink> - </para> -<!-- !Edrivers/scsi/scsi_debug.c --> - </sect2> - <sect2 id="todo"> - <title>todo</title> - <para>Parallel (fast/wide/ultra) SCSI, USB, SATA, - SAS, Fibre Channel, FireWire, ATAPI devices, Infiniband, - I20, iSCSI, Parallel ports, netlink... - </para> - </sect2> - </sect1> - </chapter> -</book> diff --git a/Documentation/DocBook/sh.tmpl b/Documentation/DocBook/sh.tmpl deleted file mode 100644 index 4a38f604fa66..000000000000 --- a/Documentation/DocBook/sh.tmpl +++ /dev/null @@ -1,105 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="sh-drivers"> - <bookinfo> - <title>SuperH Interfaces Guide</title> - - <authorgroup> - <author> - <firstname>Paul</firstname> - <surname>Mundt</surname> - <affiliation> - <address> - <email>lethal@linux-sh.org</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2008-2010</year> - <holder>Paul Mundt</holder> - </copyright> - <copyright> - <year>2008-2010</year> - <holder>Renesas Technology Corp.</holder> - </copyright> - <copyright> - <year>2010</year> - <holder>Renesas Electronics Corp.</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2 as published by the Free Software Foundation. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="mm"> - <title>Memory Management</title> - <sect1 id="sh4"> - <title>SH-4</title> - <sect2 id="sq"> - <title>Store Queue API</title> -!Earch/sh/kernel/cpu/sh4/sq.c - </sect2> - </sect1> - <sect1 id="sh5"> - <title>SH-5</title> - <sect2 id="tlb"> - <title>TLB Interfaces</title> -!Iarch/sh/mm/tlb-sh5.c -!Iarch/sh/include/asm/tlb_64.h - </sect2> - </sect1> - </chapter> - <chapter id="mach"> - <title>Machine Specific Interfaces</title> - <sect1 id="dreamcast"> - <title>mach-dreamcast</title> -!Iarch/sh/boards/mach-dreamcast/rtc.c - </sect1> - <sect1 id="x3proto"> - <title>mach-x3proto</title> -!Earch/sh/boards/mach-x3proto/ilsel.c - </sect1> - </chapter> - <chapter id="busses"> - <title>Busses</title> - <sect1 id="superhyway"> - <title>SuperHyway</title> -!Edrivers/sh/superhyway/superhyway.c - </sect1> - - <sect1 id="maple"> - <title>Maple</title> -!Edrivers/sh/maple/maple.c - </sect1> - </chapter> -</book> diff --git a/Documentation/DocBook/stylesheet.xsl b/Documentation/DocBook/stylesheet.xsl deleted file mode 100644 index 3bf4ecf3d760..000000000000 --- a/Documentation/DocBook/stylesheet.xsl +++ /dev/null @@ -1,11 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<stylesheet xmlns="http://www.w3.org/1999/XSL/Transform" version="1.0"> -<param name="chunk.quietly">1</param> -<param name="funcsynopsis.style">ansi</param> -<param name="funcsynopsis.tabular.threshold">80</param> -<param name="callout.graphics">0</param> -<!-- <param name="paper.type">A4</param> --> -<param name="generate.consistent.ids">1</param> -<param name="generate.section.toc.level">2</param> -<param name="use.id.as.filename">1</param> -</stylesheet> diff --git a/Documentation/DocBook/w1.tmpl b/Documentation/DocBook/w1.tmpl deleted file mode 100644 index b0228d4c81bb..000000000000 --- a/Documentation/DocBook/w1.tmpl +++ /dev/null @@ -1,101 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="w1id"> - <bookinfo> - <title>W1: Dallas' 1-wire bus</title> - - <authorgroup> - <author> - <firstname>David</firstname> - <surname>Fries</surname> - <affiliation> - <address> - <email>David@Fries.net</email> - </address> - </affiliation> - </author> - - </authorgroup> - - <copyright> - <year>2013</year> - <!-- - <holder></holder> - --> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - - <toc></toc> - - <chapter id="w1_internal"> - <title>W1 API internal to the kernel</title> - - <sect1 id="w1_internal_api"> - <title>W1 API internal to the kernel</title> - <sect2 id="w1.h"> - <title>drivers/w1/w1.h</title> - <para>W1 core functions.</para> -!Idrivers/w1/w1.h - </sect2> - - <sect2 id="w1.c"> - <title>drivers/w1/w1.c</title> - <para>W1 core functions.</para> -!Idrivers/w1/w1.c - </sect2> - - <sect2 id="w1_family.h"> - <title>drivers/w1/w1_family.h</title> - <para>Allows registering device family operations.</para> -!Idrivers/w1/w1_family.h - </sect2> - - <sect2 id="w1_family.c"> - <title>drivers/w1/w1_family.c</title> - <para>Allows registering device family operations.</para> -!Edrivers/w1/w1_family.c - </sect2> - - <sect2 id="w1_int.c"> - <title>drivers/w1/w1_int.c</title> - <para>W1 internal initialization for master devices.</para> -!Edrivers/w1/w1_int.c - </sect2> - - <sect2 id="w1_netlink.h"> - <title>drivers/w1/w1_netlink.h</title> - <para>W1 external netlink API structures and commands.</para> -!Idrivers/w1/w1_netlink.h - </sect2> - - <sect2 id="w1_io.c"> - <title>drivers/w1/w1_io.c</title> - <para>W1 input/output.</para> -!Edrivers/w1/w1_io.c -!Idrivers/w1/w1_io.c - </sect2> - - </sect1> - - - </chapter> - -</book> diff --git a/Documentation/DocBook/z8530book.tmpl b/Documentation/DocBook/z8530book.tmpl deleted file mode 100644 index 6f3883be877e..000000000000 --- a/Documentation/DocBook/z8530book.tmpl +++ /dev/null @@ -1,371 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="Z85230Guide"> - <bookinfo> - <title>Z8530 Programming Guide</title> - - <authorgroup> - <author> - <firstname>Alan</firstname> - <surname>Cox</surname> - <affiliation> - <address> - <email>alan@lxorguk.ukuu.org.uk</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2000</year> - <holder>Alan Cox</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="intro"> - <title>Introduction</title> - <para> - The Z85x30 family synchronous/asynchronous controller chips are - used on a large number of cheap network interface cards. The - kernel provides a core interface layer that is designed to make - it easy to provide WAN services using this chip. - </para> - <para> - The current driver only support synchronous operation. Merging the - asynchronous driver support into this code to allow any Z85x30 - device to be used as both a tty interface and as a synchronous - controller is a project for Linux post the 2.4 release - </para> - </chapter> - - <chapter id="Driver_Modes"> - <title>Driver Modes</title> - <para> - The Z85230 driver layer can drive Z8530, Z85C30 and Z85230 devices - in three different modes. Each mode can be applied to an individual - channel on the chip (each chip has two channels). - </para> - <para> - The PIO synchronous mode supports the most common Z8530 wiring. Here - the chip is interface to the I/O and interrupt facilities of the - host machine but not to the DMA subsystem. When running PIO the - Z8530 has extremely tight timing requirements. Doing high speeds, - even with a Z85230 will be tricky. Typically you should expect to - achieve at best 9600 baud with a Z8C530 and 64Kbits with a Z85230. - </para> - <para> - The DMA mode supports the chip when it is configured to use dual DMA - channels on an ISA bus. The better cards tend to support this mode - of operation for a single channel. With DMA running the Z85230 tops - out when it starts to hit ISA DMA constraints at about 512Kbits. It - is worth noting here that many PC machines hang or crash when the - chip is driven fast enough to hold the ISA bus solid. - </para> - <para> - Transmit DMA mode uses a single DMA channel. The DMA channel is used - for transmission as the transmit FIFO is smaller than the receive - FIFO. it gives better performance than pure PIO mode but is nowhere - near as ideal as pure DMA mode. - </para> - </chapter> - - <chapter id="Using_the_Z85230_driver"> - <title>Using the Z85230 driver</title> - <para> - The Z85230 driver provides the back end interface to your board. To - configure a Z8530 interface you need to detect the board and to - identify its ports and interrupt resources. It is also your problem - to verify the resources are available. - </para> - <para> - Having identified the chip you need to fill in a struct z8530_dev, - which describes each chip. This object must exist until you finally - shutdown the board. Firstly zero the active field. This ensures - nothing goes off without you intending it. The irq field should - be set to the interrupt number of the chip. (Each chip has a single - interrupt source rather than each channel). You are responsible - for allocating the interrupt line. The interrupt handler should be - set to <function>z8530_interrupt</function>. The device id should - be set to the z8530_dev structure pointer. Whether the interrupt can - be shared or not is board dependent, and up to you to initialise. - </para> - <para> - The structure holds two channel structures. - Initialise chanA.ctrlio and chanA.dataio with the address of the - control and data ports. You can or this with Z8530_PORT_SLEEP to - indicate your interface needs the 5uS delay for chip settling done - in software. The PORT_SLEEP option is architecture specific. Other - flags may become available on future platforms, eg for MMIO. - Initialise the chanA.irqs to &z8530_nop to start the chip up - as disabled and discarding interrupt events. This ensures that - stray interrupts will be mopped up and not hang the bus. Set - chanA.dev to point to the device structure itself. The - private and name field you may use as you wish. The private field - is unused by the Z85230 layer. The name is used for error reporting - and it may thus make sense to make it match the network name. - </para> - <para> - Repeat the same operation with the B channel if your chip has - both channels wired to something useful. This isn't always the - case. If it is not wired then the I/O values do not matter, but - you must initialise chanB.dev. - </para> - <para> - If your board has DMA facilities then initialise the txdma and - rxdma fields for the relevant channels. You must also allocate the - ISA DMA channels and do any necessary board level initialisation - to configure them. The low level driver will do the Z8530 and - DMA controller programming but not board specific magic. - </para> - <para> - Having initialised the device you can then call - <function>z8530_init</function>. This will probe the chip and - reset it into a known state. An identification sequence is then - run to identify the chip type. If the checks fail to pass the - function returns a non zero error code. Typically this indicates - that the port given is not valid. After this call the - type field of the z8530_dev structure is initialised to either - Z8530, Z85C30 or Z85230 according to the chip found. - </para> - <para> - Once you have called z8530_init you can also make use of the utility - function <function>z8530_describe</function>. This provides a - consistent reporting format for the Z8530 devices, and allows all - the drivers to provide consistent reporting. - </para> - </chapter> - - <chapter id="Attaching_Network_Interfaces"> - <title>Attaching Network Interfaces</title> - <para> - If you wish to use the network interface facilities of the driver, - then you need to attach a network device to each channel that is - present and in use. In addition to use the generic HDLC - you need to follow some additional plumbing rules. They may seem - complex but a look at the example hostess_sv11 driver should - reassure you. - </para> - <para> - The network device used for each channel should be pointed to by - the netdevice field of each channel. The hdlc-> priv field of the - network device points to your private data - you will need to be - able to find your private data from this. - </para> - <para> - The way most drivers approach this particular problem is to - create a structure holding the Z8530 device definition and - put that into the private field of the network device. The - network device fields of the channels then point back to the - network devices. - </para> - <para> - If you wish to use the generic HDLC then you need to register - the HDLC device. - </para> - <para> - Before you register your network device you will also need to - provide suitable handlers for most of the network device callbacks. - See the network device documentation for more details on this. - </para> - </chapter> - - <chapter id="Configuring_And_Activating_The_Port"> - <title>Configuring And Activating The Port</title> - <para> - The Z85230 driver provides helper functions and tables to load the - port registers on the Z8530 chips. When programming the register - settings for a channel be aware that the documentation recommends - initialisation orders. Strange things happen when these are not - followed. - </para> - <para> - <function>z8530_channel_load</function> takes an array of - pairs of initialisation values in an array of u8 type. The first - value is the Z8530 register number. Add 16 to indicate the alternate - register bank on the later chips. The array is terminated by a 255. - </para> - <para> - The driver provides a pair of public tables. The - z8530_hdlc_kilostream table is for the UK 'Kilostream' service and - also happens to cover most other end host configurations. The - z8530_hdlc_kilostream_85230 table is the same configuration using - the enhancements of the 85230 chip. The configuration loaded is - standard NRZ encoded synchronous data with HDLC bitstuffing. All - of the timing is taken from the other end of the link. - </para> - <para> - When writing your own tables be aware that the driver internally - tracks register values. It may need to reload values. You should - therefore be sure to set registers 1-7, 9-11, 14 and 15 in all - configurations. Where the register settings depend on DMA selection - the driver will update the bits itself when you open or close. - Loading a new table with the interface open is not recommended. - </para> - <para> - There are three standard configurations supported by the core - code. In PIO mode the interface is programmed up to use - interrupt driven PIO. This places high demands on the host processor - to avoid latency. The driver is written to take account of latency - issues but it cannot avoid latencies caused by other drivers, - notably IDE in PIO mode. Because the drivers allocate buffers you - must also prevent MTU changes while the port is open. - </para> - <para> - Once the port is open it will call the rx_function of each channel - whenever a completed packet arrived. This is invoked from - interrupt context and passes you the channel and a network - buffer (struct sk_buff) holding the data. The data includes - the CRC bytes so most users will want to trim the last two - bytes before processing the data. This function is very timing - critical. When you wish to simply discard data the support - code provides the function <function>z8530_null_rx</function> - to discard the data. - </para> - <para> - To active PIO mode sending and receiving the <function> - z8530_sync_open</function> is called. This expects to be passed - the network device and the channel. Typically this is called from - your network device open callback. On a failure a non zero error - status is returned. The <function>z8530_sync_close</function> - function shuts down a PIO channel. This must be done before the - channel is opened again and before the driver shuts down - and unloads. - </para> - <para> - The ideal mode of operation is dual channel DMA mode. Here the - kernel driver will configure the board for DMA in both directions. - The driver also handles ISA DMA issues such as controller - programming and the memory range limit for you. This mode is - activated by calling the <function>z8530_sync_dma_open</function> - function. On failure a non zero error value is returned. - Once this mode is activated it can be shut down by calling the - <function>z8530_sync_dma_close</function>. You must call the close - function matching the open mode you used. - </para> - <para> - The final supported mode uses a single DMA channel to drive the - transmit side. As the Z85C30 has a larger FIFO on the receive - channel this tends to increase the maximum speed a little. - This is activated by calling the <function>z8530_sync_txdma_open - </function>. This returns a non zero error code on failure. The - <function>z8530_sync_txdma_close</function> function closes down - the Z8530 interface from this mode. - </para> - </chapter> - - <chapter id="Network_Layer_Functions"> - <title>Network Layer Functions</title> - <para> - The Z8530 layer provides functions to queue packets for - transmission. The driver internally buffers the frame currently - being transmitted and one further frame (in order to keep back - to back transmission running). Any further buffering is up to - the caller. - </para> - <para> - The function <function>z8530_queue_xmit</function> takes a network - buffer in sk_buff format and queues it for transmission. The - caller must provide the entire packet with the exception of the - bitstuffing and CRC. This is normally done by the caller via - the generic HDLC interface layer. It returns 0 if the buffer has been - queued and non zero values for queue full. If the function accepts - the buffer it becomes property of the Z8530 layer and the caller - should not free it. - </para> - <para> - The function <function>z8530_get_stats</function> returns a pointer - to an internally maintained per interface statistics block. This - provides most of the interface code needed to implement the network - layer get_stats callback. - </para> - </chapter> - - <chapter id="Porting_The_Z8530_Driver"> - <title>Porting The Z8530 Driver</title> - <para> - The Z8530 driver is written to be portable. In DMA mode it makes - assumptions about the use of ISA DMA. These are probably warranted - in most cases as the Z85230 in particular was designed to glue to PC - type machines. The PIO mode makes no real assumptions. - </para> - <para> - Should you need to retarget the Z8530 driver to another architecture - the only code that should need changing are the port I/O functions. - At the moment these assume PC I/O port accesses. This may not be - appropriate for all platforms. Replacing - <function>z8530_read_port</function> and <function>z8530_write_port - </function> is intended to be all that is required to port this - driver layer. - </para> - </chapter> - - <chapter id="bugs"> - <title>Known Bugs And Assumptions</title> - <para> - <variablelist> - <varlistentry><term>Interrupt Locking</term> - <listitem> - <para> - The locking in the driver is done via the global cli/sti lock. This - makes for relatively poor SMP performance. Switching this to use a - per device spin lock would probably materially improve performance. - </para> - </listitem></varlistentry> - - <varlistentry><term>Occasional Failures</term> - <listitem> - <para> - We have reports of occasional failures when run for very long - periods of time and the driver starts to receive junk frames. At - the moment the cause of this is not clear. - </para> - </listitem></varlistentry> - </variablelist> - - </para> - </chapter> - - <chapter id="pubfunctions"> - <title>Public Functions Provided</title> -!Edrivers/net/wan/z85230.c - </chapter> - - <chapter id="intfunctions"> - <title>Internal Functions</title> -!Idrivers/net/wan/z85230.c - </chapter> - -</book> |