diff options
author | Jonathan Corbet <corbet@lwn.net> | 2017-05-18 19:03:08 +0200 |
---|---|---|
committer | Jonathan Corbet <corbet@lwn.net> | 2017-05-18 19:03:08 +0200 |
commit | 6312811be26f4a97fb36f53ffffafa5086833a28 (patch) | |
tree | 66082d254d384ed1a5ac869f9b5c96aa78bab95b /Documentation | |
parent | Documentation, kbuild: fix typo "minimun" -> "minimum" (diff) | |
parent | kernel-doc: describe the ``literal`` syntax (diff) | |
download | linux-6312811be26f4a97fb36f53ffffafa5086833a28.tar.xz linux-6312811be26f4a97fb36f53ffffafa5086833a28.zip |
Merge remote-tracking branch 'mauro-exp/docbook3' into death-to-docbook
Mauro says:
This patch series convert the remaining DocBooks to ReST.
The first version was originally
send as 3 patch series:
[PATCH 00/36] Convert DocBook documents to ReST
[PATCH 0/5] Convert more books to ReST
[PATCH 00/13] Get rid of DocBook
The lsm book was added as if it were a text file under
Documentation. The plan is to merge it with another file
under Documentation/security, after both this series and
a security Documentation patch series gets merged.
It also adjusts some Sphinx-pedantic errors/warnings on
some kernel-doc markups.
I also added some patches here to add PDF output for all
existing ReST books.
Diffstat (limited to 'Documentation')
64 files changed, 7321 insertions, 10360 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index ed3e5e949fce..f35473f8c630 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -24,8 +24,6 @@ DMA-ISA-LPC.txt - How to do DMA with ISA (and LPC) devices. DMA-attributes.txt - listing of the various possible attributes a DMA region can have -DocBook/ - - directory with DocBook templates etc. for kernel documentation. EDID/ - directory with info on customizing EDID for broken gfx/displays. IPMI.txt @@ -40,8 +38,6 @@ Intel-IOMMU.txt - basic info on the Intel IOMMU virtualization support. Makefile - It's not of interest for those who aren't touching the build system. -Makefile.sphinx - - It's not of interest for those who aren't touching the build system. PCI/ - info related to PCI drivers. RCU/ @@ -264,6 +260,8 @@ logo.gif - full colour GIF image of Linux logo (penguin - Tux). logo.txt - info on creator of above logo & site to get additional images from. +lsm.txt + - Linux Security Modules: General Security Hooks for Linux lzo.txt - kernel LZO decompressor input formats m68k/ diff --git a/Documentation/DocBook/.gitignore b/Documentation/DocBook/.gitignore deleted file mode 100644 index e05da3f7aa21..000000000000 --- a/Documentation/DocBook/.gitignore +++ /dev/null @@ -1,17 +0,0 @@ -*.xml -*.ps -*.pdf -*.html -*.9.gz -*.9 -*.aux -*.dvi -*.log -*.out -*.png -*.gif -*.svg -*.proc -*.db -media-indices.tmpl -media-entities.tmpl diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile deleted file mode 100644 index 85916f13d330..000000000000 --- a/Documentation/DocBook/Makefile +++ /dev/null @@ -1,282 +0,0 @@ -### -# This makefile is used to generate the kernel documentation, -# primarily based on in-line comments in various source files. -# See Documentation/kernel-doc-nano-HOWTO.txt for instruction in how -# to document the SRC - and how to read it. -# To add a new book the only step required is to add the book to the -# list of DOCBOOKS. - -DOCBOOKS := z8530book.xml \ - kernel-hacking.xml kernel-locking.xml \ - networking.xml \ - filesystems.xml lsm.xml kgdb.xml \ - libata.xml mtdnand.xml librs.xml rapidio.xml \ - s390-drivers.xml scsi.xml \ - sh.xml w1.xml - -ifeq ($(DOCBOOKS),) - -# Skip DocBook build if the user explicitly requested no DOCBOOKS. -.DEFAULT: - @echo " SKIP DocBook $@ target (DOCBOOKS=\"\" specified)." -else -ifneq ($(SPHINXDIRS),) - -# Skip DocBook build if the user explicitly requested a sphinx dir -.DEFAULT: - @echo " SKIP DocBook $@ target (SPHINXDIRS specified)." -else - - -### -# The build process is as follows (targets): -# (xmldocs) [by docproc] -# file.tmpl --> file.xml +--> file.ps (psdocs) [by db2ps or xmlto] -# +--> file.pdf (pdfdocs) [by db2pdf or xmlto] -# +--> DIR=file (htmldocs) [by xmlto] -# +--> man/ (mandocs) [by xmlto] - - -# for PDF and PS output you can choose between xmlto and docbook-utils tools -PDF_METHOD = $(prefer-db2x) -PS_METHOD = $(prefer-db2x) - - -targets += $(DOCBOOKS) -BOOKS := $(addprefix $(obj)/,$(DOCBOOKS)) -xmldocs: $(BOOKS) -sgmldocs: xmldocs - -PS := $(patsubst %.xml, %.ps, $(BOOKS)) -psdocs: $(PS) - -PDF := $(patsubst %.xml, %.pdf, $(BOOKS)) -pdfdocs: $(PDF) - -HTML := $(sort $(patsubst %.xml, %.html, $(BOOKS))) -htmldocs: $(HTML) - $(call cmd,build_main_index) - -MAN := $(patsubst %.xml, %.9, $(BOOKS)) -mandocs: $(MAN) - find $(obj)/man -name '*.9' | xargs gzip -nf - -# Default location for installed man pages -export INSTALL_MAN_PATH = $(objtree)/usr - -installmandocs: mandocs - mkdir -p $(INSTALL_MAN_PATH)/man/man9/ - find $(obj)/man -name '*.9.gz' -printf '%h %f\n' | \ - sort -k 2 -k 1 | uniq -f 1 | sed -e 's: :/:' | \ - xargs install -m 644 -t $(INSTALL_MAN_PATH)/man/man9/ - -# no-op for the DocBook toolchain -epubdocs: -latexdocs: -linkcheckdocs: - -### -#External programs used -KERNELDOCXMLREF = $(srctree)/scripts/kernel-doc-xml-ref -KERNELDOC = $(srctree)/scripts/kernel-doc -DOCPROC = $(objtree)/scripts/docproc -CHECK_LC_CTYPE = $(objtree)/scripts/check-lc_ctype - -# Use a fixed encoding - UTF-8 if the C library has support built-in -# or ASCII if not -LC_CTYPE := $(call try-run, LC_CTYPE=C.UTF-8 $(CHECK_LC_CTYPE),C.UTF-8,C) -export LC_CTYPE - -XMLTOFLAGS = -m $(srctree)/$(src)/stylesheet.xsl -XMLTOFLAGS += --skip-validation - -### -# DOCPROC is used for two purposes: -# 1) To generate a dependency list for a .tmpl file -# 2) To preprocess a .tmpl file and call kernel-doc with -# appropriate parameters. -# The following rules are used to generate the .xml documentation -# required to generate the final targets. (ps, pdf, html). -quiet_cmd_docproc = DOCPROC $@ - cmd_docproc = SRCTREE=$(srctree)/ $(DOCPROC) doc $< >$@ -define rule_docproc - set -e; \ - $(if $($(quiet)cmd_$(1)),echo ' $($(quiet)cmd_$(1))';) \ - $(cmd_$(1)); \ - ( \ - echo 'cmd_$@ := $(cmd_$(1))'; \ - echo $@: `SRCTREE=$(srctree) $(DOCPROC) depend $<`; \ - ) > $(dir $@).$(notdir $@).cmd -endef - -%.xml: %.tmpl $(KERNELDOC) $(DOCPROC) $(KERNELDOCXMLREF) FORCE - $(call if_changed_rule,docproc) - -# Tell kbuild to always build the programs -always := $(hostprogs-y) - -notfoundtemplate = echo "*** You have to install docbook-utils or xmlto ***"; \ - exit 1 -db2xtemplate = db2TYPE -o $(dir $@) $< -xmltotemplate = xmlto TYPE $(XMLTOFLAGS) -o $(dir $@) $< - -# determine which methods are available -ifeq ($(shell which db2ps >/dev/null 2>&1 && echo found),found) - use-db2x = db2x - prefer-db2x = db2x -else - use-db2x = notfound - prefer-db2x = $(use-xmlto) -endif -ifeq ($(shell which xmlto >/dev/null 2>&1 && echo found),found) - use-xmlto = xmlto - prefer-xmlto = xmlto -else - use-xmlto = notfound - prefer-xmlto = $(use-db2x) -endif - -# the commands, generated from the chosen template -quiet_cmd_db2ps = PS $@ - cmd_db2ps = $(subst TYPE,ps, $($(PS_METHOD)template)) -%.ps : %.xml - $(call cmd,db2ps) - -quiet_cmd_db2pdf = PDF $@ - cmd_db2pdf = $(subst TYPE,pdf, $($(PDF_METHOD)template)) -%.pdf : %.xml - $(call cmd,db2pdf) - - -index = index.html -main_idx = $(obj)/$(index) -quiet_cmd_build_main_index = HTML $(main_idx) - cmd_build_main_index = rm -rf $(main_idx); \ - echo '<h1>Linux Kernel HTML Documentation</h1>' >> $(main_idx) && \ - echo '<h2>Kernel Version: $(KERNELVERSION)</h2>' >> $(main_idx) && \ - cat $(HTML) >> $(main_idx) - -quiet_cmd_db2html = HTML $@ - cmd_db2html = xmlto html $(XMLTOFLAGS) -o $(patsubst %.html,%,$@) $< && \ - echo '<a HREF="$(patsubst %.html,%,$(notdir $@))/index.html"> \ - $(patsubst %.html,%,$(notdir $@))</a><p>' > $@ - -### -# Rules to create an aux XML and .db, and use them to re-process the DocBook XML -# to fill internal hyperlinks - gen_aux_xml = : - quiet_gen_aux_xml = echo ' XMLREF $@' -silent_gen_aux_xml = : -%.aux.xml: %.xml - @$($(quiet)gen_aux_xml) - @rm -rf $@ - @(cat $< | egrep "^<refentry id" | egrep -o "\".*\"" | cut -f 2 -d \" > $<.db) - @$(KERNELDOCXMLREF) -db $<.db $< > $@ -.PRECIOUS: %.aux.xml - -%.html: %.aux.xml - @(which xmlto > /dev/null 2>&1) || \ - (echo "*** You need to install xmlto ***"; \ - exit 1) - @rm -rf $@ $(patsubst %.html,%,$@) - $(call cmd,db2html) - @if [ ! -z "$(PNG-$(basename $(notdir $@)))" ]; then \ - cp $(PNG-$(basename $(notdir $@))) $(patsubst %.html,%,$@); fi - -quiet_cmd_db2man = MAN $@ - cmd_db2man = if grep -q refentry $<; then xmlto man $(XMLTOFLAGS) -o $(obj)/man/$(*F) $< ; fi -%.9 : %.xml - @(which xmlto > /dev/null 2>&1) || \ - (echo "*** You need to install xmlto ***"; \ - exit 1) - $(Q)mkdir -p $(obj)/man/$(*F) - $(call cmd,db2man) - @touch $@ - -### -# Rules to generate postscripts and PNG images from .fig format files -quiet_cmd_fig2eps = FIG2EPS $@ - cmd_fig2eps = fig2dev -Leps $< $@ - -%.eps: %.fig - @(which fig2dev > /dev/null 2>&1) || \ - (echo "*** You need to install transfig ***"; \ - exit 1) - $(call cmd,fig2eps) - -quiet_cmd_fig2png = FIG2PNG $@ - cmd_fig2png = fig2dev -Lpng $< $@ - -%.png: %.fig - @(which fig2dev > /dev/null 2>&1) || \ - (echo "*** You need to install transfig ***"; \ - exit 1) - $(call cmd,fig2png) - -### -# Rule to convert a .c file to inline XML documentation - gen_xml = : - quiet_gen_xml = echo ' GEN $@' -silent_gen_xml = : -%.xml: %.c - @$($(quiet)gen_xml) - @( \ - echo "<programlisting>"; \ - expand --tabs=8 < $< | \ - sed -e "s/&/\\&/g" \ - -e "s/</\\</g" \ - -e "s/>/\\>/g"; \ - echo "</programlisting>") > $@ - -endif # DOCBOOKS="" -endif # SPHINDIR=... - -### -# Help targets as used by the top-level makefile -dochelp: - @echo ' Linux kernel internal documentation in different formats (DocBook):' - @echo ' htmldocs - HTML' - @echo ' pdfdocs - PDF' - @echo ' psdocs - Postscript' - @echo ' xmldocs - XML DocBook' - @echo ' mandocs - man pages' - @echo ' installmandocs - install man pages generated by mandocs to INSTALL_MAN_PATH'; \ - echo ' (default: $(INSTALL_MAN_PATH))'; \ - echo '' - @echo ' cleandocs - clean all generated DocBook files' - @echo - @echo ' make DOCBOOKS="s1.xml s2.xml" [target] Generate only docs s1.xml s2.xml' - @echo ' valid values for DOCBOOKS are: $(DOCBOOKS)' - @echo - @echo " make DOCBOOKS=\"\" [target] Don't generate docs from Docbook" - @echo ' This is useful to generate only the ReST docs (Sphinx)' - - -### -# Temporary files left by various tools -clean-files := $(DOCBOOKS) \ - $(patsubst %.xml, %.dvi, $(DOCBOOKS)) \ - $(patsubst %.xml, %.aux, $(DOCBOOKS)) \ - $(patsubst %.xml, %.tex, $(DOCBOOKS)) \ - $(patsubst %.xml, %.log, $(DOCBOOKS)) \ - $(patsubst %.xml, %.out, $(DOCBOOKS)) \ - $(patsubst %.xml, %.ps, $(DOCBOOKS)) \ - $(patsubst %.xml, %.pdf, $(DOCBOOKS)) \ - $(patsubst %.xml, %.html, $(DOCBOOKS)) \ - $(patsubst %.xml, %.9, $(DOCBOOKS)) \ - $(patsubst %.xml, %.aux.xml, $(DOCBOOKS)) \ - $(patsubst %.xml, %.xml.db, $(DOCBOOKS)) \ - $(patsubst %.xml, %.xml, $(DOCBOOKS)) \ - $(patsubst %.xml, .%.xml.cmd, $(DOCBOOKS)) \ - $(index) - -clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man - -cleandocs: - $(Q)rm -f $(call objectify, $(clean-files)) - $(Q)rm -rf $(call objectify, $(clean-dirs)) - -# Declare the contents of the .PHONY variable as phony. We keep that -# information in a variable so we can use it in if_changed and friends. - -.PHONY: $(PHONY) diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl deleted file mode 100644 index 6006b6358c86..000000000000 --- a/Documentation/DocBook/filesystems.tmpl +++ /dev/null @@ -1,381 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="Linux-filesystems-API"> - <bookinfo> - <title>Linux Filesystems API</title> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="vfs"> - <title>The Linux VFS</title> - <sect1 id="the_filesystem_types"><title>The Filesystem types</title> -!Iinclude/linux/fs.h - </sect1> - <sect1 id="the_directory_cache"><title>The Directory Cache</title> -!Efs/dcache.c -!Iinclude/linux/dcache.h - </sect1> - <sect1 id="inode_handling"><title>Inode Handling</title> -!Efs/inode.c -!Efs/bad_inode.c - </sect1> - <sect1 id="registration_and_superblocks"><title>Registration and Superblocks</title> -!Efs/super.c - </sect1> - <sect1 id="file_locks"><title>File Locks</title> -!Efs/locks.c -!Ifs/locks.c - </sect1> - <sect1 id="other_functions"><title>Other Functions</title> -!Efs/mpage.c -!Efs/namei.c -!Efs/buffer.c -!Eblock/bio.c -!Efs/seq_file.c -!Efs/filesystems.c -!Efs/fs-writeback.c -!Efs/block_dev.c - </sect1> - </chapter> - - <chapter id="proc"> - <title>The proc filesystem</title> - - <sect1 id="sysctl_interface"><title>sysctl interface</title> -!Ekernel/sysctl.c - </sect1> - - <sect1 id="proc_filesystem_interface"><title>proc filesystem interface</title> -!Ifs/proc/base.c - </sect1> - </chapter> - - <chapter id="fs_events"> - <title>Events based on file descriptors</title> -!Efs/eventfd.c - </chapter> - - <chapter id="sysfs"> - <title>The Filesystem for Exporting Kernel Objects</title> -!Efs/sysfs/file.c -!Efs/sysfs/symlink.c - </chapter> - - <chapter id="debugfs"> - <title>The debugfs filesystem</title> - - <sect1 id="debugfs_interface"><title>debugfs interface</title> -!Efs/debugfs/inode.c -!Efs/debugfs/file.c - </sect1> - </chapter> - - <chapter id="LinuxJDBAPI"> - <chapterinfo> - <title>The Linux Journalling API</title> - - <authorgroup> - <author> - <firstname>Roger</firstname> - <surname>Gammans</surname> - <affiliation> - <address> - <email>rgammans@computer-surgery.co.uk</email> - </address> - </affiliation> - </author> - </authorgroup> - - <authorgroup> - <author> - <firstname>Stephen</firstname> - <surname>Tweedie</surname> - <affiliation> - <address> - <email>sct@redhat.com</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2002</year> - <holder>Roger Gammans</holder> - </copyright> - </chapterinfo> - - <title>The Linux Journalling API</title> - - <sect1 id="journaling_overview"> - <title>Overview</title> - <sect2 id="journaling_details"> - <title>Details</title> -<para> -The journalling layer is easy to use. You need to -first of all create a journal_t data structure. There are -two calls to do this dependent on how you decide to allocate the physical -media on which the journal resides. The jbd2_journal_init_inode() call -is for journals stored in filesystem inodes, or the jbd2_journal_init_dev() -call can be used for journal stored on a raw device (in a continuous range -of blocks). A journal_t is a typedef for a struct pointer, so when -you are finally finished make sure you call jbd2_journal_destroy() on it -to free up any used kernel memory. -</para> - -<para> -Once you have got your journal_t object you need to 'mount' or load the journal -file. The journalling layer expects the space for the journal was already -allocated and initialized properly by the userspace tools. When loading the -journal you must call jbd2_journal_load() to process journal contents. If the -client file system detects the journal contents does not need to be processed -(or even need not have valid contents), it may call jbd2_journal_wipe() to -clear the journal contents before calling jbd2_journal_load(). -</para> - -<para> -Note that jbd2_journal_wipe(..,0) calls jbd2_journal_skip_recovery() for you if -it detects any outstanding transactions in the journal and similarly -jbd2_journal_load() will call jbd2_journal_recover() if necessary. I would -advise reading ext4_load_journal() in fs/ext4/super.c for examples on this -stage. -</para> - -<para> -Now you can go ahead and start modifying the underlying -filesystem. Almost. -</para> - -<para> - -You still need to actually journal your filesystem changes, this -is done by wrapping them into transactions. Additionally you -also need to wrap the modification of each of the buffers -with calls to the journal layer, so it knows what the modifications -you are actually making are. To do this use jbd2_journal_start() which -returns a transaction handle. -</para> - -<para> -jbd2_journal_start() -and its counterpart jbd2_journal_stop(), which indicates the end of a -transaction are nestable calls, so you can reenter a transaction if necessary, -but remember you must call jbd2_journal_stop() the same number of times as -jbd2_journal_start() before the transaction is completed (or more accurately -leaves the update phase). Ext4/VFS makes use of this feature to simplify -handling of inode dirtying, quota support, etc. -</para> - -<para> -Inside each transaction you need to wrap the modifications to the -individual buffers (blocks). Before you start to modify a buffer you -need to call jbd2_journal_get_{create,write,undo}_access() as appropriate, -this allows the journalling layer to copy the unmodified data if it -needs to. After all the buffer may be part of a previously uncommitted -transaction. -At this point you are at last ready to modify a buffer, and once -you are have done so you need to call jbd2_journal_dirty_{meta,}data(). -Or if you've asked for access to a buffer you now know is now longer -required to be pushed back on the device you can call jbd2_journal_forget() -in much the same way as you might have used bforget() in the past. -</para> - -<para> -A jbd2_journal_flush() may be called at any time to commit and checkpoint -all your transactions. -</para> - -<para> -Then at umount time , in your put_super() you can then call jbd2_journal_destroy() -to clean up your in-core journal object. -</para> - -<para> -Unfortunately there a couple of ways the journal layer can cause a deadlock. -The first thing to note is that each task can only have -a single outstanding transaction at any one time, remember nothing -commits until the outermost jbd2_journal_stop(). This means -you must complete the transaction at the end of each file/inode/address -etc. operation you perform, so that the journalling system isn't re-entered -on another journal. Since transactions can't be nested/batched -across differing journals, and another filesystem other than -yours (say ext4) may be modified in a later syscall. -</para> - -<para> -The second case to bear in mind is that jbd2_journal_start() can -block if there isn't enough space in the journal for your transaction -(based on the passed nblocks param) - when it blocks it merely(!) needs to -wait for transactions to complete and be committed from other tasks, -so essentially we are waiting for jbd2_journal_stop(). So to avoid -deadlocks you must treat jbd2_journal_start/stop() as if they -were semaphores and include them in your semaphore ordering rules to prevent -deadlocks. Note that jbd2_journal_extend() has similar blocking behaviour to -jbd2_journal_start() so you can deadlock here just as easily as on -jbd2_journal_start(). -</para> - -<para> -Try to reserve the right number of blocks the first time. ;-). This will -be the maximum number of blocks you are going to touch in this transaction. -I advise having a look at at least ext4_jbd.h to see the basis on which -ext4 uses to make these decisions. -</para> - -<para> -Another wriggle to watch out for is your on-disk block allocation strategy. -Why? Because, if you do a delete, you need to ensure you haven't reused any -of the freed blocks until the transaction freeing these blocks commits. If you -reused these blocks and crash happens, there is no way to restore the contents -of the reallocated blocks at the end of the last fully committed transaction. - -One simple way of doing this is to mark blocks as free in internal in-memory -block allocation structures only after the transaction freeing them commits. -Ext4 uses journal commit callback for this purpose. -</para> - -<para> -With journal commit callbacks you can ask the journalling layer to call a -callback function when the transaction is finally committed to disk, so that -you can do some of your own management. You ask the journalling layer for -calling the callback by simply setting journal->j_commit_callback function -pointer and that function is called after each transaction commit. You can also -use transaction->t_private_list for attaching entries to a transaction that -need processing when the transaction commits. -</para> - -<para> -JBD2 also provides a way to block all transaction updates via -jbd2_journal_{un,}lock_updates(). Ext4 uses this when it wants a window with a -clean and stable fs for a moment. E.g. -</para> - -<programlisting> - - jbd2_journal_lock_updates() //stop new stuff happening.. - jbd2_journal_flush() // checkpoint everything. - ..do stuff on stable fs - jbd2_journal_unlock_updates() // carry on with filesystem use. -</programlisting> - -<para> -The opportunities for abuse and DOS attacks with this should be obvious, -if you allow unprivileged userspace to trigger codepaths containing these -calls. -</para> - - </sect2> - - <sect2 id="jbd_summary"> - <title>Summary</title> -<para> -Using the journal is a matter of wrapping the different context changes, -being each mount, each modification (transaction) and each changed buffer -to tell the journalling layer about them. -</para> - - </sect2> - - </sect1> - - <sect1 id="data_types"> - <title>Data Types</title> - <para> - The journalling layer uses typedefs to 'hide' the concrete definitions - of the structures used. As a client of the JBD2 layer you can - just rely on the using the pointer as a magic cookie of some sort. - - Obviously the hiding is not enforced as this is 'C'. - </para> - <sect2 id="structures"><title>Structures</title> -!Iinclude/linux/jbd2.h - </sect2> - </sect1> - - <sect1 id="functions"> - <title>Functions</title> - <para> - The functions here are split into two groups those that - affect a journal as a whole, and those which are used to - manage transactions - </para> - <sect2 id="journal_level"><title>Journal Level</title> -!Efs/jbd2/journal.c -!Ifs/jbd2/recovery.c - </sect2> - <sect2 id="transaction_level"><title>Transasction Level</title> -!Efs/jbd2/transaction.c - </sect2> - </sect1> - <sect1 id="see_also"> - <title>See also</title> - <para> - <citation> - <ulink url="http://kernel.org/pub/linux/kernel/people/sct/ext3/journal-design.ps.gz"> - Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen Tweedie - </ulink> - </citation> - </para> - <para> - <citation> - <ulink url="http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html"> - Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen Tweedie - </ulink> - </citation> - </para> - </sect1> - - </chapter> - - <chapter id="splice"> - <title>splice API</title> - <para> - splice is a method for moving blocks of data around inside the - kernel, without continually transferring them between the kernel - and user space. - </para> -!Ffs/splice.c - </chapter> - - <chapter id="pipes"> - <title>pipes API</title> - <para> - Pipe interfaces are all for in-kernel (builtin image) use. - They are not exported for use by modules. - </para> -!Iinclude/linux/pipe_fs_i.h -!Ffs/pipe.c - </chapter> - -</book> diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl deleted file mode 100644 index da5c087462b1..000000000000 --- a/Documentation/DocBook/kernel-hacking.tmpl +++ /dev/null @@ -1,1312 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="lk-hacking-guide"> - <bookinfo> - <title>Unreliable Guide To Hacking The Linux Kernel</title> - - <authorgroup> - <author> - <firstname>Rusty</firstname> - <surname>Russell</surname> - <affiliation> - <address> - <email>rusty@rustcorp.com.au</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2005</year> - <holder>Rusty Russell</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - - <releaseinfo> - This is the first release of this document as part of the kernel tarball. - </releaseinfo> - - </bookinfo> - - <toc></toc> - - <chapter id="introduction"> - <title>Introduction</title> - <para> - Welcome, gentle reader, to Rusty's Remarkably Unreliable Guide to Linux - Kernel Hacking. This document describes the common routines and - general requirements for kernel code: its goal is to serve as a - primer for Linux kernel development for experienced C - programmers. I avoid implementation details: that's what the - code is for, and I ignore whole tracts of useful routines. - </para> - <para> - Before you read this, please understand that I never wanted to - write this document, being grossly under-qualified, but I always - wanted to read it, and this was the only way. I hope it will - grow into a compendium of best practice, common starting points - and random information. - </para> - </chapter> - - <chapter id="basic-players"> - <title>The Players</title> - - <para> - At any time each of the CPUs in a system can be: - </para> - - <itemizedlist> - <listitem> - <para> - not associated with any process, serving a hardware interrupt; - </para> - </listitem> - - <listitem> - <para> - not associated with any process, serving a softirq or tasklet; - </para> - </listitem> - - <listitem> - <para> - running in kernel space, associated with a process (user context); - </para> - </listitem> - - <listitem> - <para> - running a process in user space. - </para> - </listitem> - </itemizedlist> - - <para> - There is an ordering between these. The bottom two can preempt - each other, but above that is a strict hierarchy: each can only be - preempted by the ones above it. For example, while a softirq is - running on a CPU, no other softirq will preempt it, but a hardware - interrupt can. However, any other CPUs in the system execute - independently. - </para> - - <para> - We'll see a number of ways that the user context can block - interrupts, to become truly non-preemptable. - </para> - - <sect1 id="basics-usercontext"> - <title>User Context</title> - - <para> - User context is when you are coming in from a system call or other - trap: like userspace, you can be preempted by more important tasks - and by interrupts. You can sleep, by calling - <function>schedule()</function>. - </para> - - <note> - <para> - You are always in user context on module load and unload, - and on operations on the block device layer. - </para> - </note> - - <para> - In user context, the <varname>current</varname> pointer (indicating - the task we are currently executing) is valid, and - <function>in_interrupt()</function> - (<filename>include/linux/interrupt.h</filename>) is <returnvalue>false - </returnvalue>. - </para> - - <caution> - <para> - Beware that if you have preemption or softirqs disabled - (see below), <function>in_interrupt()</function> will return a - false positive. - </para> - </caution> - </sect1> - - <sect1 id="basics-hardirqs"> - <title>Hardware Interrupts (Hard IRQs)</title> - - <para> - Timer ticks, <hardware>network cards</hardware> and - <hardware>keyboard</hardware> are examples of real - hardware which produce interrupts at any time. The kernel runs - interrupt handlers, which services the hardware. The kernel - guarantees that this handler is never re-entered: if the same - interrupt arrives, it is queued (or dropped). Because it - disables interrupts, this handler has to be fast: frequently it - simply acknowledges the interrupt, marks a 'software interrupt' - for execution and exits. - </para> - - <para> - You can tell you are in a hardware interrupt, because - <function>in_irq()</function> returns <returnvalue>true</returnvalue>. - </para> - <caution> - <para> - Beware that this will return a false positive if interrupts are disabled - (see below). - </para> - </caution> - </sect1> - - <sect1 id="basics-softirqs"> - <title>Software Interrupt Context: Softirqs and Tasklets</title> - - <para> - Whenever a system call is about to return to userspace, or a - hardware interrupt handler exits, any 'software interrupts' - which are marked pending (usually by hardware interrupts) are - run (<filename>kernel/softirq.c</filename>). - </para> - - <para> - Much of the real interrupt handling work is done here. Early in - the transition to <acronym>SMP</acronym>, there were only 'bottom - halves' (BHs), which didn't take advantage of multiple CPUs. Shortly - after we switched from wind-up computers made of match-sticks and snot, - we abandoned this limitation and switched to 'softirqs'. - </para> - - <para> - <filename class="headerfile">include/linux/interrupt.h</filename> lists the - different softirqs. A very important softirq is the - timer softirq (<filename - class="headerfile">include/linux/timer.h</filename>): you can - register to have it call functions for you in a given length of - time. - </para> - - <para> - Softirqs are often a pain to deal with, since the same softirq - will run simultaneously on more than one CPU. For this reason, - tasklets (<filename - class="headerfile">include/linux/interrupt.h</filename>) are more - often used: they are dynamically-registrable (meaning you can have - as many as you want), and they also guarantee that any tasklet - will only run on one CPU at any time, although different tasklets - can run simultaneously. - </para> - <caution> - <para> - The name 'tasklet' is misleading: they have nothing to do with 'tasks', - and probably more to do with some bad vodka Alexey Kuznetsov had at the - time. - </para> - </caution> - - <para> - You can tell you are in a softirq (or tasklet) - using the <function>in_softirq()</function> macro - (<filename class="headerfile">include/linux/interrupt.h</filename>). - </para> - <caution> - <para> - Beware that this will return a false positive if a bh lock (see below) - is held. - </para> - </caution> - </sect1> - </chapter> - - <chapter id="basic-rules"> - <title>Some Basic Rules</title> - - <variablelist> - <varlistentry> - <term>No memory protection</term> - <listitem> - <para> - If you corrupt memory, whether in user context or - interrupt context, the whole machine will crash. Are you - sure you can't do what you want in userspace? - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term>No floating point or <acronym>MMX</acronym></term> - <listitem> - <para> - The <acronym>FPU</acronym> context is not saved; even in user - context the <acronym>FPU</acronym> state probably won't - correspond with the current process: you would mess with some - user process' <acronym>FPU</acronym> state. If you really want - to do this, you would have to explicitly save/restore the full - <acronym>FPU</acronym> state (and avoid context switches). It - is generally a bad idea; use fixed point arithmetic first. - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term>A rigid stack limit</term> - <listitem> - <para> - Depending on configuration options the kernel stack is about 3K to 6K for most 32-bit architectures: it's - about 14K on most 64-bit archs, and often shared with interrupts - so you can't use it all. Avoid deep recursion and huge local - arrays on the stack (allocate them dynamically instead). - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term>The Linux kernel is portable</term> - <listitem> - <para> - Let's keep it that way. Your code should be 64-bit clean, - and endian-independent. You should also minimize CPU - specific stuff, e.g. inline assembly should be cleanly - encapsulated and minimized to ease porting. Generally it - should be restricted to the architecture-dependent part of - the kernel tree. - </para> - </listitem> - </varlistentry> - </variablelist> - </chapter> - - <chapter id="ioctls"> - <title>ioctls: Not writing a new system call</title> - - <para> - A system call generally looks like this - </para> - - <programlisting> -asmlinkage long sys_mycall(int arg) -{ - return 0; -} - </programlisting> - - <para> - First, in most cases you don't want to create a new system call. - You create a character device and implement an appropriate ioctl - for it. This is much more flexible than system calls, doesn't have - to be entered in every architecture's - <filename class="headerfile">include/asm/unistd.h</filename> and - <filename>arch/kernel/entry.S</filename> file, and is much more - likely to be accepted by Linus. - </para> - - <para> - If all your routine does is read or write some parameter, consider - implementing a <function>sysfs</function> interface instead. - </para> - - <para> - Inside the ioctl you're in user context to a process. When a - error occurs you return a negated errno (see - <filename class="headerfile">include/linux/errno.h</filename>), - otherwise you return <returnvalue>0</returnvalue>. - </para> - - <para> - After you slept you should check if a signal occurred: the - Unix/Linux way of handling signals is to temporarily exit the - system call with the <constant>-ERESTARTSYS</constant> error. The - system call entry code will switch back to user context, process - the signal handler and then your system call will be restarted - (unless the user disabled that). So you should be prepared to - process the restart, e.g. if you're in the middle of manipulating - some data structure. - </para> - - <programlisting> -if (signal_pending(current)) - return -ERESTARTSYS; - </programlisting> - - <para> - If you're doing longer computations: first think userspace. If you - <emphasis>really</emphasis> want to do it in kernel you should - regularly check if you need to give up the CPU (remember there is - cooperative multitasking per CPU). Idiom: - </para> - - <programlisting> -cond_resched(); /* Will sleep */ - </programlisting> - - <para> - A short note on interface design: the UNIX system call motto is - "Provide mechanism not policy". - </para> - </chapter> - - <chapter id="deadlock-recipes"> - <title>Recipes for Deadlock</title> - - <para> - You cannot call any routines which may sleep, unless: - </para> - <itemizedlist> - <listitem> - <para> - You are in user context. - </para> - </listitem> - - <listitem> - <para> - You do not own any spinlocks. - </para> - </listitem> - - <listitem> - <para> - You have interrupts enabled (actually, Andi Kleen says - that the scheduling code will enable them for you, but - that's probably not what you wanted). - </para> - </listitem> - </itemizedlist> - - <para> - Note that some functions may sleep implicitly: common ones are - the user space access functions (*_user) and memory allocation - functions without <symbol>GFP_ATOMIC</symbol>. - </para> - - <para> - You should always compile your kernel - <symbol>CONFIG_DEBUG_ATOMIC_SLEEP</symbol> on, and it will warn - you if you break these rules. If you <emphasis>do</emphasis> break - the rules, you will eventually lock up your box. - </para> - - <para> - Really. - </para> - </chapter> - - <chapter id="common-routines"> - <title>Common Routines</title> - - <sect1 id="routines-printk"> - <title> - <function>printk()</function> - <filename class="headerfile">include/linux/kernel.h</filename> - </title> - - <para> - <function>printk()</function> feeds kernel messages to the - console, dmesg, and the syslog daemon. It is useful for debugging - and reporting errors, and can be used inside interrupt context, - but use with caution: a machine which has its console flooded with - printk messages is unusable. It uses a format string mostly - compatible with ANSI C printf, and C string concatenation to give - it a first "priority" argument: - </para> - - <programlisting> -printk(KERN_INFO "i = %u\n", i); - </programlisting> - - <para> - See <filename class="headerfile">include/linux/kernel.h</filename>; - for other KERN_ values; these are interpreted by syslog as the - level. Special case: for printing an IP address use - </para> - - <programlisting> -__be32 ipaddress; -printk(KERN_INFO "my ip: %pI4\n", &ipaddress); - </programlisting> - - <para> - <function>printk()</function> internally uses a 1K buffer and does - not catch overruns. Make sure that will be enough. - </para> - - <note> - <para> - You will know when you are a real kernel hacker - when you start typoing printf as printk in your user programs :) - </para> - </note> - - <!--- From the Lions book reader department --> - - <note> - <para> - Another sidenote: the original Unix Version 6 sources had a - comment on top of its printf function: "Printf should not be - used for chit-chat". You should follow that advice. - </para> - </note> - </sect1> - - <sect1 id="routines-copy"> - <title> - <function>copy_[to/from]_user()</function> - / - <function>get_user()</function> - / - <function>put_user()</function> - <filename class="headerfile">include/linux/uaccess.h</filename> - </title> - - <para> - <emphasis>[SLEEPS]</emphasis> - </para> - - <para> - <function>put_user()</function> and <function>get_user()</function> - are used to get and put single values (such as an int, char, or - long) from and to userspace. A pointer into userspace should - never be simply dereferenced: data should be copied using these - routines. Both return <constant>-EFAULT</constant> or 0. - </para> - <para> - <function>copy_to_user()</function> and - <function>copy_from_user()</function> are more general: they copy - an arbitrary amount of data to and from userspace. - <caution> - <para> - Unlike <function>put_user()</function> and - <function>get_user()</function>, they return the amount of - uncopied data (ie. <returnvalue>0</returnvalue> still means - success). - </para> - </caution> - [Yes, this moronic interface makes me cringe. The flamewar comes up every year or so. --RR.] - </para> - <para> - The functions may sleep implicitly. This should never be called - outside user context (it makes no sense), with interrupts - disabled, or a spinlock held. - </para> - </sect1> - - <sect1 id="routines-kmalloc"> - <title><function>kmalloc()</function>/<function>kfree()</function> - <filename class="headerfile">include/linux/slab.h</filename></title> - - <para> - <emphasis>[MAY SLEEP: SEE BELOW]</emphasis> - </para> - - <para> - These routines are used to dynamically request pointer-aligned - chunks of memory, like malloc and free do in userspace, but - <function>kmalloc()</function> takes an extra flag word. - Important values: - </para> - - <variablelist> - <varlistentry> - <term> - <constant> - GFP_KERNEL - </constant> - </term> - <listitem> - <para> - May sleep and swap to free memory. Only allowed in user - context, but is the most reliable way to allocate memory. - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term> - <constant> - GFP_ATOMIC - </constant> - </term> - <listitem> - <para> - Don't sleep. Less reliable than <constant>GFP_KERNEL</constant>, - but may be called from interrupt context. You should - <emphasis>really</emphasis> have a good out-of-memory - error-handling strategy. - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term> - <constant> - GFP_DMA - </constant> - </term> - <listitem> - <para> - Allocate ISA DMA lower than 16MB. If you don't know what that - is you don't need it. Very unreliable. - </para> - </listitem> - </varlistentry> - </variablelist> - - <para> - If you see a <errorname>sleeping function called from invalid - context</errorname> warning message, then maybe you called a - sleeping allocation function from interrupt context without - <constant>GFP_ATOMIC</constant>. You should really fix that. - Run, don't walk. - </para> - - <para> - If you are allocating at least <constant>PAGE_SIZE</constant> - (<filename class="headerfile">include/asm/page.h</filename>) bytes, - consider using <function>__get_free_pages()</function> - - (<filename class="headerfile">include/linux/mm.h</filename>). It - takes an order argument (0 for page sized, 1 for double page, 2 - for four pages etc.) and the same memory priority flag word as - above. - </para> - - <para> - If you are allocating more than a page worth of bytes you can use - <function>vmalloc()</function>. It'll allocate virtual memory in - the kernel map. This block is not contiguous in physical memory, - but the <acronym>MMU</acronym> makes it look like it is for you - (so it'll only look contiguous to the CPUs, not to external device - drivers). If you really need large physically contiguous memory - for some weird device, you have a problem: it is poorly supported - in Linux because after some time memory fragmentation in a running - kernel makes it hard. The best way is to allocate the block early - in the boot process via the <function>alloc_bootmem()</function> - routine. - </para> - - <para> - Before inventing your own cache of often-used objects consider - using a slab cache in - <filename class="headerfile">include/linux/slab.h</filename> - </para> - </sect1> - - <sect1 id="routines-current"> - <title><function>current</function> - <filename class="headerfile">include/asm/current.h</filename></title> - - <para> - This global variable (really a macro) contains a pointer to - the current task structure, so is only valid in user context. - For example, when a process makes a system call, this will - point to the task structure of the calling process. It is - <emphasis>not NULL</emphasis> in interrupt context. - </para> - </sect1> - - <sect1 id="routines-udelay"> - <title><function>mdelay()</function>/<function>udelay()</function> - <filename class="headerfile">include/asm/delay.h</filename> - <filename class="headerfile">include/linux/delay.h</filename> - </title> - - <para> - The <function>udelay()</function> and <function>ndelay()</function> functions can be used for small pauses. - Do not use large values with them as you risk - overflow - the helper function <function>mdelay()</function> is useful - here, or consider <function>msleep()</function>. - </para> - </sect1> - - <sect1 id="routines-endian"> - <title><function>cpu_to_be32()</function>/<function>be32_to_cpu()</function>/<function>cpu_to_le32()</function>/<function>le32_to_cpu()</function> - <filename class="headerfile">include/asm/byteorder.h</filename> - </title> - - <para> - The <function>cpu_to_be32()</function> family (where the "32" can - be replaced by 64 or 16, and the "be" can be replaced by "le") are - the general way to do endian conversions in the kernel: they - return the converted value. All variations supply the reverse as - well: <function>be32_to_cpu()</function>, etc. - </para> - - <para> - There are two major variations of these functions: the pointer - variation, such as <function>cpu_to_be32p()</function>, which take - a pointer to the given type, and return the converted value. The - other variation is the "in-situ" family, such as - <function>cpu_to_be32s()</function>, which convert value referred - to by the pointer, and return void. - </para> - </sect1> - - <sect1 id="routines-local-irqs"> - <title><function>local_irq_save()</function>/<function>local_irq_restore()</function> - <filename class="headerfile">include/linux/irqflags.h</filename> - </title> - - <para> - These routines disable hard interrupts on the local CPU, and - restore them. They are reentrant; saving the previous state in - their one <varname>unsigned long flags</varname> argument. If you - know that interrupts are enabled, you can simply use - <function>local_irq_disable()</function> and - <function>local_irq_enable()</function>. - </para> - </sect1> - - <sect1 id="routines-softirqs"> - <title><function>local_bh_disable()</function>/<function>local_bh_enable()</function> - <filename class="headerfile">include/linux/interrupt.h</filename></title> - - <para> - These routines disable soft interrupts on the local CPU, and - restore them. They are reentrant; if soft interrupts were - disabled before, they will still be disabled after this pair - of functions has been called. They prevent softirqs and tasklets - from running on the current CPU. - </para> - </sect1> - - <sect1 id="routines-processorids"> - <title><function>smp_processor_id</function>() - <filename class="headerfile">include/asm/smp.h</filename></title> - - <para> - <function>get_cpu()</function> disables preemption (so you won't - suddenly get moved to another CPU) and returns the current - processor number, between 0 and <symbol>NR_CPUS</symbol>. Note - that the CPU numbers are not necessarily continuous. You return - it again with <function>put_cpu()</function> when you are done. - </para> - <para> - If you know you cannot be preempted by another task (ie. you are - in interrupt context, or have preemption disabled) you can use - smp_processor_id(). - </para> - </sect1> - - <sect1 id="routines-init"> - <title><type>__init</type>/<type>__exit</type>/<type>__initdata</type> - <filename class="headerfile">include/linux/init.h</filename></title> - - <para> - After boot, the kernel frees up a special section; functions - marked with <type>__init</type> and data structures marked with - <type>__initdata</type> are dropped after boot is complete: similarly - modules discard this memory after initialization. <type>__exit</type> - is used to declare a function which is only required on exit: the - function will be dropped if this file is not compiled as a module. - See the header file for use. Note that it makes no sense for a function - marked with <type>__init</type> to be exported to modules with - <function>EXPORT_SYMBOL()</function> - this will break. - </para> - - </sect1> - - <sect1 id="routines-init-again"> - <title><function>__initcall()</function>/<function>module_init()</function> - <filename class="headerfile">include/linux/init.h</filename></title> - <para> - Many parts of the kernel are well served as a module - (dynamically-loadable parts of the kernel). Using the - <function>module_init()</function> and - <function>module_exit()</function> macros it is easy to write code - without #ifdefs which can operate both as a module or built into - the kernel. - </para> - - <para> - The <function>module_init()</function> macro defines which - function is to be called at module insertion time (if the file is - compiled as a module), or at boot time: if the file is not - compiled as a module the <function>module_init()</function> macro - becomes equivalent to <function>__initcall()</function>, which - through linker magic ensures that the function is called on boot. - </para> - - <para> - The function can return a negative error number to cause - module loading to fail (unfortunately, this has no effect if - the module is compiled into the kernel). This function is - called in user context with interrupts enabled, so it can sleep. - </para> - </sect1> - - <sect1 id="routines-moduleexit"> - <title> <function>module_exit()</function> - <filename class="headerfile">include/linux/init.h</filename> </title> - - <para> - This macro defines the function to be called at module removal - time (or never, in the case of the file compiled into the - kernel). It will only be called if the module usage count has - reached zero. This function can also sleep, but cannot fail: - everything must be cleaned up by the time it returns. - </para> - - <para> - Note that this macro is optional: if it is not present, your - module will not be removable (except for 'rmmod -f'). - </para> - </sect1> - - <sect1 id="routines-module-use-counters"> - <title> <function>try_module_get()</function>/<function>module_put()</function> - <filename class="headerfile">include/linux/module.h</filename></title> - - <para> - These manipulate the module usage count, to protect against - removal (a module also can't be removed if another module uses one - of its exported symbols: see below). Before calling into module - code, you should call <function>try_module_get()</function> on - that module: if it fails, then the module is being removed and you - should act as if it wasn't there. Otherwise, you can safely enter - the module, and call <function>module_put()</function> when you're - finished. - </para> - - <para> - Most registerable structures have an - <structfield>owner</structfield> field, such as in the - <structname>file_operations</structname> structure. Set this field - to the macro <symbol>THIS_MODULE</symbol>. - </para> - </sect1> - - <!-- add info on new-style module refcounting here --> - </chapter> - - <chapter id="queues"> - <title>Wait Queues - <filename class="headerfile">include/linux/wait.h</filename> - </title> - <para> - <emphasis>[SLEEPS]</emphasis> - </para> - - <para> - A wait queue is used to wait for someone to wake you up when a - certain condition is true. They must be used carefully to ensure - there is no race condition. You declare a - <type>wait_queue_head_t</type>, and then processes which want to - wait for that condition declare a <type>wait_queue_t</type> - referring to themselves, and place that in the queue. - </para> - - <sect1 id="queue-declaring"> - <title>Declaring</title> - - <para> - You declare a <type>wait_queue_head_t</type> using the - <function>DECLARE_WAIT_QUEUE_HEAD()</function> macro, or using the - <function>init_waitqueue_head()</function> routine in your - initialization code. - </para> - </sect1> - - <sect1 id="queue-waitqueue"> - <title>Queuing</title> - - <para> - Placing yourself in the waitqueue is fairly complex, because you - must put yourself in the queue before checking the condition. - There is a macro to do this: - <function>wait_event_interruptible()</function> - - <filename class="headerfile">include/linux/wait.h</filename> The - first argument is the wait queue head, and the second is an - expression which is evaluated; the macro returns - <returnvalue>0</returnvalue> when this expression is true, or - <returnvalue>-ERESTARTSYS</returnvalue> if a signal is received. - The <function>wait_event()</function> version ignores signals. - </para> - - </sect1> - - <sect1 id="queue-waking"> - <title>Waking Up Queued Tasks</title> - - <para> - Call <function>wake_up()</function> - - <filename class="headerfile">include/linux/wait.h</filename>;, - which will wake up every process in the queue. The exception is - if one has <constant>TASK_EXCLUSIVE</constant> set, in which case - the remainder of the queue will not be woken. There are other variants - of this basic function available in the same header. - </para> - </sect1> - </chapter> - - <chapter id="atomic-ops"> - <title>Atomic Operations</title> - - <para> - Certain operations are guaranteed atomic on all platforms. The - first class of operations work on <type>atomic_t</type> - - <filename class="headerfile">include/asm/atomic.h</filename>; this - contains a signed integer (at least 32 bits long), and you must use - these functions to manipulate or read atomic_t variables. - <function>atomic_read()</function> and - <function>atomic_set()</function> get and set the counter, - <function>atomic_add()</function>, - <function>atomic_sub()</function>, - <function>atomic_inc()</function>, - <function>atomic_dec()</function>, and - <function>atomic_dec_and_test()</function> (returns - <returnvalue>true</returnvalue> if it was decremented to zero). - </para> - - <para> - Yes. It returns <returnvalue>true</returnvalue> (i.e. != 0) if the - atomic variable is zero. - </para> - - <para> - Note that these functions are slower than normal arithmetic, and - so should not be used unnecessarily. - </para> - - <para> - The second class of atomic operations is atomic bit operations on an - <type>unsigned long</type>, defined in - - <filename class="headerfile">include/linux/bitops.h</filename>. These - operations generally take a pointer to the bit pattern, and a bit - number: 0 is the least significant bit. - <function>set_bit()</function>, <function>clear_bit()</function> - and <function>change_bit()</function> set, clear, and flip the - given bit. <function>test_and_set_bit()</function>, - <function>test_and_clear_bit()</function> and - <function>test_and_change_bit()</function> do the same thing, - except return true if the bit was previously set; these are - particularly useful for atomically setting flags. - </para> - - <para> - It is possible to call these operations with bit indices greater - than BITS_PER_LONG. The resulting behavior is strange on big-endian - platforms though so it is a good idea not to do this. - </para> - </chapter> - - <chapter id="symbols"> - <title>Symbols</title> - - <para> - Within the kernel proper, the normal linking rules apply - (ie. unless a symbol is declared to be file scope with the - <type>static</type> keyword, it can be used anywhere in the - kernel). However, for modules, a special exported symbol table is - kept which limits the entry points to the kernel proper. Modules - can also export symbols. - </para> - - <sect1 id="sym-exportsymbols"> - <title><function>EXPORT_SYMBOL()</function> - <filename class="headerfile">include/linux/export.h</filename></title> - - <para> - This is the classic method of exporting a symbol: dynamically - loaded modules will be able to use the symbol as normal. - </para> - </sect1> - - <sect1 id="sym-exportsymbols-gpl"> - <title><function>EXPORT_SYMBOL_GPL()</function> - <filename class="headerfile">include/linux/export.h</filename></title> - - <para> - Similar to <function>EXPORT_SYMBOL()</function> except that the - symbols exported by <function>EXPORT_SYMBOL_GPL()</function> can - only be seen by modules with a - <function>MODULE_LICENSE()</function> that specifies a GPL - compatible license. It implies that the function is considered - an internal implementation issue, and not really an interface. - Some maintainers and developers may however - require EXPORT_SYMBOL_GPL() when adding any new APIs or functionality. - </para> - </sect1> - </chapter> - - <chapter id="conventions"> - <title>Routines and Conventions</title> - - <sect1 id="conventions-doublelinkedlist"> - <title>Double-linked lists - <filename class="headerfile">include/linux/list.h</filename></title> - - <para> - There used to be three sets of linked-list routines in the kernel - headers, but this one is the winner. If you don't have some - particular pressing need for a single list, it's a good choice. - </para> - - <para> - In particular, <function>list_for_each_entry</function> is useful. - </para> - </sect1> - - <sect1 id="convention-returns"> - <title>Return Conventions</title> - - <para> - For code called in user context, it's very common to defy C - convention, and return <returnvalue>0</returnvalue> for success, - and a negative error number - (eg. <returnvalue>-EFAULT</returnvalue>) for failure. This can be - unintuitive at first, but it's fairly widespread in the kernel. - </para> - - <para> - Using <function>ERR_PTR()</function> - - <filename class="headerfile">include/linux/err.h</filename>; to - encode a negative error number into a pointer, and - <function>IS_ERR()</function> and <function>PTR_ERR()</function> - to get it back out again: avoids a separate pointer parameter for - the error number. Icky, but in a good way. - </para> - </sect1> - - <sect1 id="conventions-borkedcompile"> - <title>Breaking Compilation</title> - - <para> - Linus and the other developers sometimes change function or - structure names in development kernels; this is not done just to - keep everyone on their toes: it reflects a fundamental change - (eg. can no longer be called with interrupts on, or does extra - checks, or doesn't do checks which were caught before). Usually - this is accompanied by a fairly complete note to the linux-kernel - mailing list; search the archive. Simply doing a global replace - on the file usually makes things <emphasis>worse</emphasis>. - </para> - </sect1> - - <sect1 id="conventions-initialising"> - <title>Initializing structure members</title> - - <para> - The preferred method of initializing structures is to use - designated initialisers, as defined by ISO C99, eg: - </para> - <programlisting> -static struct block_device_operations opt_fops = { - .open = opt_open, - .release = opt_release, - .ioctl = opt_ioctl, - .check_media_change = opt_media_change, -}; - </programlisting> - <para> - This makes it easy to grep for, and makes it clear which - structure fields are set. You should do this because it looks - cool. - </para> - </sect1> - - <sect1 id="conventions-gnu-extns"> - <title>GNU Extensions</title> - - <para> - GNU Extensions are explicitly allowed in the Linux kernel. - Note that some of the more complex ones are not very well - supported, due to lack of general use, but the following are - considered standard (see the GCC info page section "C - Extensions" for more details - Yes, really the info page, the - man page is only a short summary of the stuff in info). - </para> - <itemizedlist> - <listitem> - <para> - Inline functions - </para> - </listitem> - <listitem> - <para> - Statement expressions (ie. the ({ and }) constructs). - </para> - </listitem> - <listitem> - <para> - Declaring attributes of a function / variable / type - (__attribute__) - </para> - </listitem> - <listitem> - <para> - typeof - </para> - </listitem> - <listitem> - <para> - Zero length arrays - </para> - </listitem> - <listitem> - <para> - Macro varargs - </para> - </listitem> - <listitem> - <para> - Arithmetic on void pointers - </para> - </listitem> - <listitem> - <para> - Non-Constant initializers - </para> - </listitem> - <listitem> - <para> - Assembler Instructions (not outside arch/ and include/asm/) - </para> - </listitem> - <listitem> - <para> - Function names as strings (__func__). - </para> - </listitem> - <listitem> - <para> - __builtin_constant_p() - </para> - </listitem> - </itemizedlist> - - <para> - Be wary when using long long in the kernel, the code gcc generates for - it is horrible and worse: division and multiplication does not work - on i386 because the GCC runtime functions for it are missing from - the kernel environment. - </para> - - <!-- FIXME: add a note about ANSI aliasing cleanness --> - </sect1> - - <sect1 id="conventions-cplusplus"> - <title>C++</title> - - <para> - Using C++ in the kernel is usually a bad idea, because the - kernel does not provide the necessary runtime environment - and the include files are not tested for it. It is still - possible, but not recommended. If you really want to do - this, forget about exceptions at least. - </para> - </sect1> - - <sect1 id="conventions-ifdef"> - <title>#if</title> - - <para> - It is generally considered cleaner to use macros in header files - (or at the top of .c files) to abstract away functions rather than - using `#if' pre-processor statements throughout the source code. - </para> - </sect1> - </chapter> - - <chapter id="submitting"> - <title>Putting Your Stuff in the Kernel</title> - - <para> - In order to get your stuff into shape for official inclusion, or - even to make a neat patch, there's administrative work to be - done: - </para> - <itemizedlist> - <listitem> - <para> - Figure out whose pond you've been pissing in. Look at the top of - the source files, inside the <filename>MAINTAINERS</filename> - file, and last of all in the <filename>CREDITS</filename> file. - You should coordinate with this person to make sure you're not - duplicating effort, or trying something that's already been - rejected. - </para> - - <para> - Make sure you put your name and EMail address at the top of - any files you create or mangle significantly. This is the - first place people will look when they find a bug, or when - <emphasis>they</emphasis> want to make a change. - </para> - </listitem> - - <listitem> - <para> - Usually you want a configuration option for your kernel hack. - Edit <filename>Kconfig</filename> in the appropriate directory. - The Config language is simple to use by cut and paste, and there's - complete documentation in - <filename>Documentation/kbuild/kconfig-language.txt</filename>. - </para> - - <para> - In your description of the option, make sure you address both the - expert user and the user who knows nothing about your feature. Mention - incompatibilities and issues here. <emphasis> Definitely - </emphasis> end your description with <quote> if in doubt, say N - </quote> (or, occasionally, `Y'); this is for people who have no - idea what you are talking about. - </para> - </listitem> - - <listitem> - <para> - Edit the <filename>Makefile</filename>: the CONFIG variables are - exported here so you can usually just add a "obj-$(CONFIG_xxx) += - xxx.o" line. The syntax is documented in - <filename>Documentation/kbuild/makefiles.txt</filename>. - </para> - </listitem> - - <listitem> - <para> - Put yourself in <filename>CREDITS</filename> if you've done - something noteworthy, usually beyond a single file (your name - should be at the top of the source files anyway). - <filename>MAINTAINERS</filename> means you want to be consulted - when changes are made to a subsystem, and hear about bugs; it - implies a more-than-passing commitment to some part of the code. - </para> - </listitem> - - <listitem> - <para> - Finally, don't forget to read <filename>Documentation/process/submitting-patches.rst</filename> - and possibly <filename>Documentation/process/submitting-drivers.rst</filename>. - </para> - </listitem> - </itemizedlist> - </chapter> - - <chapter id="cantrips"> - <title>Kernel Cantrips</title> - - <para> - Some favorites from browsing the source. Feel free to add to this - list. - </para> - - <para> - <filename>arch/x86/include/asm/delay.h:</filename> - </para> - <programlisting> -#define ndelay(n) (__builtin_constant_p(n) ? \ - ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \ - __ndelay(n)) - </programlisting> - - <para> - <filename>include/linux/fs.h</filename>: - </para> - <programlisting> -/* - * Kernel pointers have redundant information, so we can use a - * scheme where we can return either an error code or a dentry - * pointer with the same return value. - * - * This should be a per-architecture thing, to allow different - * error and pointer decisions. - */ - #define ERR_PTR(err) ((void *)((long)(err))) - #define PTR_ERR(ptr) ((long)(ptr)) - #define IS_ERR(ptr) ((unsigned long)(ptr) > (unsigned long)(-1000)) -</programlisting> - - <para> - <filename>arch/x86/include/asm/uaccess_32.h:</filename> - </para> - - <programlisting> -#define copy_to_user(to,from,n) \ - (__builtin_constant_p(n) ? \ - __constant_copy_to_user((to),(from),(n)) : \ - __generic_copy_to_user((to),(from),(n))) - </programlisting> - - <para> - <filename>arch/sparc/kernel/head.S:</filename> - </para> - - <programlisting> -/* - * Sun people can't spell worth damn. "compatability" indeed. - * At least we *know* we can't spell, and use a spell-checker. - */ - -/* Uh, actually Linus it is I who cannot spell. Too much murky - * Sparc assembly will do this to ya. - */ -C_LABEL(cputypvar): - .asciz "compatibility" - -/* Tested on SS-5, SS-10. Probably someone at Sun applied a spell-checker. */ - .align 4 -C_LABEL(cputypvar_sun4m): - .asciz "compatible" - </programlisting> - - <para> - <filename>arch/sparc/lib/checksum.S:</filename> - </para> - - <programlisting> - /* Sun, you just can't beat me, you just can't. Stop trying, - * give up. I'm serious, I am going to kick the living shit - * out of you, game over, lights out. - */ - </programlisting> - </chapter> - - <chapter id="credits"> - <title>Thanks</title> - - <para> - Thanks to Andi Kleen for the idea, answering my questions, fixing - my mistakes, filling content, etc. Philipp Rumpf for more spelling - and clarity fixes, and some excellent non-obvious points. Werner - Almesberger for giving me a great summary of - <function>disable_irq()</function>, and Jes Sorensen and Andrea - Arcangeli added caveats. Michael Elizabeth Chastain for checking - and adding to the Configure section. <!-- Rusty insisted on this - bit; I didn't do it! --> Telsa Gwynne for teaching me DocBook. - </para> - </chapter> -</book> - diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl deleted file mode 100644 index 7c9cc4846cb6..000000000000 --- a/Documentation/DocBook/kernel-locking.tmpl +++ /dev/null @@ -1,2151 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="LKLockingGuide"> - <bookinfo> - <title>Unreliable Guide To Locking</title> - - <authorgroup> - <author> - <firstname>Rusty</firstname> - <surname>Russell</surname> - <affiliation> - <address> - <email>rusty@rustcorp.com.au</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2003</year> - <holder>Rusty Russell</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - - <toc></toc> - <chapter id="intro"> - <title>Introduction</title> - <para> - Welcome, to Rusty's Remarkably Unreliable Guide to Kernel - Locking issues. This document describes the locking systems in - the Linux Kernel in 2.6. - </para> - <para> - With the wide availability of HyperThreading, and <firstterm - linkend="gloss-preemption">preemption </firstterm> in the Linux - Kernel, everyone hacking on the kernel needs to know the - fundamentals of concurrency and locking for - <firstterm linkend="gloss-smp"><acronym>SMP</acronym></firstterm>. - </para> - </chapter> - - <chapter id="races"> - <title>The Problem With Concurrency</title> - <para> - (Skip this if you know what a Race Condition is). - </para> - <para> - In a normal program, you can increment a counter like so: - </para> - <programlisting> - very_important_count++; - </programlisting> - - <para> - This is what they would expect to happen: - </para> - - <table> - <title>Expected Results</title> - - <tgroup cols="2" align="left"> - - <thead> - <row> - <entry>Instance 1</entry> - <entry>Instance 2</entry> - </row> - </thead> - - <tbody> - <row> - <entry>read very_important_count (5)</entry> - <entry></entry> - </row> - <row> - <entry>add 1 (6)</entry> - <entry></entry> - </row> - <row> - <entry>write very_important_count (6)</entry> - <entry></entry> - </row> - <row> - <entry></entry> - <entry>read very_important_count (6)</entry> - </row> - <row> - <entry></entry> - <entry>add 1 (7)</entry> - </row> - <row> - <entry></entry> - <entry>write very_important_count (7)</entry> - </row> - </tbody> - - </tgroup> - </table> - - <para> - This is what might happen: - </para> - - <table> - <title>Possible Results</title> - - <tgroup cols="2" align="left"> - <thead> - <row> - <entry>Instance 1</entry> - <entry>Instance 2</entry> - </row> - </thead> - - <tbody> - <row> - <entry>read very_important_count (5)</entry> - <entry></entry> - </row> - <row> - <entry></entry> - <entry>read very_important_count (5)</entry> - </row> - <row> - <entry>add 1 (6)</entry> - <entry></entry> - </row> - <row> - <entry></entry> - <entry>add 1 (6)</entry> - </row> - <row> - <entry>write very_important_count (6)</entry> - <entry></entry> - </row> - <row> - <entry></entry> - <entry>write very_important_count (6)</entry> - </row> - </tbody> - </tgroup> - </table> - - <sect1 id="race-condition"> - <title>Race Conditions and Critical Regions</title> - <para> - This overlap, where the result depends on the - relative timing of multiple tasks, is called a <firstterm>race condition</firstterm>. - The piece of code containing the concurrency issue is called a - <firstterm>critical region</firstterm>. And especially since Linux starting running - on SMP machines, they became one of the major issues in kernel - design and implementation. - </para> - <para> - Preemption can have the same effect, even if there is only one - CPU: by preempting one task during the critical region, we have - exactly the same race condition. In this case the thread which - preempts might run the critical region itself. - </para> - <para> - The solution is to recognize when these simultaneous accesses - occur, and use locks to make sure that only one instance can - enter the critical region at any time. There are many - friendly primitives in the Linux kernel to help you do this. - And then there are the unfriendly primitives, but I'll pretend - they don't exist. - </para> - </sect1> - </chapter> - - <chapter id="locks"> - <title>Locking in the Linux Kernel</title> - - <para> - If I could give you one piece of advice: never sleep with anyone - crazier than yourself. But if I had to give you advice on - locking: <emphasis>keep it simple</emphasis>. - </para> - - <para> - Be reluctant to introduce new locks. - </para> - - <para> - Strangely enough, this last one is the exact reverse of my advice when - you <emphasis>have</emphasis> slept with someone crazier than yourself. - And you should think about getting a big dog. - </para> - - <sect1 id="lock-intro"> - <title>Two Main Types of Kernel Locks: Spinlocks and Mutexes</title> - - <para> - There are two main types of kernel locks. The fundamental type - is the spinlock - (<filename class="headerfile">include/asm/spinlock.h</filename>), - which is a very simple single-holder lock: if you can't get the - spinlock, you keep trying (spinning) until you can. Spinlocks are - very small and fast, and can be used anywhere. - </para> - <para> - The second type is a mutex - (<filename class="headerfile">include/linux/mutex.h</filename>): it - is like a spinlock, but you may block holding a mutex. - If you can't lock a mutex, your task will suspend itself, and be woken - up when the mutex is released. This means the CPU can do something - else while you are waiting. There are many cases when you simply - can't sleep (see <xref linkend="sleeping-things"/>), and so have to - use a spinlock instead. - </para> - <para> - Neither type of lock is recursive: see - <xref linkend="deadlock"/>. - </para> - </sect1> - - <sect1 id="uniprocessor"> - <title>Locks and Uniprocessor Kernels</title> - - <para> - For kernels compiled without <symbol>CONFIG_SMP</symbol>, and - without <symbol>CONFIG_PREEMPT</symbol> spinlocks do not exist at - all. This is an excellent design decision: when no-one else can - run at the same time, there is no reason to have a lock. - </para> - - <para> - If the kernel is compiled without <symbol>CONFIG_SMP</symbol>, - but <symbol>CONFIG_PREEMPT</symbol> is set, then spinlocks - simply disable preemption, which is sufficient to prevent any - races. For most purposes, we can think of preemption as - equivalent to SMP, and not worry about it separately. - </para> - - <para> - You should always test your locking code with <symbol>CONFIG_SMP</symbol> - and <symbol>CONFIG_PREEMPT</symbol> enabled, even if you don't have an SMP test box, because it - will still catch some kinds of locking bugs. - </para> - - <para> - Mutexes still exist, because they are required for - synchronization between <firstterm linkend="gloss-usercontext">user - contexts</firstterm>, as we will see below. - </para> - </sect1> - - <sect1 id="usercontextlocking"> - <title>Locking Only In User Context</title> - - <para> - If you have a data structure which is only ever accessed from - user context, then you can use a simple mutex - (<filename>include/linux/mutex.h</filename>) to protect it. This - is the most trivial case: you initialize the mutex. Then you can - call <function>mutex_lock_interruptible()</function> to grab the mutex, - and <function>mutex_unlock()</function> to release it. There is also a - <function>mutex_lock()</function>, which should be avoided, because it - will not return if a signal is received. - </para> - - <para> - Example: <filename>net/netfilter/nf_sockopt.c</filename> allows - registration of new <function>setsockopt()</function> and - <function>getsockopt()</function> calls, with - <function>nf_register_sockopt()</function>. Registration and - de-registration are only done on module load and unload (and boot - time, where there is no concurrency), and the list of registrations - is only consulted for an unknown <function>setsockopt()</function> - or <function>getsockopt()</function> system call. The - <varname>nf_sockopt_mutex</varname> is perfect to protect this, - especially since the setsockopt and getsockopt calls may well - sleep. - </para> - </sect1> - - <sect1 id="lock-user-bh"> - <title>Locking Between User Context and Softirqs</title> - - <para> - If a <firstterm linkend="gloss-softirq">softirq</firstterm> shares - data with user context, you have two problems. Firstly, the current - user context can be interrupted by a softirq, and secondly, the - critical region could be entered from another CPU. This is where - <function>spin_lock_bh()</function> - (<filename class="headerfile">include/linux/spinlock.h</filename>) is - used. It disables softirqs on that CPU, then grabs the lock. - <function>spin_unlock_bh()</function> does the reverse. (The - '_bh' suffix is a historical reference to "Bottom Halves", the - old name for software interrupts. It should really be - called spin_lock_softirq()' in a perfect world). - </para> - - <para> - Note that you can also use <function>spin_lock_irq()</function> - or <function>spin_lock_irqsave()</function> here, which stop - hardware interrupts as well: see <xref linkend="hardirq-context"/>. - </para> - - <para> - This works perfectly for <firstterm linkend="gloss-up"><acronym>UP - </acronym></firstterm> as well: the spin lock vanishes, and this macro - simply becomes <function>local_bh_disable()</function> - (<filename class="headerfile">include/linux/interrupt.h</filename>), which - protects you from the softirq being run. - </para> - </sect1> - - <sect1 id="lock-user-tasklet"> - <title>Locking Between User Context and Tasklets</title> - - <para> - This is exactly the same as above, because <firstterm - linkend="gloss-tasklet">tasklets</firstterm> are actually run - from a softirq. - </para> - </sect1> - - <sect1 id="lock-user-timers"> - <title>Locking Between User Context and Timers</title> - - <para> - This, too, is exactly the same as above, because <firstterm - linkend="gloss-timers">timers</firstterm> are actually run from - a softirq. From a locking point of view, tasklets and timers - are identical. - </para> - </sect1> - - <sect1 id="lock-tasklets"> - <title>Locking Between Tasklets/Timers</title> - - <para> - Sometimes a tasklet or timer might want to share data with - another tasklet or timer. - </para> - - <sect2 id="lock-tasklets-same"> - <title>The Same Tasklet/Timer</title> - <para> - Since a tasklet is never run on two CPUs at once, you don't - need to worry about your tasklet being reentrant (running - twice at once), even on SMP. - </para> - </sect2> - - <sect2 id="lock-tasklets-different"> - <title>Different Tasklets/Timers</title> - <para> - If another tasklet/timer wants - to share data with your tasklet or timer , you will both need to use - <function>spin_lock()</function> and - <function>spin_unlock()</function> calls. - <function>spin_lock_bh()</function> is - unnecessary here, as you are already in a tasklet, and - none will be run on the same CPU. - </para> - </sect2> - </sect1> - - <sect1 id="lock-softirqs"> - <title>Locking Between Softirqs</title> - - <para> - Often a softirq might - want to share data with itself or a tasklet/timer. - </para> - - <sect2 id="lock-softirqs-same"> - <title>The Same Softirq</title> - - <para> - The same softirq can run on the other CPUs: you can use a - per-CPU array (see <xref linkend="per-cpu"/>) for better - performance. If you're going so far as to use a softirq, - you probably care about scalable performance enough - to justify the extra complexity. - </para> - - <para> - You'll need to use <function>spin_lock()</function> and - <function>spin_unlock()</function> for shared data. - </para> - </sect2> - - <sect2 id="lock-softirqs-different"> - <title>Different Softirqs</title> - - <para> - You'll need to use <function>spin_lock()</function> and - <function>spin_unlock()</function> for shared data, whether it - be a timer, tasklet, different softirq or the same or another - softirq: any of them could be running on a different CPU. - </para> - </sect2> - </sect1> - </chapter> - - <chapter id="hardirq-context"> - <title>Hard IRQ Context</title> - - <para> - Hardware interrupts usually communicate with a - tasklet or softirq. Frequently this involves putting work in a - queue, which the softirq will take out. - </para> - - <sect1 id="hardirq-softirq"> - <title>Locking Between Hard IRQ and Softirqs/Tasklets</title> - - <para> - If a hardware irq handler shares data with a softirq, you have - two concerns. Firstly, the softirq processing can be - interrupted by a hardware interrupt, and secondly, the - critical region could be entered by a hardware interrupt on - another CPU. This is where <function>spin_lock_irq()</function> is - used. It is defined to disable interrupts on that cpu, then grab - the lock. <function>spin_unlock_irq()</function> does the reverse. - </para> - - <para> - The irq handler does not to use - <function>spin_lock_irq()</function>, because the softirq cannot - run while the irq handler is running: it can use - <function>spin_lock()</function>, which is slightly faster. The - only exception would be if a different hardware irq handler uses - the same lock: <function>spin_lock_irq()</function> will stop - that from interrupting us. - </para> - - <para> - This works perfectly for UP as well: the spin lock vanishes, - and this macro simply becomes <function>local_irq_disable()</function> - (<filename class="headerfile">include/asm/smp.h</filename>), which - protects you from the softirq/tasklet/BH being run. - </para> - - <para> - <function>spin_lock_irqsave()</function> - (<filename>include/linux/spinlock.h</filename>) is a variant - which saves whether interrupts were on or off in a flags word, - which is passed to <function>spin_unlock_irqrestore()</function>. This - means that the same code can be used inside an hard irq handler (where - interrupts are already off) and in softirqs (where the irq - disabling is required). - </para> - - <para> - Note that softirqs (and hence tasklets and timers) are run on - return from hardware interrupts, so - <function>spin_lock_irq()</function> also stops these. In that - sense, <function>spin_lock_irqsave()</function> is the most - general and powerful locking function. - </para> - - </sect1> - <sect1 id="hardirq-hardirq"> - <title>Locking Between Two Hard IRQ Handlers</title> - <para> - It is rare to have to share data between two IRQ handlers, but - if you do, <function>spin_lock_irqsave()</function> should be - used: it is architecture-specific whether all interrupts are - disabled inside irq handlers themselves. - </para> - </sect1> - - </chapter> - - <chapter id="cheatsheet"> - <title>Cheat Sheet For Locking</title> - <para> - Pete Zaitcev gives the following summary: - </para> - <itemizedlist> - <listitem> - <para> - If you are in a process context (any syscall) and want to - lock other process out, use a mutex. You can take a mutex - and sleep (<function>copy_from_user*(</function> or - <function>kmalloc(x,GFP_KERNEL)</function>). - </para> - </listitem> - <listitem> - <para> - Otherwise (== data can be touched in an interrupt), use - <function>spin_lock_irqsave()</function> and - <function>spin_unlock_irqrestore()</function>. - </para> - </listitem> - <listitem> - <para> - Avoid holding spinlock for more than 5 lines of code and - across any function call (except accessors like - <function>readb</function>). - </para> - </listitem> - </itemizedlist> - - <sect1 id="minimum-lock-reqirements"> - <title>Table of Minimum Requirements</title> - - <para> The following table lists the <emphasis>minimum</emphasis> - locking requirements between various contexts. In some cases, - the same context can only be running on one CPU at a time, so - no locking is required for that context (eg. a particular - thread can only run on one CPU at a time, but if it needs - shares data with another thread, locking is required). - </para> - <para> - Remember the advice above: you can always use - <function>spin_lock_irqsave()</function>, which is a superset - of all other spinlock primitives. - </para> - - <table> -<title>Table of Locking Requirements</title> -<tgroup cols="11"> -<tbody> - -<row> -<entry></entry> -<entry>IRQ Handler A</entry> -<entry>IRQ Handler B</entry> -<entry>Softirq A</entry> -<entry>Softirq B</entry> -<entry>Tasklet A</entry> -<entry>Tasklet B</entry> -<entry>Timer A</entry> -<entry>Timer B</entry> -<entry>User Context A</entry> -<entry>User Context B</entry> -</row> - -<row> -<entry>IRQ Handler A</entry> -<entry>None</entry> -</row> - -<row> -<entry>IRQ Handler B</entry> -<entry>SLIS</entry> -<entry>None</entry> -</row> - -<row> -<entry>Softirq A</entry> -<entry>SLI</entry> -<entry>SLI</entry> -<entry>SL</entry> -</row> - -<row> -<entry>Softirq B</entry> -<entry>SLI</entry> -<entry>SLI</entry> -<entry>SL</entry> -<entry>SL</entry> -</row> - -<row> -<entry>Tasklet A</entry> -<entry>SLI</entry> -<entry>SLI</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>None</entry> -</row> - -<row> -<entry>Tasklet B</entry> -<entry>SLI</entry> -<entry>SLI</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>None</entry> -</row> - -<row> -<entry>Timer A</entry> -<entry>SLI</entry> -<entry>SLI</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>None</entry> -</row> - -<row> -<entry>Timer B</entry> -<entry>SLI</entry> -<entry>SLI</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>SL</entry> -<entry>None</entry> -</row> - -<row> -<entry>User Context A</entry> -<entry>SLI</entry> -<entry>SLI</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>None</entry> -</row> - -<row> -<entry>User Context B</entry> -<entry>SLI</entry> -<entry>SLI</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>SLBH</entry> -<entry>MLI</entry> -<entry>None</entry> -</row> - -</tbody> -</tgroup> -</table> - - <table> -<title>Legend for Locking Requirements Table</title> -<tgroup cols="2"> -<tbody> - -<row> -<entry>SLIS</entry> -<entry>spin_lock_irqsave</entry> -</row> -<row> -<entry>SLI</entry> -<entry>spin_lock_irq</entry> -</row> -<row> -<entry>SL</entry> -<entry>spin_lock</entry> -</row> -<row> -<entry>SLBH</entry> -<entry>spin_lock_bh</entry> -</row> -<row> -<entry>MLI</entry> -<entry>mutex_lock_interruptible</entry> -</row> - -</tbody> -</tgroup> -</table> - -</sect1> -</chapter> - -<chapter id="trylock-functions"> - <title>The trylock Functions</title> - <para> - There are functions that try to acquire a lock only once and immediately - return a value telling about success or failure to acquire the lock. - They can be used if you need no access to the data protected with the lock - when some other thread is holding the lock. You should acquire the lock - later if you then need access to the data protected with the lock. - </para> - - <para> - <function>spin_trylock()</function> does not spin but returns non-zero if - it acquires the spinlock on the first try or 0 if not. This function can - be used in all contexts like <function>spin_lock</function>: you must have - disabled the contexts that might interrupt you and acquire the spin lock. - </para> - - <para> - <function>mutex_trylock()</function> does not suspend your task - but returns non-zero if it could lock the mutex on the first try - or 0 if not. This function cannot be safely used in hardware or software - interrupt contexts despite not sleeping. - </para> -</chapter> - - <chapter id="Examples"> - <title>Common Examples</title> - <para> -Let's step through a simple example: a cache of number to name -mappings. The cache keeps a count of how often each of the objects is -used, and when it gets full, throws out the least used one. - - </para> - - <sect1 id="examples-usercontext"> - <title>All In User Context</title> - <para> -For our first example, we assume that all operations are in user -context (ie. from system calls), so we can sleep. This means we can -use a mutex to protect the cache and all the objects within -it. Here's the code: - </para> - - <programlisting> -#include <linux/list.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/mutex.h> -#include <asm/errno.h> - -struct object -{ - struct list_head list; - int id; - char name[32]; - int popularity; -}; - -/* Protects the cache, cache_num, and the objects within it */ -static DEFINE_MUTEX(cache_lock); -static LIST_HEAD(cache); -static unsigned int cache_num = 0; -#define MAX_CACHE_SIZE 10 - -/* Must be holding cache_lock */ -static struct object *__cache_find(int id) -{ - struct object *i; - - list_for_each_entry(i, &cache, list) - if (i->id == id) { - i->popularity++; - return i; - } - return NULL; -} - -/* Must be holding cache_lock */ -static void __cache_delete(struct object *obj) -{ - BUG_ON(!obj); - list_del(&obj->list); - kfree(obj); - cache_num--; -} - -/* Must be holding cache_lock */ -static void __cache_add(struct object *obj) -{ - list_add(&obj->list, &cache); - if (++cache_num > MAX_CACHE_SIZE) { - struct object *i, *outcast = NULL; - list_for_each_entry(i, &cache, list) { - if (!outcast || i->popularity < outcast->popularity) - outcast = i; - } - __cache_delete(outcast); - } -} - -int cache_add(int id, const char *name) -{ - struct object *obj; - - if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL) - return -ENOMEM; - - strlcpy(obj->name, name, sizeof(obj->name)); - obj->id = id; - obj->popularity = 0; - - mutex_lock(&cache_lock); - __cache_add(obj); - mutex_unlock(&cache_lock); - return 0; -} - -void cache_delete(int id) -{ - mutex_lock(&cache_lock); - __cache_delete(__cache_find(id)); - mutex_unlock(&cache_lock); -} - -int cache_find(int id, char *name) -{ - struct object *obj; - int ret = -ENOENT; - - mutex_lock(&cache_lock); - obj = __cache_find(id); - if (obj) { - ret = 0; - strcpy(name, obj->name); - } - mutex_unlock(&cache_lock); - return ret; -} -</programlisting> - - <para> -Note that we always make sure we have the cache_lock when we add, -delete, or look up the cache: both the cache infrastructure itself and -the contents of the objects are protected by the lock. In this case -it's easy, since we copy the data for the user, and never let them -access the objects directly. - </para> - <para> -There is a slight (and common) optimization here: in -<function>cache_add</function> we set up the fields of the object -before grabbing the lock. This is safe, as no-one else can access it -until we put it in cache. - </para> - </sect1> - - <sect1 id="examples-interrupt"> - <title>Accessing From Interrupt Context</title> - <para> -Now consider the case where <function>cache_find</function> can be -called from interrupt context: either a hardware interrupt or a -softirq. An example would be a timer which deletes object from the -cache. - </para> - <para> -The change is shown below, in standard patch format: the -<symbol>-</symbol> are lines which are taken away, and the -<symbol>+</symbol> are lines which are added. - </para> -<programlisting> ---- cache.c.usercontext 2003-12-09 13:58:54.000000000 +1100 -+++ cache.c.interrupt 2003-12-09 14:07:49.000000000 +1100 -@@ -12,7 +12,7 @@ - int popularity; - }; - --static DEFINE_MUTEX(cache_lock); -+static DEFINE_SPINLOCK(cache_lock); - static LIST_HEAD(cache); - static unsigned int cache_num = 0; - #define MAX_CACHE_SIZE 10 -@@ -55,6 +55,7 @@ - int cache_add(int id, const char *name) - { - struct object *obj; -+ unsigned long flags; - - if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL) - return -ENOMEM; -@@ -63,30 +64,33 @@ - obj->id = id; - obj->popularity = 0; - -- mutex_lock(&cache_lock); -+ spin_lock_irqsave(&cache_lock, flags); - __cache_add(obj); -- mutex_unlock(&cache_lock); -+ spin_unlock_irqrestore(&cache_lock, flags); - return 0; - } - - void cache_delete(int id) - { -- mutex_lock(&cache_lock); -+ unsigned long flags; -+ -+ spin_lock_irqsave(&cache_lock, flags); - __cache_delete(__cache_find(id)); -- mutex_unlock(&cache_lock); -+ spin_unlock_irqrestore(&cache_lock, flags); - } - - int cache_find(int id, char *name) - { - struct object *obj; - int ret = -ENOENT; -+ unsigned long flags; - -- mutex_lock(&cache_lock); -+ spin_lock_irqsave(&cache_lock, flags); - obj = __cache_find(id); - if (obj) { - ret = 0; - strcpy(name, obj->name); - } -- mutex_unlock(&cache_lock); -+ spin_unlock_irqrestore(&cache_lock, flags); - return ret; - } -</programlisting> - - <para> -Note that the <function>spin_lock_irqsave</function> will turn off -interrupts if they are on, otherwise does nothing (if we are already -in an interrupt handler), hence these functions are safe to call from -any context. - </para> - <para> -Unfortunately, <function>cache_add</function> calls -<function>kmalloc</function> with the <symbol>GFP_KERNEL</symbol> -flag, which is only legal in user context. I have assumed that -<function>cache_add</function> is still only called in user context, -otherwise this should become a parameter to -<function>cache_add</function>. - </para> - </sect1> - <sect1 id="examples-refcnt"> - <title>Exposing Objects Outside This File</title> - <para> -If our objects contained more information, it might not be sufficient -to copy the information in and out: other parts of the code might want -to keep pointers to these objects, for example, rather than looking up -the id every time. This produces two problems. - </para> - <para> -The first problem is that we use the <symbol>cache_lock</symbol> to -protect objects: we'd need to make this non-static so the rest of the -code can use it. This makes locking trickier, as it is no longer all -in one place. - </para> - <para> -The second problem is the lifetime problem: if another structure keeps -a pointer to an object, it presumably expects that pointer to remain -valid. Unfortunately, this is only guaranteed while you hold the -lock, otherwise someone might call <function>cache_delete</function> -and even worse, add another object, re-using the same address. - </para> - <para> -As there is only one lock, you can't hold it forever: no-one else would -get any work done. - </para> - <para> -The solution to this problem is to use a reference count: everyone who -has a pointer to the object increases it when they first get the -object, and drops the reference count when they're finished with it. -Whoever drops it to zero knows it is unused, and can actually delete it. - </para> - <para> -Here is the code: - </para> - -<programlisting> ---- cache.c.interrupt 2003-12-09 14:25:43.000000000 +1100 -+++ cache.c.refcnt 2003-12-09 14:33:05.000000000 +1100 -@@ -7,6 +7,7 @@ - struct object - { - struct list_head list; -+ unsigned int refcnt; - int id; - char name[32]; - int popularity; -@@ -17,6 +18,35 @@ - static unsigned int cache_num = 0; - #define MAX_CACHE_SIZE 10 - -+static void __object_put(struct object *obj) -+{ -+ if (--obj->refcnt == 0) -+ kfree(obj); -+} -+ -+static void __object_get(struct object *obj) -+{ -+ obj->refcnt++; -+} -+ -+void object_put(struct object *obj) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&cache_lock, flags); -+ __object_put(obj); -+ spin_unlock_irqrestore(&cache_lock, flags); -+} -+ -+void object_get(struct object *obj) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&cache_lock, flags); -+ __object_get(obj); -+ spin_unlock_irqrestore(&cache_lock, flags); -+} -+ - /* Must be holding cache_lock */ - static struct object *__cache_find(int id) - { -@@ -35,6 +65,7 @@ - { - BUG_ON(!obj); - list_del(&obj->list); -+ __object_put(obj); - cache_num--; - } - -@@ -63,6 +94,7 @@ - strlcpy(obj->name, name, sizeof(obj->name)); - obj->id = id; - obj->popularity = 0; -+ obj->refcnt = 1; /* The cache holds a reference */ - - spin_lock_irqsave(&cache_lock, flags); - __cache_add(obj); -@@ -79,18 +111,15 @@ - spin_unlock_irqrestore(&cache_lock, flags); - } - --int cache_find(int id, char *name) -+struct object *cache_find(int id) - { - struct object *obj; -- int ret = -ENOENT; - unsigned long flags; - - spin_lock_irqsave(&cache_lock, flags); - obj = __cache_find(id); -- if (obj) { -- ret = 0; -- strcpy(name, obj->name); -- } -+ if (obj) -+ __object_get(obj); - spin_unlock_irqrestore(&cache_lock, flags); -- return ret; -+ return obj; - } -</programlisting> - -<para> -We encapsulate the reference counting in the standard 'get' and 'put' -functions. Now we can return the object itself from -<function>cache_find</function> which has the advantage that the user -can now sleep holding the object (eg. to -<function>copy_to_user</function> to name to userspace). -</para> -<para> -The other point to note is that I said a reference should be held for -every pointer to the object: thus the reference count is 1 when first -inserted into the cache. In some versions the framework does not hold -a reference count, but they are more complicated. -</para> - - <sect2 id="examples-refcnt-atomic"> - <title>Using Atomic Operations For The Reference Count</title> -<para> -In practice, <type>atomic_t</type> would usually be used for -<structfield>refcnt</structfield>. There are a number of atomic -operations defined in - -<filename class="headerfile">include/asm/atomic.h</filename>: these are -guaranteed to be seen atomically from all CPUs in the system, so no -lock is required. In this case, it is simpler than using spinlocks, -although for anything non-trivial using spinlocks is clearer. The -<function>atomic_inc</function> and -<function>atomic_dec_and_test</function> are used instead of the -standard increment and decrement operators, and the lock is no longer -used to protect the reference count itself. -</para> - -<programlisting> ---- cache.c.refcnt 2003-12-09 15:00:35.000000000 +1100 -+++ cache.c.refcnt-atomic 2003-12-11 15:49:42.000000000 +1100 -@@ -7,7 +7,7 @@ - struct object - { - struct list_head list; -- unsigned int refcnt; -+ atomic_t refcnt; - int id; - char name[32]; - int popularity; -@@ -18,33 +18,15 @@ - static unsigned int cache_num = 0; - #define MAX_CACHE_SIZE 10 - --static void __object_put(struct object *obj) --{ -- if (--obj->refcnt == 0) -- kfree(obj); --} -- --static void __object_get(struct object *obj) --{ -- obj->refcnt++; --} -- - void object_put(struct object *obj) - { -- unsigned long flags; -- -- spin_lock_irqsave(&cache_lock, flags); -- __object_put(obj); -- spin_unlock_irqrestore(&cache_lock, flags); -+ if (atomic_dec_and_test(&obj->refcnt)) -+ kfree(obj); - } - - void object_get(struct object *obj) - { -- unsigned long flags; -- -- spin_lock_irqsave(&cache_lock, flags); -- __object_get(obj); -- spin_unlock_irqrestore(&cache_lock, flags); -+ atomic_inc(&obj->refcnt); - } - - /* Must be holding cache_lock */ -@@ -65,7 +47,7 @@ - { - BUG_ON(!obj); - list_del(&obj->list); -- __object_put(obj); -+ object_put(obj); - cache_num--; - } - -@@ -94,7 +76,7 @@ - strlcpy(obj->name, name, sizeof(obj->name)); - obj->id = id; - obj->popularity = 0; -- obj->refcnt = 1; /* The cache holds a reference */ -+ atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ - - spin_lock_irqsave(&cache_lock, flags); - __cache_add(obj); -@@ -119,7 +101,7 @@ - spin_lock_irqsave(&cache_lock, flags); - obj = __cache_find(id); - if (obj) -- __object_get(obj); -+ object_get(obj); - spin_unlock_irqrestore(&cache_lock, flags); - return obj; - } -</programlisting> -</sect2> -</sect1> - - <sect1 id="examples-lock-per-obj"> - <title>Protecting The Objects Themselves</title> - <para> -In these examples, we assumed that the objects (except the reference -counts) never changed once they are created. If we wanted to allow -the name to change, there are three possibilities: - </para> - <itemizedlist> - <listitem> - <para> -You can make <symbol>cache_lock</symbol> non-static, and tell people -to grab that lock before changing the name in any object. - </para> - </listitem> - <listitem> - <para> -You can provide a <function>cache_obj_rename</function> which grabs -this lock and changes the name for the caller, and tell everyone to -use that function. - </para> - </listitem> - <listitem> - <para> -You can make the <symbol>cache_lock</symbol> protect only the cache -itself, and use another lock to protect the name. - </para> - </listitem> - </itemizedlist> - - <para> -Theoretically, you can make the locks as fine-grained as one lock for -every field, for every object. In practice, the most common variants -are: -</para> - <itemizedlist> - <listitem> - <para> -One lock which protects the infrastructure (the <symbol>cache</symbol> -list in this example) and all the objects. This is what we have done -so far. - </para> - </listitem> - <listitem> - <para> -One lock which protects the infrastructure (including the list -pointers inside the objects), and one lock inside the object which -protects the rest of that object. - </para> - </listitem> - <listitem> - <para> -Multiple locks to protect the infrastructure (eg. one lock per hash -chain), possibly with a separate per-object lock. - </para> - </listitem> - </itemizedlist> - -<para> -Here is the "lock-per-object" implementation: -</para> -<programlisting> ---- cache.c.refcnt-atomic 2003-12-11 15:50:54.000000000 +1100 -+++ cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100 -@@ -6,11 +6,17 @@ - - struct object - { -+ /* These two protected by cache_lock. */ - struct list_head list; -+ int popularity; -+ - atomic_t refcnt; -+ -+ /* Doesn't change once created. */ - int id; -+ -+ spinlock_t lock; /* Protects the name */ - char name[32]; -- int popularity; - }; - - static DEFINE_SPINLOCK(cache_lock); -@@ -77,6 +84,7 @@ - obj->id = id; - obj->popularity = 0; - atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ -+ spin_lock_init(&obj->lock); - - spin_lock_irqsave(&cache_lock, flags); - __cache_add(obj); -</programlisting> - -<para> -Note that I decide that the <structfield>popularity</structfield> -count should be protected by the <symbol>cache_lock</symbol> rather -than the per-object lock: this is because it (like the -<structname>struct list_head</structname> inside the object) is -logically part of the infrastructure. This way, I don't need to grab -the lock of every object in <function>__cache_add</function> when -seeking the least popular. -</para> - -<para> -I also decided that the <structfield>id</structfield> member is -unchangeable, so I don't need to grab each object lock in -<function>__cache_find()</function> to examine the -<structfield>id</structfield>: the object lock is only used by a -caller who wants to read or write the <structfield>name</structfield> -field. -</para> - -<para> -Note also that I added a comment describing what data was protected by -which locks. This is extremely important, as it describes the runtime -behavior of the code, and can be hard to gain from just reading. And -as Alan Cox says, <quote>Lock data, not code</quote>. -</para> -</sect1> -</chapter> - - <chapter id="common-problems"> - <title>Common Problems</title> - <sect1 id="deadlock"> - <title>Deadlock: Simple and Advanced</title> - - <para> - There is a coding bug where a piece of code tries to grab a - spinlock twice: it will spin forever, waiting for the lock to - be released (spinlocks, rwlocks and mutexes are not - recursive in Linux). This is trivial to diagnose: not a - stay-up-five-nights-talk-to-fluffy-code-bunnies kind of - problem. - </para> - - <para> - For a slightly more complex case, imagine you have a region - shared by a softirq and user context. If you use a - <function>spin_lock()</function> call to protect it, it is - possible that the user context will be interrupted by the softirq - while it holds the lock, and the softirq will then spin - forever trying to get the same lock. - </para> - - <para> - Both of these are called deadlock, and as shown above, it can - occur even with a single CPU (although not on UP compiles, - since spinlocks vanish on kernel compiles with - <symbol>CONFIG_SMP</symbol>=n. You'll still get data corruption - in the second example). - </para> - - <para> - This complete lockup is easy to diagnose: on SMP boxes the - watchdog timer or compiling with <symbol>DEBUG_SPINLOCK</symbol> set - (<filename>include/linux/spinlock.h</filename>) will show this up - immediately when it happens. - </para> - - <para> - A more complex problem is the so-called 'deadly embrace', - involving two or more locks. Say you have a hash table: each - entry in the table is a spinlock, and a chain of hashed - objects. Inside a softirq handler, you sometimes want to - alter an object from one place in the hash to another: you - grab the spinlock of the old hash chain and the spinlock of - the new hash chain, and delete the object from the old one, - and insert it in the new one. - </para> - - <para> - There are two problems here. First, if your code ever - tries to move the object to the same chain, it will deadlock - with itself as it tries to lock it twice. Secondly, if the - same softirq on another CPU is trying to move another object - in the reverse direction, the following could happen: - </para> - - <table> - <title>Consequences</title> - - <tgroup cols="2" align="left"> - - <thead> - <row> - <entry>CPU 1</entry> - <entry>CPU 2</entry> - </row> - </thead> - - <tbody> - <row> - <entry>Grab lock A -> OK</entry> - <entry>Grab lock B -> OK</entry> - </row> - <row> - <entry>Grab lock B -> spin</entry> - <entry>Grab lock A -> spin</entry> - </row> - </tbody> - </tgroup> - </table> - - <para> - The two CPUs will spin forever, waiting for the other to give up - their lock. It will look, smell, and feel like a crash. - </para> - </sect1> - - <sect1 id="techs-deadlock-prevent"> - <title>Preventing Deadlock</title> - - <para> - Textbooks will tell you that if you always lock in the same - order, you will never get this kind of deadlock. Practice - will tell you that this approach doesn't scale: when I - create a new lock, I don't understand enough of the kernel - to figure out where in the 5000 lock hierarchy it will fit. - </para> - - <para> - The best locks are encapsulated: they never get exposed in - headers, and are never held around calls to non-trivial - functions outside the same file. You can read through this - code and see that it will never deadlock, because it never - tries to grab another lock while it has that one. People - using your code don't even need to know you are using a - lock. - </para> - - <para> - A classic problem here is when you provide callbacks or - hooks: if you call these with the lock held, you risk simple - deadlock, or a deadly embrace (who knows what the callback - will do?). Remember, the other programmers are out to get - you, so don't do this. - </para> - - <sect2 id="techs-deadlock-overprevent"> - <title>Overzealous Prevention Of Deadlocks</title> - - <para> - Deadlocks are problematic, but not as bad as data - corruption. Code which grabs a read lock, searches a list, - fails to find what it wants, drops the read lock, grabs a - write lock and inserts the object has a race condition. - </para> - - <para> - If you don't see why, please stay the fuck away from my code. - </para> - </sect2> - </sect1> - - <sect1 id="racing-timers"> - <title>Racing Timers: A Kernel Pastime</title> - - <para> - Timers can produce their own special problems with races. - Consider a collection of objects (list, hash, etc) where each - object has a timer which is due to destroy it. - </para> - - <para> - If you want to destroy the entire collection (say on module - removal), you might do the following: - </para> - - <programlisting> - /* THIS CODE BAD BAD BAD BAD: IF IT WAS ANY WORSE IT WOULD USE - HUNGARIAN NOTATION */ - spin_lock_bh(&list_lock); - - while (list) { - struct foo *next = list->next; - del_timer(&list->timer); - kfree(list); - list = next; - } - - spin_unlock_bh(&list_lock); - </programlisting> - - <para> - Sooner or later, this will crash on SMP, because a timer can - have just gone off before the <function>spin_lock_bh()</function>, - and it will only get the lock after we - <function>spin_unlock_bh()</function>, and then try to free - the element (which has already been freed!). - </para> - - <para> - This can be avoided by checking the result of - <function>del_timer()</function>: if it returns - <returnvalue>1</returnvalue>, the timer has been deleted. - If <returnvalue>0</returnvalue>, it means (in this - case) that it is currently running, so we can do: - </para> - - <programlisting> - retry: - spin_lock_bh(&list_lock); - - while (list) { - struct foo *next = list->next; - if (!del_timer(&list->timer)) { - /* Give timer a chance to delete this */ - spin_unlock_bh(&list_lock); - goto retry; - } - kfree(list); - list = next; - } - - spin_unlock_bh(&list_lock); - </programlisting> - - <para> - Another common problem is deleting timers which restart - themselves (by calling <function>add_timer()</function> at the end - of their timer function). Because this is a fairly common case - which is prone to races, you should use <function>del_timer_sync()</function> - (<filename class="headerfile">include/linux/timer.h</filename>) - to handle this case. It returns the number of times the timer - had to be deleted before we finally stopped it from adding itself back - in. - </para> - </sect1> - - </chapter> - - <chapter id="Efficiency"> - <title>Locking Speed</title> - - <para> -There are three main things to worry about when considering speed of -some code which does locking. First is concurrency: how many things -are going to be waiting while someone else is holding a lock. Second -is the time taken to actually acquire and release an uncontended lock. -Third is using fewer, or smarter locks. I'm assuming that the lock is -used fairly often: otherwise, you wouldn't be concerned about -efficiency. -</para> - <para> -Concurrency depends on how long the lock is usually held: you should -hold the lock for as long as needed, but no longer. In the cache -example, we always create the object without the lock held, and then -grab the lock only when we are ready to insert it in the list. -</para> - <para> -Acquisition times depend on how much damage the lock operations do to -the pipeline (pipeline stalls) and how likely it is that this CPU was -the last one to grab the lock (ie. is the lock cache-hot for this -CPU): on a machine with more CPUs, this likelihood drops fast. -Consider a 700MHz Intel Pentium III: an instruction takes about 0.7ns, -an atomic increment takes about 58ns, a lock which is cache-hot on -this CPU takes 160ns, and a cacheline transfer from another CPU takes -an additional 170 to 360ns. (These figures from Paul McKenney's -<ulink url="http://www.linuxjournal.com/article.php?sid=6993"> Linux -Journal RCU article</ulink>). -</para> - <para> -These two aims conflict: holding a lock for a short time might be done -by splitting locks into parts (such as in our final per-object-lock -example), but this increases the number of lock acquisitions, and the -results are often slower than having a single lock. This is another -reason to advocate locking simplicity. -</para> - <para> -The third concern is addressed below: there are some methods to reduce -the amount of locking which needs to be done. -</para> - - <sect1 id="efficiency-rwlocks"> - <title>Read/Write Lock Variants</title> - - <para> - Both spinlocks and mutexes have read/write variants: - <type>rwlock_t</type> and <structname>struct rw_semaphore</structname>. - These divide users into two classes: the readers and the writers. If - you are only reading the data, you can get a read lock, but to write to - the data you need the write lock. Many people can hold a read lock, - but a writer must be sole holder. - </para> - - <para> - If your code divides neatly along reader/writer lines (as our - cache code does), and the lock is held by readers for - significant lengths of time, using these locks can help. They - are slightly slower than the normal locks though, so in practice - <type>rwlock_t</type> is not usually worthwhile. - </para> - </sect1> - - <sect1 id="efficiency-read-copy-update"> - <title>Avoiding Locks: Read Copy Update</title> - - <para> - There is a special method of read/write locking called Read Copy - Update. Using RCU, the readers can avoid taking a lock - altogether: as we expect our cache to be read more often than - updated (otherwise the cache is a waste of time), it is a - candidate for this optimization. - </para> - - <para> - How do we get rid of read locks? Getting rid of read locks - means that writers may be changing the list underneath the - readers. That is actually quite simple: we can read a linked - list while an element is being added if the writer adds the - element very carefully. For example, adding - <symbol>new</symbol> to a single linked list called - <symbol>list</symbol>: - </para> - - <programlisting> - new->next = list->next; - wmb(); - list->next = new; - </programlisting> - - <para> - The <function>wmb()</function> is a write memory barrier. It - ensures that the first operation (setting the new element's - <symbol>next</symbol> pointer) is complete and will be seen by - all CPUs, before the second operation is (putting the new - element into the list). This is important, since modern - compilers and modern CPUs can both reorder instructions unless - told otherwise: we want a reader to either not see the new - element at all, or see the new element with the - <symbol>next</symbol> pointer correctly pointing at the rest of - the list. - </para> - <para> - Fortunately, there is a function to do this for standard - <structname>struct list_head</structname> lists: - <function>list_add_rcu()</function> - (<filename>include/linux/list.h</filename>). - </para> - <para> - Removing an element from the list is even simpler: we replace - the pointer to the old element with a pointer to its successor, - and readers will either see it, or skip over it. - </para> - <programlisting> - list->next = old->next; - </programlisting> - <para> - There is <function>list_del_rcu()</function> - (<filename>include/linux/list.h</filename>) which does this (the - normal version poisons the old object, which we don't want). - </para> - <para> - The reader must also be careful: some CPUs can look through the - <symbol>next</symbol> pointer to start reading the contents of - the next element early, but don't realize that the pre-fetched - contents is wrong when the <symbol>next</symbol> pointer changes - underneath them. Once again, there is a - <function>list_for_each_entry_rcu()</function> - (<filename>include/linux/list.h</filename>) to help you. Of - course, writers can just use - <function>list_for_each_entry()</function>, since there cannot - be two simultaneous writers. - </para> - <para> - Our final dilemma is this: when can we actually destroy the - removed element? Remember, a reader might be stepping through - this element in the list right now: if we free this element and - the <symbol>next</symbol> pointer changes, the reader will jump - off into garbage and crash. We need to wait until we know that - all the readers who were traversing the list when we deleted the - element are finished. We use <function>call_rcu()</function> to - register a callback which will actually destroy the object once - all pre-existing readers are finished. Alternatively, - <function>synchronize_rcu()</function> may be used to block until - all pre-existing are finished. - </para> - <para> - But how does Read Copy Update know when the readers are - finished? The method is this: firstly, the readers always - traverse the list inside - <function>rcu_read_lock()</function>/<function>rcu_read_unlock()</function> - pairs: these simply disable preemption so the reader won't go to - sleep while reading the list. - </para> - <para> - RCU then waits until every other CPU has slept at least once: - since readers cannot sleep, we know that any readers which were - traversing the list during the deletion are finished, and the - callback is triggered. The real Read Copy Update code is a - little more optimized than this, but this is the fundamental - idea. - </para> - -<programlisting> ---- cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100 -+++ cache.c.rcupdate 2003-12-11 17:55:14.000000000 +1100 -@@ -1,15 +1,18 @@ - #include <linux/list.h> - #include <linux/slab.h> - #include <linux/string.h> -+#include <linux/rcupdate.h> - #include <linux/mutex.h> - #include <asm/errno.h> - - struct object - { -- /* These two protected by cache_lock. */ -+ /* This is protected by RCU */ - struct list_head list; - int popularity; - -+ struct rcu_head rcu; -+ - atomic_t refcnt; - - /* Doesn't change once created. */ -@@ -40,7 +43,7 @@ - { - struct object *i; - -- list_for_each_entry(i, &cache, list) { -+ list_for_each_entry_rcu(i, &cache, list) { - if (i->id == id) { - i->popularity++; - return i; -@@ -49,19 +52,25 @@ - return NULL; - } - -+/* Final discard done once we know no readers are looking. */ -+static void cache_delete_rcu(void *arg) -+{ -+ object_put(arg); -+} -+ - /* Must be holding cache_lock */ - static void __cache_delete(struct object *obj) - { - BUG_ON(!obj); -- list_del(&obj->list); -- object_put(obj); -+ list_del_rcu(&obj->list); - cache_num--; -+ call_rcu(&obj->rcu, cache_delete_rcu); - } - - /* Must be holding cache_lock */ - static void __cache_add(struct object *obj) - { -- list_add(&obj->list, &cache); -+ list_add_rcu(&obj->list, &cache); - if (++cache_num > MAX_CACHE_SIZE) { - struct object *i, *outcast = NULL; - list_for_each_entry(i, &cache, list) { -@@ -104,12 +114,11 @@ - struct object *cache_find(int id) - { - struct object *obj; -- unsigned long flags; - -- spin_lock_irqsave(&cache_lock, flags); -+ rcu_read_lock(); - obj = __cache_find(id); - if (obj) - object_get(obj); -- spin_unlock_irqrestore(&cache_lock, flags); -+ rcu_read_unlock(); - return obj; - } -</programlisting> - -<para> -Note that the reader will alter the -<structfield>popularity</structfield> member in -<function>__cache_find()</function>, and now it doesn't hold a lock. -One solution would be to make it an <type>atomic_t</type>, but for -this usage, we don't really care about races: an approximate result is -good enough, so I didn't change it. -</para> - -<para> -The result is that <function>cache_find()</function> requires no -synchronization with any other functions, so is almost as fast on SMP -as it would be on UP. -</para> - -<para> -There is a further optimization possible here: remember our original -cache code, where there were no reference counts and the caller simply -held the lock whenever using the object? This is still possible: if -you hold the lock, no one can delete the object, so you don't need to -get and put the reference count. -</para> - -<para> -Now, because the 'read lock' in RCU is simply disabling preemption, a -caller which always has preemption disabled between calling -<function>cache_find()</function> and -<function>object_put()</function> does not need to actually get and -put the reference count: we could expose -<function>__cache_find()</function> by making it non-static, and -such callers could simply call that. -</para> -<para> -The benefit here is that the reference count is not written to: the -object is not altered in any way, which is much faster on SMP -machines due to caching. -</para> - </sect1> - - <sect1 id="per-cpu"> - <title>Per-CPU Data</title> - - <para> - Another technique for avoiding locking which is used fairly - widely is to duplicate information for each CPU. For example, - if you wanted to keep a count of a common condition, you could - use a spin lock and a single counter. Nice and simple. - </para> - - <para> - If that was too slow (it's usually not, but if you've got a - really big machine to test on and can show that it is), you - could instead use a counter for each CPU, then none of them need - an exclusive lock. See <function>DEFINE_PER_CPU()</function>, - <function>get_cpu_var()</function> and - <function>put_cpu_var()</function> - (<filename class="headerfile">include/linux/percpu.h</filename>). - </para> - - <para> - Of particular use for simple per-cpu counters is the - <type>local_t</type> type, and the - <function>cpu_local_inc()</function> and related functions, - which are more efficient than simple code on some architectures - (<filename class="headerfile">include/asm/local.h</filename>). - </para> - - <para> - Note that there is no simple, reliable way of getting an exact - value of such a counter, without introducing more locks. This - is not a problem for some uses. - </para> - </sect1> - - <sect1 id="mostly-hardirq"> - <title>Data Which Mostly Used By An IRQ Handler</title> - - <para> - If data is always accessed from within the same IRQ handler, you - don't need a lock at all: the kernel already guarantees that the - irq handler will not run simultaneously on multiple CPUs. - </para> - <para> - Manfred Spraul points out that you can still do this, even if - the data is very occasionally accessed in user context or - softirqs/tasklets. The irq handler doesn't use a lock, and - all other accesses are done as so: - </para> - -<programlisting> - spin_lock(&lock); - disable_irq(irq); - ... - enable_irq(irq); - spin_unlock(&lock); -</programlisting> - <para> - The <function>disable_irq()</function> prevents the irq handler - from running (and waits for it to finish if it's currently - running on other CPUs). The spinlock prevents any other - accesses happening at the same time. Naturally, this is slower - than just a <function>spin_lock_irq()</function> call, so it - only makes sense if this type of access happens extremely - rarely. - </para> - </sect1> - </chapter> - - <chapter id="sleeping-things"> - <title>What Functions Are Safe To Call From Interrupts?</title> - - <para> - Many functions in the kernel sleep (ie. call schedule()) - directly or indirectly: you can never call them while holding a - spinlock, or with preemption disabled. This also means you need - to be in user context: calling them from an interrupt is illegal. - </para> - - <sect1 id="sleeping"> - <title>Some Functions Which Sleep</title> - - <para> - The most common ones are listed below, but you usually have to - read the code to find out if other calls are safe. If everyone - else who calls it can sleep, you probably need to be able to - sleep, too. In particular, registration and deregistration - functions usually expect to be called from user context, and can - sleep. - </para> - - <itemizedlist> - <listitem> - <para> - Accesses to - <firstterm linkend="gloss-userspace">userspace</firstterm>: - </para> - <itemizedlist> - <listitem> - <para> - <function>copy_from_user()</function> - </para> - </listitem> - <listitem> - <para> - <function>copy_to_user()</function> - </para> - </listitem> - <listitem> - <para> - <function>get_user()</function> - </para> - </listitem> - <listitem> - <para> - <function>put_user()</function> - </para> - </listitem> - </itemizedlist> - </listitem> - - <listitem> - <para> - <function>kmalloc(GFP_KERNEL)</function> - </para> - </listitem> - - <listitem> - <para> - <function>mutex_lock_interruptible()</function> and - <function>mutex_lock()</function> - </para> - <para> - There is a <function>mutex_trylock()</function> which does not - sleep. Still, it must not be used inside interrupt context since - its implementation is not safe for that. - <function>mutex_unlock()</function> will also never sleep. - It cannot be used in interrupt context either since a mutex - must be released by the same task that acquired it. - </para> - </listitem> - </itemizedlist> - </sect1> - - <sect1 id="dont-sleep"> - <title>Some Functions Which Don't Sleep</title> - - <para> - Some functions are safe to call from any context, or holding - almost any lock. - </para> - - <itemizedlist> - <listitem> - <para> - <function>printk()</function> - </para> - </listitem> - <listitem> - <para> - <function>kfree()</function> - </para> - </listitem> - <listitem> - <para> - <function>add_timer()</function> and <function>del_timer()</function> - </para> - </listitem> - </itemizedlist> - </sect1> - </chapter> - - <chapter id="apiref-mutex"> - <title>Mutex API reference</title> -!Iinclude/linux/mutex.h -!Ekernel/locking/mutex.c - </chapter> - - <chapter id="apiref-futex"> - <title>Futex API reference</title> -!Ikernel/futex.c - </chapter> - - <chapter id="references"> - <title>Further reading</title> - - <itemizedlist> - <listitem> - <para> - <filename>Documentation/locking/spinlocks.txt</filename>: - Linus Torvalds' spinlocking tutorial in the kernel sources. - </para> - </listitem> - - <listitem> - <para> - Unix Systems for Modern Architectures: Symmetric - Multiprocessing and Caching for Kernel Programmers: - </para> - - <para> - Curt Schimmel's very good introduction to kernel level - locking (not written for Linux, but nearly everything - applies). The book is expensive, but really worth every - penny to understand SMP locking. [ISBN: 0201633388] - </para> - </listitem> - </itemizedlist> - </chapter> - - <chapter id="thanks"> - <title>Thanks</title> - - <para> - Thanks to Telsa Gwynne for DocBooking, neatening and adding - style. - </para> - - <para> - Thanks to Martin Pool, Philipp Rumpf, Stephen Rothwell, Paul - Mackerras, Ruedi Aschwanden, Alan Cox, Manfred Spraul, Tim - Waugh, Pete Zaitcev, James Morris, Robert Love, Paul McKenney, - John Ashby for proofreading, correcting, flaming, commenting. - </para> - - <para> - Thanks to the cabal for having no influence on this document. - </para> - </chapter> - - <glossary id="glossary"> - <title>Glossary</title> - - <glossentry id="gloss-preemption"> - <glossterm>preemption</glossterm> - <glossdef> - <para> - Prior to 2.5, or when <symbol>CONFIG_PREEMPT</symbol> is - unset, processes in user context inside the kernel would not - preempt each other (ie. you had that CPU until you gave it up, - except for interrupts). With the addition of - <symbol>CONFIG_PREEMPT</symbol> in 2.5.4, this changed: when - in user context, higher priority tasks can "cut in": spinlocks - were changed to disable preemption, even on UP. - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-bh"> - <glossterm>bh</glossterm> - <glossdef> - <para> - Bottom Half: for historical reasons, functions with - '_bh' in them often now refer to any software interrupt, e.g. - <function>spin_lock_bh()</function> blocks any software interrupt - on the current CPU. Bottom halves are deprecated, and will - eventually be replaced by tasklets. Only one bottom half will be - running at any time. - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-hwinterrupt"> - <glossterm>Hardware Interrupt / Hardware IRQ</glossterm> - <glossdef> - <para> - Hardware interrupt request. <function>in_irq()</function> returns - <returnvalue>true</returnvalue> in a hardware interrupt handler. - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-interruptcontext"> - <glossterm>Interrupt Context</glossterm> - <glossdef> - <para> - Not user context: processing a hardware irq or software irq. - Indicated by the <function>in_interrupt()</function> macro - returning <returnvalue>true</returnvalue>. - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-smp"> - <glossterm><acronym>SMP</acronym></glossterm> - <glossdef> - <para> - Symmetric Multi-Processor: kernels compiled for multiple-CPU - machines. (CONFIG_SMP=y). - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-softirq"> - <glossterm>Software Interrupt / softirq</glossterm> - <glossdef> - <para> - Software interrupt handler. <function>in_irq()</function> returns - <returnvalue>false</returnvalue>; <function>in_softirq()</function> - returns <returnvalue>true</returnvalue>. Tasklets and softirqs - both fall into the category of 'software interrupts'. - </para> - <para> - Strictly speaking a softirq is one of up to 32 enumerated software - interrupts which can run on multiple CPUs at once. - Sometimes used to refer to tasklets as - well (ie. all software interrupts). - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-tasklet"> - <glossterm>tasklet</glossterm> - <glossdef> - <para> - A dynamically-registrable software interrupt, - which is guaranteed to only run on one CPU at a time. - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-timers"> - <glossterm>timer</glossterm> - <glossdef> - <para> - A dynamically-registrable software interrupt, which is run at - (or close to) a given time. When running, it is just like a - tasklet (in fact, they are called from the TIMER_SOFTIRQ). - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-up"> - <glossterm><acronym>UP</acronym></glossterm> - <glossdef> - <para> - Uni-Processor: Non-SMP. (CONFIG_SMP=n). - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-usercontext"> - <glossterm>User Context</glossterm> - <glossdef> - <para> - The kernel executing on behalf of a particular process (ie. a - system call or trap) or kernel thread. You can tell which - process with the <symbol>current</symbol> macro.) Not to - be confused with userspace. Can be interrupted by software or - hardware interrupts. - </para> - </glossdef> - </glossentry> - - <glossentry id="gloss-userspace"> - <glossterm>Userspace</glossterm> - <glossdef> - <para> - A process executing its own code outside the kernel. - </para> - </glossdef> - </glossentry> - - </glossary> -</book> - diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl deleted file mode 100644 index 856ac20bf367..000000000000 --- a/Documentation/DocBook/kgdb.tmpl +++ /dev/null @@ -1,918 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="kgdbOnLinux"> - <bookinfo> - <title>Using kgdb, kdb and the kernel debugger internals</title> - - <authorgroup> - <author> - <firstname>Jason</firstname> - <surname>Wessel</surname> - <affiliation> - <address> - <email>jason.wessel@windriver.com</email> - </address> - </affiliation> - </author> - </authorgroup> - <copyright> - <year>2008,2010</year> - <holder>Wind River Systems, Inc.</holder> - </copyright> - <copyright> - <year>2004-2005</year> - <holder>MontaVista Software, Inc.</holder> - </copyright> - <copyright> - <year>2004</year> - <holder>Amit S. Kale</holder> - </copyright> - - <legalnotice> - <para> - This file is licensed under the terms of the GNU General Public License - version 2. This program is licensed "as is" without any warranty of any - kind, whether express or implied. - </para> - - </legalnotice> - </bookinfo> - -<toc></toc> - <chapter id="Introduction"> - <title>Introduction</title> - <para> - The kernel has two different debugger front ends (kdb and kgdb) - which interface to the debug core. It is possible to use either - of the debugger front ends and dynamically transition between them - if you configure the kernel properly at compile and runtime. - </para> - <para> - Kdb is simplistic shell-style interface which you can use on a - system console with a keyboard or serial console. You can use it - to inspect memory, registers, process lists, dmesg, and even set - breakpoints to stop in a certain location. Kdb is not a source - level debugger, although you can set breakpoints and execute some - basic kernel run control. Kdb is mainly aimed at doing some - analysis to aid in development or diagnosing kernel problems. You - can access some symbols by name in kernel built-ins or in kernel - modules if the code was built - with <symbol>CONFIG_KALLSYMS</symbol>. - </para> - <para> - Kgdb is intended to be used as a source level debugger for the - Linux kernel. It is used along with gdb to debug a Linux kernel. - The expectation is that gdb can be used to "break in" to the - kernel to inspect memory, variables and look through call stack - information similar to the way an application developer would use - gdb to debug an application. It is possible to place breakpoints - in kernel code and perform some limited execution stepping. - </para> - <para> - Two machines are required for using kgdb. One of these machines is - a development machine and the other is the target machine. The - kernel to be debugged runs on the target machine. The development - machine runs an instance of gdb against the vmlinux file which - contains the symbols (not a boot image such as bzImage, zImage, - uImage...). In gdb the developer specifies the connection - parameters and connects to kgdb. The type of connection a - developer makes with gdb depends on the availability of kgdb I/O - modules compiled as built-ins or loadable kernel modules in the test - machine's kernel. - </para> - </chapter> - <chapter id="CompilingAKernel"> - <title>Compiling a kernel</title> - <para> - <itemizedlist> - <listitem><para>In order to enable compilation of kdb, you must first enable kgdb.</para></listitem> - <listitem><para>The kgdb test compile options are described in the kgdb test suite chapter.</para></listitem> - </itemizedlist> - </para> - <sect1 id="CompileKGDB"> - <title>Kernel config options for kgdb</title> - <para> - To enable <symbol>CONFIG_KGDB</symbol> you should look under - "Kernel hacking" / "Kernel debugging" and select "KGDB: kernel debugger". - </para> - <para> - While it is not a hard requirement that you have symbols in your - vmlinux file, gdb tends not to be very useful without the symbolic - data, so you will want to turn - on <symbol>CONFIG_DEBUG_INFO</symbol> which is called "Compile the - kernel with debug info" in the config menu. - </para> - <para> - It is advised, but not required, that you turn on the - <symbol>CONFIG_FRAME_POINTER</symbol> kernel option which is called "Compile the - kernel with frame pointers" in the config menu. This option - inserts code to into the compiled executable which saves the frame - information in registers or on the stack at different points which - allows a debugger such as gdb to more accurately construct - stack back traces while debugging the kernel. - </para> - <para> - If the architecture that you are using supports the kernel option - CONFIG_STRICT_KERNEL_RWX, you should consider turning it off. This - option will prevent the use of software breakpoints because it - marks certain regions of the kernel's memory space as read-only. - If kgdb supports it for the architecture you are using, you can - use hardware breakpoints if you desire to run with the - CONFIG_STRICT_KERNEL_RWX option turned on, else you need to turn off - this option. - </para> - <para> - Next you should choose one of more I/O drivers to interconnect - debugging host and debugged target. Early boot debugging requires - a KGDB I/O driver that supports early debugging and the driver - must be built into the kernel directly. Kgdb I/O driver - configuration takes place via kernel or module parameters which - you can learn more about in the in the section that describes the - parameter "kgdboc". - </para> - <para>Here is an example set of .config symbols to enable or - disable for kgdb: - <itemizedlist> - <listitem><para># CONFIG_STRICT_KERNEL_RWX is not set</para></listitem> - <listitem><para>CONFIG_FRAME_POINTER=y</para></listitem> - <listitem><para>CONFIG_KGDB=y</para></listitem> - <listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem> - </itemizedlist> - </para> - </sect1> - <sect1 id="CompileKDB"> - <title>Kernel config options for kdb</title> - <para>Kdb is quite a bit more complex than the simple gdbstub - sitting on top of the kernel's debug core. Kdb must implement a - shell, and also adds some helper functions in other parts of the - kernel, responsible for printing out interesting data such as what - you would see if you ran "lsmod", or "ps". In order to build kdb - into the kernel you follow the same steps as you would for kgdb. - </para> - <para>The main config option for kdb - is <symbol>CONFIG_KGDB_KDB</symbol> which is called "KGDB_KDB: - include kdb frontend for kgdb" in the config menu. In theory you - would have already also selected an I/O driver such as the - CONFIG_KGDB_SERIAL_CONSOLE interface if you plan on using kdb on a - serial port, when you were configuring kgdb. - </para> - <para>If you want to use a PS/2-style keyboard with kdb, you would - select CONFIG_KDB_KEYBOARD which is called "KGDB_KDB: keyboard as - input device" in the config menu. The CONFIG_KDB_KEYBOARD option - is not used for anything in the gdb interface to kgdb. The - CONFIG_KDB_KEYBOARD option only works with kdb. - </para> - <para>Here is an example set of .config symbols to enable/disable kdb: - <itemizedlist> - <listitem><para># CONFIG_STRICT_KERNEL_RWX is not set</para></listitem> - <listitem><para>CONFIG_FRAME_POINTER=y</para></listitem> - <listitem><para>CONFIG_KGDB=y</para></listitem> - <listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem> - <listitem><para>CONFIG_KGDB_KDB=y</para></listitem> - <listitem><para>CONFIG_KDB_KEYBOARD=y</para></listitem> - </itemizedlist> - </para> - </sect1> - </chapter> - <chapter id="kgdbKernelArgs"> - <title>Kernel Debugger Boot Arguments</title> - <para>This section describes the various runtime kernel - parameters that affect the configuration of the kernel debugger. - The following chapter covers using kdb and kgdb as well as - providing some examples of the configuration parameters.</para> - <sect1 id="kgdboc"> - <title>Kernel parameter: kgdboc</title> - <para>The kgdboc driver was originally an abbreviation meant to - stand for "kgdb over console". Today it is the primary mechanism - to configure how to communicate from gdb to kgdb as well as the - devices you want to use to interact with the kdb shell. - </para> - <para>For kgdb/gdb, kgdboc is designed to work with a single serial - port. It is intended to cover the circumstance where you want to - use a serial console as your primary console as well as using it to - perform kernel debugging. It is also possible to use kgdb on a - serial port which is not designated as a system console. Kgdboc - may be configured as a kernel built-in or a kernel loadable module. - You can only make use of <constant>kgdbwait</constant> and early - debugging if you build kgdboc into the kernel as a built-in. - </para> - <para>Optionally you can elect to activate kms (Kernel Mode - Setting) integration. When you use kms with kgdboc and you have a - video driver that has atomic mode setting hooks, it is possible to - enter the debugger on the graphics console. When the kernel - execution is resumed, the previous graphics mode will be restored. - This integration can serve as a useful tool to aid in diagnosing - crashes or doing analysis of memory with kdb while allowing the - full graphics console applications to run. - </para> - <sect2 id="kgdbocArgs"> - <title>kgdboc arguments</title> - <para>Usage: <constant>kgdboc=[kms][[,]kbd][[,]serial_device][,baud]</constant></para> - <para>The order listed above must be observed if you use any of the - optional configurations together. - </para> - <para>Abbreviations: - <itemizedlist> - <listitem><para>kms = Kernel Mode Setting</para></listitem> - <listitem><para>kbd = Keyboard</para></listitem> - </itemizedlist> - </para> - <para>You can configure kgdboc to use the keyboard, and/or a serial - device depending on if you are using kdb and/or kgdb, in one of the - following scenarios. The order listed above must be observed if - you use any of the optional configurations together. Using kms + - only gdb is generally not a useful combination.</para> - <sect3 id="kgdbocArgs1"> - <title>Using loadable module or built-in</title> - <para> - <orderedlist> - <listitem><para>As a kernel built-in:</para> - <para>Use the kernel boot argument: <constant>kgdboc=<tty-device>,[baud]</constant></para></listitem> - <listitem> - <para>As a kernel loadable module:</para> - <para>Use the command: <constant>modprobe kgdboc kgdboc=<tty-device>,[baud]</constant></para> - <para>Here are two examples of how you might format the kgdboc - string. The first is for an x86 target using the first serial port. - The second example is for the ARM Versatile AB using the second - serial port. - <orderedlist> - <listitem><para><constant>kgdboc=ttyS0,115200</constant></para></listitem> - <listitem><para><constant>kgdboc=ttyAMA1,115200</constant></para></listitem> - </orderedlist> - </para> - </listitem> - </orderedlist></para> - </sect3> - <sect3 id="kgdbocArgs2"> - <title>Configure kgdboc at runtime with sysfs</title> - <para>At run time you can enable or disable kgdboc by echoing a - parameters into the sysfs. Here are two examples:</para> - <orderedlist> - <listitem><para>Enable kgdboc on ttyS0</para> - <para><constant>echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem> - <listitem><para>Disable kgdboc</para> - <para><constant>echo "" > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem> - </orderedlist> - <para>NOTE: You do not need to specify the baud if you are - configuring the console on tty which is already configured or - open.</para> - </sect3> - <sect3 id="kgdbocArgs3"> - <title>More examples</title> - <para>You can configure kgdboc to use the keyboard, and/or a serial device - depending on if you are using kdb and/or kgdb, in one of the - following scenarios. - <orderedlist> - <listitem><para>kdb and kgdb over only a serial port</para> - <para><constant>kgdboc=<serial_device>[,baud]</constant></para> - <para>Example: <constant>kgdboc=ttyS0,115200</constant></para> - </listitem> - <listitem><para>kdb and kgdb with keyboard and a serial port</para> - <para><constant>kgdboc=kbd,<serial_device>[,baud]</constant></para> - <para>Example: <constant>kgdboc=kbd,ttyS0,115200</constant></para> - </listitem> - <listitem><para>kdb with a keyboard</para> - <para><constant>kgdboc=kbd</constant></para> - </listitem> - <listitem><para>kdb with kernel mode setting</para> - <para><constant>kgdboc=kms,kbd</constant></para> - </listitem> - <listitem><para>kdb with kernel mode setting and kgdb over a serial port</para> - <para><constant>kgdboc=kms,kbd,ttyS0,115200</constant></para> - </listitem> - </orderedlist> - </para> - <para>NOTE: Kgdboc does not support interrupting the target via the - gdb remote protocol. You must manually send a sysrq-g unless you - have a proxy that splits console output to a terminal program. - A console proxy has a separate TCP port for the debugger and a separate - TCP port for the "human" console. The proxy can take care of sending - the sysrq-g for you. - </para> - <para>When using kgdboc with no debugger proxy, you can end up - connecting the debugger at one of two entry points. If an - exception occurs after you have loaded kgdboc, a message should - print on the console stating it is waiting for the debugger. In - this case you disconnect your terminal program and then connect the - debugger in its place. If you want to interrupt the target system - and forcibly enter a debug session you have to issue a Sysrq - sequence and then type the letter <constant>g</constant>. Then - you disconnect the terminal session and connect gdb. Your options - if you don't like this are to hack gdb to send the sysrq-g for you - as well as on the initial connect, or to use a debugger proxy that - allows an unmodified gdb to do the debugging. - </para> - </sect3> - </sect2> - </sect1> - <sect1 id="kgdbwait"> - <title>Kernel parameter: kgdbwait</title> - <para> - The Kernel command line option <constant>kgdbwait</constant> makes - kgdb wait for a debugger connection during booting of a kernel. You - can only use this option if you compiled a kgdb I/O driver into the - kernel and you specified the I/O driver configuration as a kernel - command line option. The kgdbwait parameter should always follow the - configuration parameter for the kgdb I/O driver in the kernel - command line else the I/O driver will not be configured prior to - asking the kernel to use it to wait. - </para> - <para> - The kernel will stop and wait as early as the I/O driver and - architecture allows when you use this option. If you build the - kgdb I/O driver as a loadable kernel module kgdbwait will not do - anything. - </para> - </sect1> - <sect1 id="kgdbcon"> - <title>Kernel parameter: kgdbcon</title> - <para> The kgdbcon feature allows you to see printk() messages - inside gdb while gdb is connected to the kernel. Kdb does not make - use of the kgdbcon feature. - </para> - <para>Kgdb supports using the gdb serial protocol to send console - messages to the debugger when the debugger is connected and running. - There are two ways to activate this feature. - <orderedlist> - <listitem><para>Activate with the kernel command line option:</para> - <para><constant>kgdbcon</constant></para> - </listitem> - <listitem><para>Use sysfs before configuring an I/O driver</para> - <para> - <constant>echo 1 > /sys/module/kgdb/parameters/kgdb_use_con</constant> - </para> - <para> - NOTE: If you do this after you configure the kgdb I/O driver, the - setting will not take effect until the next point the I/O is - reconfigured. - </para> - </listitem> - </orderedlist> - </para> - <para>IMPORTANT NOTE: You cannot use kgdboc + kgdbcon on a tty that is an - active system console. An example of incorrect usage is <constant>console=ttyS0,115200 kgdboc=ttyS0 kgdbcon</constant> - </para> - <para>It is possible to use this option with kgdboc on a tty that is not a system console. - </para> - </sect1> - <sect1 id="kgdbreboot"> - <title>Run time parameter: kgdbreboot</title> - <para> The kgdbreboot feature allows you to change how the debugger - deals with the reboot notification. You have 3 choices for the - behavior. The default behavior is always set to 0.</para> - <orderedlist> - <listitem><para>echo -1 > /sys/module/debug_core/parameters/kgdbreboot</para> - <para>Ignore the reboot notification entirely.</para> - </listitem> - <listitem><para>echo 0 > /sys/module/debug_core/parameters/kgdbreboot</para> - <para>Send the detach message to any attached debugger client.</para> - </listitem> - <listitem><para>echo 1 > /sys/module/debug_core/parameters/kgdbreboot</para> - <para>Enter the debugger on reboot notify.</para> - </listitem> - </orderedlist> - </sect1> - </chapter> - <chapter id="usingKDB"> - <title>Using kdb</title> - <para> - </para> - <sect1 id="quickKDBserial"> - <title>Quick start for kdb on a serial port</title> - <para>This is a quick example of how to use kdb.</para> - <para><orderedlist> - <listitem><para>Configure kgdboc at boot using kernel parameters: - <itemizedlist> - <listitem><para><constant>console=ttyS0,115200 kgdboc=ttyS0,115200</constant></para></listitem> - </itemizedlist></para> - <para>OR</para> - <para>Configure kgdboc after the kernel has booted; assuming you are using a serial port console: - <itemizedlist> - <listitem><para><constant>echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem> - </itemizedlist> - </para> - </listitem> - <listitem><para>Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.</para> - <itemizedlist> - <listitem><para>When logged in as root or with a super user session you can run:</para> - <para><constant>echo g > /proc/sysrq-trigger</constant></para></listitem> - <listitem><para>Example using minicom 2.2</para> - <para>Press: <constant>Control-a</constant></para> - <para>Press: <constant>f</constant></para> - <para>Press: <constant>g</constant></para> - </listitem> - <listitem><para>When you have telneted to a terminal server that supports sending a remote break</para> - <para>Press: <constant>Control-]</constant></para> - <para>Type in:<constant>send break</constant></para> - <para>Press: <constant>Enter</constant></para> - <para>Press: <constant>g</constant></para> - </listitem> - </itemizedlist> - </listitem> - <listitem><para>From the kdb prompt you can run the "help" command to see a complete list of the commands that are available.</para> - <para>Some useful commands in kdb include: - <itemizedlist> - <listitem><para>lsmod -- Shows where kernel modules are loaded</para></listitem> - <listitem><para>ps -- Displays only the active processes</para></listitem> - <listitem><para>ps A -- Shows all the processes</para></listitem> - <listitem><para>summary -- Shows kernel version info and memory usage</para></listitem> - <listitem><para>bt -- Get a backtrace of the current process using dump_stack()</para></listitem> - <listitem><para>dmesg -- View the kernel syslog buffer</para></listitem> - <listitem><para>go -- Continue the system</para></listitem> - </itemizedlist> - </para> - </listitem> - <listitem> - <para>When you are done using kdb you need to consider rebooting the - system or using the "go" command to resuming normal kernel - execution. If you have paused the kernel for a lengthy period of - time, applications that rely on timely networking or anything to do - with real wall clock time could be adversely affected, so you - should take this into consideration when using the kernel - debugger.</para> - </listitem> - </orderedlist></para> - </sect1> - <sect1 id="quickKDBkeyboard"> - <title>Quick start for kdb using a keyboard connected console</title> - <para>This is a quick example of how to use kdb with a keyboard.</para> - <para><orderedlist> - <listitem><para>Configure kgdboc at boot using kernel parameters: - <itemizedlist> - <listitem><para><constant>kgdboc=kbd</constant></para></listitem> - </itemizedlist></para> - <para>OR</para> - <para>Configure kgdboc after the kernel has booted: - <itemizedlist> - <listitem><para><constant>echo kbd > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem> - </itemizedlist> - </para> - </listitem> - <listitem><para>Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.</para> - <itemizedlist> - <listitem><para>When logged in as root or with a super user session you can run:</para> - <para><constant>echo g > /proc/sysrq-trigger</constant></para></listitem> - <listitem><para>Example using a laptop keyboard</para> - <para>Press and hold down: <constant>Alt</constant></para> - <para>Press and hold down: <constant>Fn</constant></para> - <para>Press and release the key with the label: <constant>SysRq</constant></para> - <para>Release: <constant>Fn</constant></para> - <para>Press and release: <constant>g</constant></para> - <para>Release: <constant>Alt</constant></para> - </listitem> - <listitem><para>Example using a PS/2 101-key keyboard</para> - <para>Press and hold down: <constant>Alt</constant></para> - <para>Press and release the key with the label: <constant>SysRq</constant></para> - <para>Press and release: <constant>g</constant></para> - <para>Release: <constant>Alt</constant></para> - </listitem> - </itemizedlist> - </listitem> - <listitem> - <para>Now type in a kdb command such as "help", "dmesg", "bt" or "go" to continue kernel execution.</para> - </listitem> - </orderedlist></para> - </sect1> - </chapter> - <chapter id="EnableKGDB"> - <title>Using kgdb / gdb</title> - <para>In order to use kgdb you must activate it by passing - configuration information to one of the kgdb I/O drivers. If you - do not pass any configuration information kgdb will not do anything - at all. Kgdb will only actively hook up to the kernel trap hooks - if a kgdb I/O driver is loaded and configured. If you unconfigure - a kgdb I/O driver, kgdb will unregister all the kernel hook points. - </para> - <para> All kgdb I/O drivers can be reconfigured at run time, if - <symbol>CONFIG_SYSFS</symbol> and <symbol>CONFIG_MODULES</symbol> - are enabled, by echo'ing a new config string to - <constant>/sys/module/<driver>/parameter/<option></constant>. - The driver can be unconfigured by passing an empty string. You cannot - change the configuration while the debugger is attached. Make sure - to detach the debugger with the <constant>detach</constant> command - prior to trying to unconfigure a kgdb I/O driver. - </para> - <sect1 id="ConnectingGDB"> - <title>Connecting with gdb to a serial port</title> - <orderedlist> - <listitem><para>Configure kgdboc</para> - <para>Configure kgdboc at boot using kernel parameters: - <itemizedlist> - <listitem><para><constant>kgdboc=ttyS0,115200</constant></para></listitem> - </itemizedlist></para> - <para>OR</para> - <para>Configure kgdboc after the kernel has booted: - <itemizedlist> - <listitem><para><constant>echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem> - </itemizedlist></para> - </listitem> - <listitem> - <para>Stop kernel execution (break into the debugger)</para> - <para>In order to connect to gdb via kgdboc, the kernel must - first be stopped. There are several ways to stop the kernel which - include using kgdbwait as a boot argument, via a sysrq-g, or running - the kernel until it takes an exception where it waits for the - debugger to attach. - <itemizedlist> - <listitem><para>When logged in as root or with a super user session you can run:</para> - <para><constant>echo g > /proc/sysrq-trigger</constant></para></listitem> - <listitem><para>Example using minicom 2.2</para> - <para>Press: <constant>Control-a</constant></para> - <para>Press: <constant>f</constant></para> - <para>Press: <constant>g</constant></para> - </listitem> - <listitem><para>When you have telneted to a terminal server that supports sending a remote break</para> - <para>Press: <constant>Control-]</constant></para> - <para>Type in:<constant>send break</constant></para> - <para>Press: <constant>Enter</constant></para> - <para>Press: <constant>g</constant></para> - </listitem> - </itemizedlist> - </para> - </listitem> - <listitem> - <para>Connect from gdb</para> - <para> - Example (using a directly connected port): - </para> - <programlisting> - % gdb ./vmlinux - (gdb) set remotebaud 115200 - (gdb) target remote /dev/ttyS0 - </programlisting> - <para> - Example (kgdb to a terminal server on TCP port 2012): - </para> - <programlisting> - % gdb ./vmlinux - (gdb) target remote 192.168.2.2:2012 - </programlisting> - <para> - Once connected, you can debug a kernel the way you would debug an - application program. - </para> - <para> - If you are having problems connecting or something is going - seriously wrong while debugging, it will most often be the case - that you want to enable gdb to be verbose about its target - communications. You do this prior to issuing the <constant>target - remote</constant> command by typing in: <constant>set debug remote 1</constant> - </para> - </listitem> - </orderedlist> - <para>Remember if you continue in gdb, and need to "break in" again, - you need to issue an other sysrq-g. It is easy to create a simple - entry point by putting a breakpoint at <constant>sys_sync</constant> - and then you can run "sync" from a shell or script to break into the - debugger.</para> - </sect1> - </chapter> - <chapter id="switchKdbKgdb"> - <title>kgdb and kdb interoperability</title> - <para>It is possible to transition between kdb and kgdb dynamically. - The debug core will remember which you used the last time and - automatically start in the same mode.</para> - <sect1> - <title>Switching between kdb and kgdb</title> - <sect2> - <title>Switching from kgdb to kdb</title> - <para> - There are two ways to switch from kgdb to kdb: you can use gdb to - issue a maintenance packet, or you can blindly type the command $3#33. - Whenever the kernel debugger stops in kgdb mode it will print the - message <constant>KGDB or $3#33 for KDB</constant>. It is important - to note that you have to type the sequence correctly in one pass. - You cannot type a backspace or delete because kgdb will interpret - that as part of the debug stream. - <orderedlist> - <listitem><para>Change from kgdb to kdb by blindly typing:</para> - <para><constant>$3#33</constant></para></listitem> - <listitem><para>Change from kgdb to kdb with gdb</para> - <para><constant>maintenance packet 3</constant></para> - <para>NOTE: Now you must kill gdb. Typically you press control-z and - issue the command: kill -9 %</para></listitem> - </orderedlist> - </para> - </sect2> - <sect2> - <title>Change from kdb to kgdb</title> - <para>There are two ways you can change from kdb to kgdb. You can - manually enter kgdb mode by issuing the kgdb command from the kdb - shell prompt, or you can connect gdb while the kdb shell prompt is - active. The kdb shell looks for the typical first commands that gdb - would issue with the gdb remote protocol and if it sees one of those - commands it automatically changes into kgdb mode.</para> - <orderedlist> - <listitem><para>From kdb issue the command:</para> - <para><constant>kgdb</constant></para> - <para>Now disconnect your terminal program and connect gdb in its place</para></listitem> - <listitem><para>At the kdb prompt, disconnect the terminal program and connect gdb in its place.</para></listitem> - </orderedlist> - </sect2> - </sect1> - <sect1> - <title>Running kdb commands from gdb</title> - <para>It is possible to run a limited set of kdb commands from gdb, - using the gdb monitor command. You don't want to execute any of the - run control or breakpoint operations, because it can disrupt the - state of the kernel debugger. You should be using gdb for - breakpoints and run control operations if you have gdb connected. - The more useful commands to run are things like lsmod, dmesg, ps or - possibly some of the memory information commands. To see all the kdb - commands you can run <constant>monitor help</constant>.</para> - <para>Example: - <informalexample><programlisting> -(gdb) monitor ps -1 idle process (state I) and -27 sleeping system daemon (state M) processes suppressed, -use 'ps A' to see all. -Task Addr Pid Parent [*] cpu State Thread Command - -0xc78291d0 1 0 0 0 S 0xc7829404 init -0xc7954150 942 1 0 0 S 0xc7954384 dropbear -0xc78789c0 944 1 0 0 S 0xc7878bf4 sh -(gdb) - </programlisting></informalexample> - </para> - </sect1> - </chapter> - <chapter id="KGDBTestSuite"> - <title>kgdb Test Suite</title> - <para> - When kgdb is enabled in the kernel config you can also elect to - enable the config parameter KGDB_TESTS. Turning this on will - enable a special kgdb I/O module which is designed to test the - kgdb internal functions. - </para> - <para> - The kgdb tests are mainly intended for developers to test the kgdb - internals as well as a tool for developing a new kgdb architecture - specific implementation. These tests are not really for end users - of the Linux kernel. The primary source of documentation would be - to look in the drivers/misc/kgdbts.c file. - </para> - <para> - The kgdb test suite can also be configured at compile time to run - the core set of tests by setting the kernel config parameter - KGDB_TESTS_ON_BOOT. This particular option is aimed at automated - regression testing and does not require modifying the kernel boot - config arguments. If this is turned on, the kgdb test suite can - be disabled by specifying "kgdbts=" as a kernel boot argument. - </para> - </chapter> - <chapter id="CommonBackEndReq"> - <title>Kernel Debugger Internals</title> - <sect1 id="kgdbArchitecture"> - <title>Architecture Specifics</title> - <para> - The kernel debugger is organized into a number of components: - <orderedlist> - <listitem><para>The debug core</para> - <para> - The debug core is found in kernel/debugger/debug_core.c. It contains: - <itemizedlist> - <listitem><para>A generic OS exception handler which includes - sync'ing the processors into a stopped state on an multi-CPU - system.</para></listitem> - <listitem><para>The API to talk to the kgdb I/O drivers</para></listitem> - <listitem><para>The API to make calls to the arch-specific kgdb implementation</para></listitem> - <listitem><para>The logic to perform safe memory reads and writes to memory while using the debugger</para></listitem> - <listitem><para>A full implementation for software breakpoints unless overridden by the arch</para></listitem> - <listitem><para>The API to invoke either the kdb or kgdb frontend to the debug core.</para></listitem> - <listitem><para>The structures and callback API for atomic kernel mode setting.</para> - <para>NOTE: kgdboc is where the kms callbacks are invoked.</para></listitem> - </itemizedlist> - </para> - </listitem> - <listitem><para>kgdb arch-specific implementation</para> - <para> - This implementation is generally found in arch/*/kernel/kgdb.c. - As an example, arch/x86/kernel/kgdb.c contains the specifics to - implement HW breakpoint as well as the initialization to - dynamically register and unregister for the trap handlers on - this architecture. The arch-specific portion implements: - <itemizedlist> - <listitem><para>contains an arch-specific trap catcher which - invokes kgdb_handle_exception() to start kgdb about doing its - work</para></listitem> - <listitem><para>translation to and from gdb specific packet format to pt_regs</para></listitem> - <listitem><para>Registration and unregistration of architecture specific trap hooks</para></listitem> - <listitem><para>Any special exception handling and cleanup</para></listitem> - <listitem><para>NMI exception handling and cleanup</para></listitem> - <listitem><para>(optional) HW breakpoints</para></listitem> - </itemizedlist> - </para> - </listitem> - <listitem><para>gdbstub frontend (aka kgdb)</para> - <para>The gdbstub is located in kernel/debug/gdbstub.c. It contains:</para> - <itemizedlist> - <listitem><para>All the logic to implement the gdb serial protocol</para></listitem> - </itemizedlist> - </listitem> - <listitem><para>kdb frontend</para> - <para>The kdb debugger shell is broken down into a number of - components. The kdb core is located in kernel/debug/kdb. There - are a number of helper functions in some of the other kernel - components to make it possible for kdb to examine and report - information about the kernel without taking locks that could - cause a kernel deadlock. The kdb core contains implements the following functionality.</para> - <itemizedlist> - <listitem><para>A simple shell</para></listitem> - <listitem><para>The kdb core command set</para></listitem> - <listitem><para>A registration API to register additional kdb shell commands.</para> - <itemizedlist> - <listitem><para>A good example of a self-contained kdb module - is the "ftdump" command for dumping the ftrace buffer. See: - kernel/trace/trace_kdb.c</para></listitem> - <listitem><para>For an example of how to dynamically register - a new kdb command you can build the kdb_hello.ko kernel module - from samples/kdb/kdb_hello.c. To build this example you can - set CONFIG_SAMPLES=y and CONFIG_SAMPLE_KDB=m in your kernel - config. Later run "modprobe kdb_hello" and the next time you - enter the kdb shell, you can run the "hello" - command.</para></listitem> - </itemizedlist></listitem> - <listitem><para>The implementation for kdb_printf() which - emits messages directly to I/O drivers, bypassing the kernel - log.</para></listitem> - <listitem><para>SW / HW breakpoint management for the kdb shell</para></listitem> - </itemizedlist> - </listitem> - <listitem><para>kgdb I/O driver</para> - <para> - Each kgdb I/O driver has to provide an implementation for the following: - <itemizedlist> - <listitem><para>configuration via built-in or module</para></listitem> - <listitem><para>dynamic configuration and kgdb hook registration calls</para></listitem> - <listitem><para>read and write character interface</para></listitem> - <listitem><para>A cleanup handler for unconfiguring from the kgdb core</para></listitem> - <listitem><para>(optional) Early debug methodology</para></listitem> - </itemizedlist> - Any given kgdb I/O driver has to operate very closely with the - hardware and must do it in such a way that does not enable - interrupts or change other parts of the system context without - completely restoring them. The kgdb core will repeatedly "poll" - a kgdb I/O driver for characters when it needs input. The I/O - driver is expected to return immediately if there is no data - available. Doing so allows for the future possibility to touch - watchdog hardware in such a way as to have a target system not - reset when these are enabled. - </para> - </listitem> - </orderedlist> - </para> - <para> - If you are intent on adding kgdb architecture specific support - for a new architecture, the architecture should define - <constant>HAVE_ARCH_KGDB</constant> in the architecture specific - Kconfig file. This will enable kgdb for the architecture, and - at that point you must create an architecture specific kgdb - implementation. - </para> - <para> - There are a few flags which must be set on every architecture in - their <asm/kgdb.h> file. These are: - <itemizedlist> - <listitem> - <para> - NUMREGBYTES: The size in bytes of all of the registers, so - that we can ensure they will all fit into a packet. - </para> - </listitem> - <listitem> - <para> - BUFMAX: The size in bytes of the buffer GDB will read into. - This must be larger than NUMREGBYTES. - </para> - </listitem> - <listitem> - <para> - CACHE_FLUSH_IS_SAFE: Set to 1 if it is always safe to call - flush_cache_range or flush_icache_range. On some architectures, - these functions may not be safe to call on SMP since we keep other - CPUs in a holding pattern. - </para> - </listitem> - </itemizedlist> - </para> - <para> - There are also the following functions for the common backend, - found in kernel/kgdb.c, that must be supplied by the - architecture-specific backend unless marked as (optional), in - which case a default function maybe used if the architecture - does not need to provide a specific implementation. - </para> -!Iinclude/linux/kgdb.h - </sect1> - <sect1 id="kgdbocDesign"> - <title>kgdboc internals</title> - <sect2> - <title>kgdboc and uarts</title> - <para> - The kgdboc driver is actually a very thin driver that relies on the - underlying low level to the hardware driver having "polling hooks" - to which the tty driver is attached. In the initial - implementation of kgdboc the serial_core was changed to expose a - low level UART hook for doing polled mode reading and writing of a - single character while in an atomic context. When kgdb makes an I/O - request to the debugger, kgdboc invokes a callback in the serial - core which in turn uses the callback in the UART driver.</para> - <para> - When using kgdboc with a UART, the UART driver must implement two callbacks in the <constant>struct uart_ops</constant>. Example from drivers/8250.c:<programlisting> -#ifdef CONFIG_CONSOLE_POLL - .poll_get_char = serial8250_get_poll_char, - .poll_put_char = serial8250_put_poll_char, -#endif - </programlisting> - Any implementation specifics around creating a polling driver use the - <constant>#ifdef CONFIG_CONSOLE_POLL</constant>, as shown above. - Keep in mind that polling hooks have to be implemented in such a way - that they can be called from an atomic context and have to restore - the state of the UART chip on return such that the system can return - to normal when the debugger detaches. You need to be very careful - with any kind of lock you consider, because failing here is most likely - going to mean pressing the reset button. - </para> - </sect2> - <sect2 id="kgdbocKbd"> - <title>kgdboc and keyboards</title> - <para>The kgdboc driver contains logic to configure communications - with an attached keyboard. The keyboard infrastructure is only - compiled into the kernel when CONFIG_KDB_KEYBOARD=y is set in the - kernel configuration.</para> - <para>The core polled keyboard driver driver for PS/2 type keyboards - is in drivers/char/kdb_keyboard.c. This driver is hooked into the - debug core when kgdboc populates the callback in the array - called <constant>kdb_poll_funcs[]</constant>. The - kdb_get_kbd_char() is the top-level function which polls hardware - for single character input. - </para> - </sect2> - <sect2 id="kgdbocKms"> - <title>kgdboc and kms</title> - <para>The kgdboc driver contains logic to request the graphics - display to switch to a text context when you are using - "kgdboc=kms,kbd", provided that you have a video driver which has a - frame buffer console and atomic kernel mode setting support.</para> - <para> - Every time the kernel - debugger is entered it calls kgdboc_pre_exp_handler() which in turn - calls con_debug_enter() in the virtual console layer. On resuming kernel - execution, the kernel debugger calls kgdboc_post_exp_handler() which - in turn calls con_debug_leave().</para> - <para>Any video driver that wants to be compatible with the kernel - debugger and the atomic kms callbacks must implement the - mode_set_base_atomic, fb_debug_enter and fb_debug_leave operations. - For the fb_debug_enter and fb_debug_leave the option exists to use - the generic drm fb helper functions or implement something custom for - the hardware. The following example shows the initialization of the - .mode_set_base_atomic operation in - drivers/gpu/drm/i915/intel_display.c: - <informalexample> - <programlisting> -static const struct drm_crtc_helper_funcs intel_helper_funcs = { -[...] - .mode_set_base_atomic = intel_pipe_set_base_atomic, -[...] -}; - </programlisting> - </informalexample> - </para> - <para>Here is an example of how the i915 driver initializes the fb_debug_enter and fb_debug_leave functions to use the generic drm helpers in - drivers/gpu/drm/i915/intel_fb.c: - <informalexample> - <programlisting> -static struct fb_ops intelfb_ops = { -[...] - .fb_debug_enter = drm_fb_helper_debug_enter, - .fb_debug_leave = drm_fb_helper_debug_leave, -[...] -}; - </programlisting> - </informalexample> - </para> - </sect2> - </sect1> - </chapter> - <chapter id="credits"> - <title>Credits</title> - <para> - The following people have contributed to this document: - <orderedlist> - <listitem><para>Amit Kale<email>amitkale@linsyssoft.com</email></para></listitem> - <listitem><para>Tom Rini<email>trini@kernel.crashing.org</email></para></listitem> - </orderedlist> - In March 2008 this document was completely rewritten by: - <itemizedlist> - <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem> - </itemizedlist> - In Jan 2010 this document was updated to include kdb. - <itemizedlist> - <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem> - </itemizedlist> - </para> - </chapter> -</book> - diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl deleted file mode 100644 index 0320910b866d..000000000000 --- a/Documentation/DocBook/libata.tmpl +++ /dev/null @@ -1,1625 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="libataDevGuide"> - <bookinfo> - <title>libATA Developer's Guide</title> - - <authorgroup> - <author> - <firstname>Jeff</firstname> - <surname>Garzik</surname> - </author> - </authorgroup> - - <copyright> - <year>2003-2006</year> - <holder>Jeff Garzik</holder> - </copyright> - - <legalnotice> - <para> - The contents of this file are subject to the Open - Software License version 1.1 that can be found at - <ulink url="http://fedoraproject.org/wiki/Licensing:OSL1.1">http://fedoraproject.org/wiki/Licensing:OSL1.1</ulink> - and is included herein by reference. - </para> - - <para> - Alternatively, the contents of this file may be used under the terms - of the GNU General Public License version 2 (the "GPL") as distributed - in the kernel source COPYING file, in which case the provisions of - the GPL are applicable instead of the above. If you wish to allow - the use of your version of this file only under the terms of the - GPL and not to allow others to use your version of this file under - the OSL, indicate your decision by deleting the provisions above and - replace them with the notice and other provisions required by the GPL. - If you do not delete the provisions above, a recipient may use your - version of this file under either the OSL or the GPL. - </para> - - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="libataIntroduction"> - <title>Introduction</title> - <para> - libATA is a library used inside the Linux kernel to support ATA host - controllers and devices. libATA provides an ATA driver API, class - transports for ATA and ATAPI devices, and SCSI<->ATA translation - for ATA devices according to the T10 SAT specification. - </para> - <para> - This Guide documents the libATA driver API, library functions, library - internals, and a couple sample ATA low-level drivers. - </para> - </chapter> - - <chapter id="libataDriverApi"> - <title>libata Driver API</title> - <para> - struct ata_port_operations is defined for every low-level libata - hardware driver, and it controls how the low-level driver - interfaces with the ATA and SCSI layers. - </para> - <para> - FIS-based drivers will hook into the system with ->qc_prep() and - ->qc_issue() high-level hooks. Hardware which behaves in a manner - similar to PCI IDE hardware may utilize several generic helpers, - defining at a bare minimum the bus I/O addresses of the ATA shadow - register blocks. - </para> - <sect1> - <title>struct ata_port_operations</title> - - <sect2><title>Disable ATA port</title> - <programlisting> -void (*port_disable) (struct ata_port *); - </programlisting> - - <para> - Called from ata_bus_probe() error path, as well as when - unregistering from the SCSI module (rmmod, hot unplug). - This function should do whatever needs to be done to take the - port out of use. In most cases, ata_port_disable() can be used - as this hook. - </para> - <para> - Called from ata_bus_probe() on a failed probe. - Called from ata_scsi_release(). - </para> - - </sect2> - - <sect2><title>Post-IDENTIFY device configuration</title> - <programlisting> -void (*dev_config) (struct ata_port *, struct ata_device *); - </programlisting> - - <para> - Called after IDENTIFY [PACKET] DEVICE is issued to each device - found. Typically used to apply device-specific fixups prior to - issue of SET FEATURES - XFER MODE, and prior to operation. - </para> - <para> - This entry may be specified as NULL in ata_port_operations. - </para> - - </sect2> - - <sect2><title>Set PIO/DMA mode</title> - <programlisting> -void (*set_piomode) (struct ata_port *, struct ata_device *); -void (*set_dmamode) (struct ata_port *, struct ata_device *); -void (*post_set_mode) (struct ata_port *); -unsigned int (*mode_filter) (struct ata_port *, struct ata_device *, unsigned int); - </programlisting> - - <para> - Hooks called prior to the issue of SET FEATURES - XFER MODE - command. The optional ->mode_filter() hook is called when libata - has built a mask of the possible modes. This is passed to the - ->mode_filter() function which should return a mask of valid modes - after filtering those unsuitable due to hardware limits. It is not - valid to use this interface to add modes. - </para> - <para> - dev->pio_mode and dev->dma_mode are guaranteed to be valid when - ->set_piomode() and when ->set_dmamode() is called. The timings for - any other drive sharing the cable will also be valid at this point. - That is the library records the decisions for the modes of each - drive on a channel before it attempts to set any of them. - </para> - <para> - ->post_set_mode() is - called unconditionally, after the SET FEATURES - XFER MODE - command completes successfully. - </para> - - <para> - ->set_piomode() is always called (if present), but - ->set_dma_mode() is only called if DMA is possible. - </para> - - </sect2> - - <sect2><title>Taskfile read/write</title> - <programlisting> -void (*sff_tf_load) (struct ata_port *ap, struct ata_taskfile *tf); -void (*sff_tf_read) (struct ata_port *ap, struct ata_taskfile *tf); - </programlisting> - - <para> - ->tf_load() is called to load the given taskfile into hardware - registers / DMA buffers. ->tf_read() is called to read the - hardware registers / DMA buffers, to obtain the current set of - taskfile register values. - Most drivers for taskfile-based hardware (PIO or MMIO) use - ata_sff_tf_load() and ata_sff_tf_read() for these hooks. - </para> - - </sect2> - - <sect2><title>PIO data read/write</title> - <programlisting> -void (*sff_data_xfer) (struct ata_device *, unsigned char *, unsigned int, int); - </programlisting> - - <para> -All bmdma-style drivers must implement this hook. This is the low-level -operation that actually copies the data bytes during a PIO data -transfer. -Typically the driver will choose one of ata_sff_data_xfer_noirq(), -ata_sff_data_xfer(), or ata_sff_data_xfer32(). - </para> - - </sect2> - - <sect2><title>ATA command execute</title> - <programlisting> -void (*sff_exec_command)(struct ata_port *ap, struct ata_taskfile *tf); - </programlisting> - - <para> - causes an ATA command, previously loaded with - ->tf_load(), to be initiated in hardware. - Most drivers for taskfile-based hardware use ata_sff_exec_command() - for this hook. - </para> - - </sect2> - - <sect2><title>Per-cmd ATAPI DMA capabilities filter</title> - <programlisting> -int (*check_atapi_dma) (struct ata_queued_cmd *qc); - </programlisting> - - <para> -Allow low-level driver to filter ATA PACKET commands, returning a status -indicating whether or not it is OK to use DMA for the supplied PACKET -command. - </para> - <para> - This hook may be specified as NULL, in which case libata will - assume that atapi dma can be supported. - </para> - - </sect2> - - <sect2><title>Read specific ATA shadow registers</title> - <programlisting> -u8 (*sff_check_status)(struct ata_port *ap); -u8 (*sff_check_altstatus)(struct ata_port *ap); - </programlisting> - - <para> - Reads the Status/AltStatus ATA shadow register from - hardware. On some hardware, reading the Status register has - the side effect of clearing the interrupt condition. - Most drivers for taskfile-based hardware use - ata_sff_check_status() for this hook. - </para> - - </sect2> - - <sect2><title>Write specific ATA shadow register</title> - <programlisting> -void (*sff_set_devctl)(struct ata_port *ap, u8 ctl); - </programlisting> - - <para> - Write the device control ATA shadow register to the hardware. - Most drivers don't need to define this. - </para> - - </sect2> - - <sect2><title>Select ATA device on bus</title> - <programlisting> -void (*sff_dev_select)(struct ata_port *ap, unsigned int device); - </programlisting> - - <para> - Issues the low-level hardware command(s) that causes one of N - hardware devices to be considered 'selected' (active and - available for use) on the ATA bus. This generally has no - meaning on FIS-based devices. - </para> - <para> - Most drivers for taskfile-based hardware use - ata_sff_dev_select() for this hook. - </para> - - </sect2> - - <sect2><title>Private tuning method</title> - <programlisting> -void (*set_mode) (struct ata_port *ap); - </programlisting> - - <para> - By default libata performs drive and controller tuning in - accordance with the ATA timing rules and also applies blacklists - and cable limits. Some controllers need special handling and have - custom tuning rules, typically raid controllers that use ATA - commands but do not actually do drive timing. - </para> - - <warning> - <para> - This hook should not be used to replace the standard controller - tuning logic when a controller has quirks. Replacing the default - tuning logic in that case would bypass handling for drive and - bridge quirks that may be important to data reliability. If a - controller needs to filter the mode selection it should use the - mode_filter hook instead. - </para> - </warning> - - </sect2> - - <sect2><title>Control PCI IDE BMDMA engine</title> - <programlisting> -void (*bmdma_setup) (struct ata_queued_cmd *qc); -void (*bmdma_start) (struct ata_queued_cmd *qc); -void (*bmdma_stop) (struct ata_port *ap); -u8 (*bmdma_status) (struct ata_port *ap); - </programlisting> - - <para> -When setting up an IDE BMDMA transaction, these hooks arm -(->bmdma_setup), fire (->bmdma_start), and halt (->bmdma_stop) -the hardware's DMA engine. ->bmdma_status is used to read the standard -PCI IDE DMA Status register. - </para> - - <para> -These hooks are typically either no-ops, or simply not implemented, in -FIS-based drivers. - </para> - <para> -Most legacy IDE drivers use ata_bmdma_setup() for the bmdma_setup() -hook. ata_bmdma_setup() will write the pointer to the PRD table to -the IDE PRD Table Address register, enable DMA in the DMA Command -register, and call exec_command() to begin the transfer. - </para> - <para> -Most legacy IDE drivers use ata_bmdma_start() for the bmdma_start() -hook. ata_bmdma_start() will write the ATA_DMA_START flag to the DMA -Command register. - </para> - <para> -Many legacy IDE drivers use ata_bmdma_stop() for the bmdma_stop() -hook. ata_bmdma_stop() clears the ATA_DMA_START flag in the DMA -command register. - </para> - <para> -Many legacy IDE drivers use ata_bmdma_status() as the bmdma_status() hook. - </para> - - </sect2> - - <sect2><title>High-level taskfile hooks</title> - <programlisting> -void (*qc_prep) (struct ata_queued_cmd *qc); -int (*qc_issue) (struct ata_queued_cmd *qc); - </programlisting> - - <para> - Higher-level hooks, these two hooks can potentially supercede - several of the above taskfile/DMA engine hooks. ->qc_prep is - called after the buffers have been DMA-mapped, and is typically - used to populate the hardware's DMA scatter-gather table. - Most drivers use the standard ata_qc_prep() helper function, but - more advanced drivers roll their own. - </para> - <para> - ->qc_issue is used to make a command active, once the hardware - and S/G tables have been prepared. IDE BMDMA drivers use the - helper function ata_qc_issue_prot() for taskfile protocol-based - dispatch. More advanced drivers implement their own ->qc_issue. - </para> - <para> - ata_qc_issue_prot() calls ->tf_load(), ->bmdma_setup(), and - ->bmdma_start() as necessary to initiate a transfer. - </para> - - </sect2> - - <sect2><title>Exception and probe handling (EH)</title> - <programlisting> -void (*eng_timeout) (struct ata_port *ap); -void (*phy_reset) (struct ata_port *ap); - </programlisting> - - <para> -Deprecated. Use ->error_handler() instead. - </para> - - <programlisting> -void (*freeze) (struct ata_port *ap); -void (*thaw) (struct ata_port *ap); - </programlisting> - - <para> -ata_port_freeze() is called when HSM violations or some other -condition disrupts normal operation of the port. A frozen port -is not allowed to perform any operation until the port is -thawed, which usually follows a successful reset. - </para> - - <para> -The optional ->freeze() callback can be used for freezing the port -hardware-wise (e.g. mask interrupt and stop DMA engine). If a -port cannot be frozen hardware-wise, the interrupt handler -must ack and clear interrupts unconditionally while the port -is frozen. - </para> - <para> -The optional ->thaw() callback is called to perform the opposite of ->freeze(): -prepare the port for normal operation once again. Unmask interrupts, -start DMA engine, etc. - </para> - - <programlisting> -void (*error_handler) (struct ata_port *ap); - </programlisting> - - <para> -->error_handler() is a driver's hook into probe, hotplug, and recovery -and other exceptional conditions. The primary responsibility of an -implementation is to call ata_do_eh() or ata_bmdma_drive_eh() with a set -of EH hooks as arguments: - </para> - - <para> -'prereset' hook (may be NULL) is called during an EH reset, before any other actions -are taken. - </para> - - <para> -'postreset' hook (may be NULL) is called after the EH reset is performed. Based on -existing conditions, severity of the problem, and hardware capabilities, - </para> - - <para> -Either 'softreset' (may be NULL) or 'hardreset' (may be NULL) will be -called to perform the low-level EH reset. - </para> - - <programlisting> -void (*post_internal_cmd) (struct ata_queued_cmd *qc); - </programlisting> - - <para> -Perform any hardware-specific actions necessary to finish processing -after executing a probe-time or EH-time command via ata_exec_internal(). - </para> - - </sect2> - - <sect2><title>Hardware interrupt handling</title> - <programlisting> -irqreturn_t (*irq_handler)(int, void *, struct pt_regs *); -void (*irq_clear) (struct ata_port *); - </programlisting> - - <para> - ->irq_handler is the interrupt handling routine registered with - the system, by libata. ->irq_clear is called during probe just - before the interrupt handler is registered, to be sure hardware - is quiet. - </para> - <para> - The second argument, dev_instance, should be cast to a pointer - to struct ata_host_set. - </para> - <para> - Most legacy IDE drivers use ata_sff_interrupt() for the - irq_handler hook, which scans all ports in the host_set, - determines which queued command was active (if any), and calls - ata_sff_host_intr(ap,qc). - </para> - <para> - Most legacy IDE drivers use ata_sff_irq_clear() for the - irq_clear() hook, which simply clears the interrupt and error - flags in the DMA status register. - </para> - - </sect2> - - <sect2><title>SATA phy read/write</title> - <programlisting> -int (*scr_read) (struct ata_port *ap, unsigned int sc_reg, - u32 *val); -int (*scr_write) (struct ata_port *ap, unsigned int sc_reg, - u32 val); - </programlisting> - - <para> - Read and write standard SATA phy registers. Currently only used - if ->phy_reset hook called the sata_phy_reset() helper function. - sc_reg is one of SCR_STATUS, SCR_CONTROL, SCR_ERROR, or SCR_ACTIVE. - </para> - - </sect2> - - <sect2><title>Init and shutdown</title> - <programlisting> -int (*port_start) (struct ata_port *ap); -void (*port_stop) (struct ata_port *ap); -void (*host_stop) (struct ata_host_set *host_set); - </programlisting> - - <para> - ->port_start() is called just after the data structures for each - port are initialized. Typically this is used to alloc per-port - DMA buffers / tables / rings, enable DMA engines, and similar - tasks. Some drivers also use this entry point as a chance to - allocate driver-private memory for ap->private_data. - </para> - <para> - Many drivers use ata_port_start() as this hook or call - it from their own port_start() hooks. ata_port_start() - allocates space for a legacy IDE PRD table and returns. - </para> - <para> - ->port_stop() is called after ->host_stop(). Its sole function - is to release DMA/memory resources, now that they are no longer - actively being used. Many drivers also free driver-private - data from port at this time. - </para> - <para> - ->host_stop() is called after all ->port_stop() calls -have completed. The hook must finalize hardware shutdown, release DMA -and other resources, etc. - This hook may be specified as NULL, in which case it is not called. - </para> - - </sect2> - - </sect1> - </chapter> - - <chapter id="libataEH"> - <title>Error handling</title> - - <para> - This chapter describes how errors are handled under libata. - Readers are advised to read SCSI EH - (Documentation/scsi/scsi_eh.txt) and ATA exceptions doc first. - </para> - - <sect1><title>Origins of commands</title> - <para> - In libata, a command is represented with struct ata_queued_cmd - or qc. qc's are preallocated during port initialization and - repetitively used for command executions. Currently only one - qc is allocated per port but yet-to-be-merged NCQ branch - allocates one for each tag and maps each qc to NCQ tag 1-to-1. - </para> - <para> - libata commands can originate from two sources - libata itself - and SCSI midlayer. libata internal commands are used for - initialization and error handling. All normal blk requests - and commands for SCSI emulation are passed as SCSI commands - through queuecommand callback of SCSI host template. - </para> - </sect1> - - <sect1><title>How commands are issued</title> - - <variablelist> - - <varlistentry><term>Internal commands</term> - <listitem> - <para> - First, qc is allocated and initialized using - ata_qc_new_init(). Although ata_qc_new_init() doesn't - implement any wait or retry mechanism when qc is not - available, internal commands are currently issued only during - initialization and error recovery, so no other command is - active and allocation is guaranteed to succeed. - </para> - <para> - Once allocated qc's taskfile is initialized for the command to - be executed. qc currently has two mechanisms to notify - completion. One is via qc->complete_fn() callback and the - other is completion qc->waiting. qc->complete_fn() callback - is the asynchronous path used by normal SCSI translated - commands and qc->waiting is the synchronous (issuer sleeps in - process context) path used by internal commands. - </para> - <para> - Once initialization is complete, host_set lock is acquired - and the qc is issued. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>SCSI commands</term> - <listitem> - <para> - All libata drivers use ata_scsi_queuecmd() as - hostt->queuecommand callback. scmds can either be simulated - or translated. No qc is involved in processing a simulated - scmd. The result is computed right away and the scmd is - completed. - </para> - <para> - For a translated scmd, ata_qc_new_init() is invoked to - allocate a qc and the scmd is translated into the qc. SCSI - midlayer's completion notification function pointer is stored - into qc->scsidone. - </para> - <para> - qc->complete_fn() callback is used for completion - notification. ATA commands use ata_scsi_qc_complete() while - ATAPI commands use atapi_qc_complete(). Both functions end up - calling qc->scsidone to notify upper layer when the qc is - finished. After translation is completed, the qc is issued - with ata_qc_issue(). - </para> - <para> - Note that SCSI midlayer invokes hostt->queuecommand while - holding host_set lock, so all above occur while holding - host_set lock. - </para> - </listitem> - </varlistentry> - - </variablelist> - </sect1> - - <sect1><title>How commands are processed</title> - <para> - Depending on which protocol and which controller are used, - commands are processed differently. For the purpose of - discussion, a controller which uses taskfile interface and all - standard callbacks is assumed. - </para> - <para> - Currently 6 ATA command protocols are used. They can be - sorted into the following four categories according to how - they are processed. - </para> - - <variablelist> - <varlistentry><term>ATA NO DATA or DMA</term> - <listitem> - <para> - ATA_PROT_NODATA and ATA_PROT_DMA fall into this category. - These types of commands don't require any software - intervention once issued. Device will raise interrupt on - completion. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>ATA PIO</term> - <listitem> - <para> - ATA_PROT_PIO is in this category. libata currently - implements PIO with polling. ATA_NIEN bit is set to turn - off interrupt and pio_task on ata_wq performs polling and - IO. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>ATAPI NODATA or DMA</term> - <listitem> - <para> - ATA_PROT_ATAPI_NODATA and ATA_PROT_ATAPI_DMA are in this - category. packet_task is used to poll BSY bit after - issuing PACKET command. Once BSY is turned off by the - device, packet_task transfers CDB and hands off processing - to interrupt handler. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>ATAPI PIO</term> - <listitem> - <para> - ATA_PROT_ATAPI is in this category. ATA_NIEN bit is set - and, as in ATAPI NODATA or DMA, packet_task submits cdb. - However, after submitting cdb, further processing (data - transfer) is handed off to pio_task. - </para> - </listitem> - </varlistentry> - </variablelist> - </sect1> - - <sect1><title>How commands are completed</title> - <para> - Once issued, all qc's are either completed with - ata_qc_complete() or time out. For commands which are handled - by interrupts, ata_host_intr() invokes ata_qc_complete(), and, - for PIO tasks, pio_task invokes ata_qc_complete(). In error - cases, packet_task may also complete commands. - </para> - <para> - ata_qc_complete() does the following. - </para> - - <orderedlist> - - <listitem> - <para> - DMA memory is unmapped. - </para> - </listitem> - - <listitem> - <para> - ATA_QCFLAG_ACTIVE is cleared from qc->flags. - </para> - </listitem> - - <listitem> - <para> - qc->complete_fn() callback is invoked. If the return value of - the callback is not zero. Completion is short circuited and - ata_qc_complete() returns. - </para> - </listitem> - - <listitem> - <para> - __ata_qc_complete() is called, which does - <orderedlist> - - <listitem> - <para> - qc->flags is cleared to zero. - </para> - </listitem> - - <listitem> - <para> - ap->active_tag and qc->tag are poisoned. - </para> - </listitem> - - <listitem> - <para> - qc->waiting is cleared & completed (in that order). - </para> - </listitem> - - <listitem> - <para> - qc is deallocated by clearing appropriate bit in ap->qactive. - </para> - </listitem> - - </orderedlist> - </para> - </listitem> - - </orderedlist> - - <para> - So, it basically notifies upper layer and deallocates qc. One - exception is short-circuit path in #3 which is used by - atapi_qc_complete(). - </para> - <para> - For all non-ATAPI commands, whether it fails or not, almost - the same code path is taken and very little error handling - takes place. A qc is completed with success status if it - succeeded, with failed status otherwise. - </para> - <para> - However, failed ATAPI commands require more handling as - REQUEST SENSE is needed to acquire sense data. If an ATAPI - command fails, ata_qc_complete() is invoked with error status, - which in turn invokes atapi_qc_complete() via - qc->complete_fn() callback. - </para> - <para> - This makes atapi_qc_complete() set scmd->result to - SAM_STAT_CHECK_CONDITION, complete the scmd and return 1. As - the sense data is empty but scmd->result is CHECK CONDITION, - SCSI midlayer will invoke EH for the scmd, and returning 1 - makes ata_qc_complete() to return without deallocating the qc. - This leads us to ata_scsi_error() with partially completed qc. - </para> - - </sect1> - - <sect1><title>ata_scsi_error()</title> - <para> - ata_scsi_error() is the current transportt->eh_strategy_handler() - for libata. As discussed above, this will be entered in two - cases - timeout and ATAPI error completion. This function - calls low level libata driver's eng_timeout() callback, the - standard callback for which is ata_eng_timeout(). It checks - if a qc is active and calls ata_qc_timeout() on the qc if so. - Actual error handling occurs in ata_qc_timeout(). - </para> - <para> - If EH is invoked for timeout, ata_qc_timeout() stops BMDMA and - completes the qc. Note that as we're currently in EH, we - cannot call scsi_done. As described in SCSI EH doc, a - recovered scmd should be either retried with - scsi_queue_insert() or finished with scsi_finish_command(). - Here, we override qc->scsidone with scsi_finish_command() and - calls ata_qc_complete(). - </para> - <para> - If EH is invoked due to a failed ATAPI qc, the qc here is - completed but not deallocated. The purpose of this - half-completion is to use the qc as place holder to make EH - code reach this place. This is a bit hackish, but it works. - </para> - <para> - Once control reaches here, the qc is deallocated by invoking - __ata_qc_complete() explicitly. Then, internal qc for REQUEST - SENSE is issued. Once sense data is acquired, scmd is - finished by directly invoking scsi_finish_command() on the - scmd. Note that as we already have completed and deallocated - the qc which was associated with the scmd, we don't need - to/cannot call ata_qc_complete() again. - </para> - - </sect1> - - <sect1><title>Problems with the current EH</title> - - <itemizedlist> - - <listitem> - <para> - Error representation is too crude. Currently any and all - error conditions are represented with ATA STATUS and ERROR - registers. Errors which aren't ATA device errors are treated - as ATA device errors by setting ATA_ERR bit. Better error - descriptor which can properly represent ATA and other - errors/exceptions is needed. - </para> - </listitem> - - <listitem> - <para> - When handling timeouts, no action is taken to make device - forget about the timed out command and ready for new commands. - </para> - </listitem> - - <listitem> - <para> - EH handling via ata_scsi_error() is not properly protected - from usual command processing. On EH entrance, the device is - not in quiescent state. Timed out commands may succeed or - fail any time. pio_task and atapi_task may still be running. - </para> - </listitem> - - <listitem> - <para> - Too weak error recovery. Devices / controllers causing HSM - mismatch errors and other errors quite often require reset to - return to known state. Also, advanced error handling is - necessary to support features like NCQ and hotplug. - </para> - </listitem> - - <listitem> - <para> - ATA errors are directly handled in the interrupt handler and - PIO errors in pio_task. This is problematic for advanced - error handling for the following reasons. - </para> - <para> - First, advanced error handling often requires context and - internal qc execution. - </para> - <para> - Second, even a simple failure (say, CRC error) needs - information gathering and could trigger complex error handling - (say, resetting & reconfiguring). Having multiple code - paths to gather information, enter EH and trigger actions - makes life painful. - </para> - <para> - Third, scattered EH code makes implementing low level drivers - difficult. Low level drivers override libata callbacks. If - EH is scattered over several places, each affected callbacks - should perform its part of error handling. This can be error - prone and painful. - </para> - </listitem> - - </itemizedlist> - </sect1> - </chapter> - - <chapter id="libataExt"> - <title>libata Library</title> -!Edrivers/ata/libata-core.c - </chapter> - - <chapter id="libataInt"> - <title>libata Core Internals</title> -!Idrivers/ata/libata-core.c - </chapter> - - <chapter id="libataScsiInt"> - <title>libata SCSI translation/emulation</title> -!Edrivers/ata/libata-scsi.c -!Idrivers/ata/libata-scsi.c - </chapter> - - <chapter id="ataExceptions"> - <title>ATA errors and exceptions</title> - - <para> - This chapter tries to identify what error/exception conditions exist - for ATA/ATAPI devices and describe how they should be handled in - implementation-neutral way. - </para> - - <para> - The term 'error' is used to describe conditions where either an - explicit error condition is reported from device or a command has - timed out. - </para> - - <para> - The term 'exception' is either used to describe exceptional - conditions which are not errors (say, power or hotplug events), or - to describe both errors and non-error exceptional conditions. Where - explicit distinction between error and exception is necessary, the - term 'non-error exception' is used. - </para> - - <sect1 id="excat"> - <title>Exception categories</title> - <para> - Exceptions are described primarily with respect to legacy - taskfile + bus master IDE interface. If a controller provides - other better mechanism for error reporting, mapping those into - categories described below shouldn't be difficult. - </para> - - <para> - In the following sections, two recovery actions - reset and - reconfiguring transport - are mentioned. These are described - further in <xref linkend="exrec"/>. - </para> - - <sect2 id="excatHSMviolation"> - <title>HSM violation</title> - <para> - This error is indicated when STATUS value doesn't match HSM - requirement during issuing or execution any ATA/ATAPI command. - </para> - - <itemizedlist> - <title>Examples</title> - - <listitem> - <para> - ATA_STATUS doesn't contain !BSY && DRDY && !DRQ while trying - to issue a command. - </para> - </listitem> - - <listitem> - <para> - !BSY && !DRQ during PIO data transfer. - </para> - </listitem> - - <listitem> - <para> - DRQ on command completion. - </para> - </listitem> - - <listitem> - <para> - !BSY && ERR after CDB transfer starts but before the - last byte of CDB is transferred. ATA/ATAPI standard states - that "The device shall not terminate the PACKET command - with an error before the last byte of the command packet has - been written" in the error outputs description of PACKET - command and the state diagram doesn't include such - transitions. - </para> - </listitem> - - </itemizedlist> - - <para> - In these cases, HSM is violated and not much information - regarding the error can be acquired from STATUS or ERROR - register. IOW, this error can be anything - driver bug, - faulty device, controller and/or cable. - </para> - - <para> - As HSM is violated, reset is necessary to restore known state. - Reconfiguring transport for lower speed might be helpful too - as transmission errors sometimes cause this kind of errors. - </para> - </sect2> - - <sect2 id="excatDevErr"> - <title>ATA/ATAPI device error (non-NCQ / non-CHECK CONDITION)</title> - - <para> - These are errors detected and reported by ATA/ATAPI devices - indicating device problems. For this type of errors, STATUS - and ERROR register values are valid and describe error - condition. Note that some of ATA bus errors are detected by - ATA/ATAPI devices and reported using the same mechanism as - device errors. Those cases are described later in this - section. - </para> - - <para> - For ATA commands, this type of errors are indicated by !BSY - && ERR during command execution and on completion. - </para> - - <para>For ATAPI commands,</para> - - <itemizedlist> - - <listitem> - <para> - !BSY && ERR && ABRT right after issuing PACKET - indicates that PACKET command is not supported and falls in - this category. - </para> - </listitem> - - <listitem> - <para> - !BSY && ERR(==CHK) && !ABRT after the last - byte of CDB is transferred indicates CHECK CONDITION and - doesn't fall in this category. - </para> - </listitem> - - <listitem> - <para> - !BSY && ERR(==CHK) && ABRT after the last byte - of CDB is transferred *probably* indicates CHECK CONDITION and - doesn't fall in this category. - </para> - </listitem> - - </itemizedlist> - - <para> - Of errors detected as above, the following are not ATA/ATAPI - device errors but ATA bus errors and should be handled - according to <xref linkend="excatATAbusErr"/>. - </para> - - <variablelist> - - <varlistentry> - <term>CRC error during data transfer</term> - <listitem> - <para> - This is indicated by ICRC bit in the ERROR register and - means that corruption occurred during data transfer. Up to - ATA/ATAPI-7, the standard specifies that this bit is only - applicable to UDMA transfers but ATA/ATAPI-8 draft revision - 1f says that the bit may be applicable to multiword DMA and - PIO. - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term>ABRT error during data transfer or on completion</term> - <listitem> - <para> - Up to ATA/ATAPI-7, the standard specifies that ABRT could be - set on ICRC errors and on cases where a device is not able - to complete a command. Combined with the fact that MWDMA - and PIO transfer errors aren't allowed to use ICRC bit up to - ATA/ATAPI-7, it seems to imply that ABRT bit alone could - indicate transfer errors. - </para> - <para> - However, ATA/ATAPI-8 draft revision 1f removes the part - that ICRC errors can turn on ABRT. So, this is kind of - gray area. Some heuristics are needed here. - </para> - </listitem> - </varlistentry> - - </variablelist> - - <para> - ATA/ATAPI device errors can be further categorized as follows. - </para> - - <variablelist> - - <varlistentry> - <term>Media errors</term> - <listitem> - <para> - This is indicated by UNC bit in the ERROR register. ATA - devices reports UNC error only after certain number of - retries cannot recover the data, so there's nothing much - else to do other than notifying upper layer. - </para> - <para> - READ and WRITE commands report CHS or LBA of the first - failed sector but ATA/ATAPI standard specifies that the - amount of transferred data on error completion is - indeterminate, so we cannot assume that sectors preceding - the failed sector have been transferred and thus cannot - complete those sectors successfully as SCSI does. - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term>Media changed / media change requested error</term> - <listitem> - <para> - <<TODO: fill here>> - </para> - </listitem> - </varlistentry> - - <varlistentry><term>Address error</term> - <listitem> - <para> - This is indicated by IDNF bit in the ERROR register. - Report to upper layer. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>Other errors</term> - <listitem> - <para> - This can be invalid command or parameter indicated by ABRT - ERROR bit or some other error condition. Note that ABRT - bit can indicate a lot of things including ICRC and Address - errors. Heuristics needed. - </para> - </listitem> - </varlistentry> - - </variablelist> - - <para> - Depending on commands, not all STATUS/ERROR bits are - applicable. These non-applicable bits are marked with - "na" in the output descriptions but up to ATA/ATAPI-7 - no definition of "na" can be found. However, - ATA/ATAPI-8 draft revision 1f describes "N/A" as - follows. - </para> - - <blockquote> - <variablelist> - <varlistentry><term>3.2.3.3a N/A</term> - <listitem> - <para> - A keyword the indicates a field has no defined value in - this standard and should not be checked by the host or - device. N/A fields should be cleared to zero. - </para> - </listitem> - </varlistentry> - </variablelist> - </blockquote> - - <para> - So, it seems reasonable to assume that "na" bits are - cleared to zero by devices and thus need no explicit masking. - </para> - - </sect2> - - <sect2 id="excatATAPIcc"> - <title>ATAPI device CHECK CONDITION</title> - - <para> - ATAPI device CHECK CONDITION error is indicated by set CHK bit - (ERR bit) in the STATUS register after the last byte of CDB is - transferred for a PACKET command. For this kind of errors, - sense data should be acquired to gather information regarding - the errors. REQUEST SENSE packet command should be used to - acquire sense data. - </para> - - <para> - Once sense data is acquired, this type of errors can be - handled similarly to other SCSI errors. Note that sense data - may indicate ATA bus error (e.g. Sense Key 04h HARDWARE ERROR - && ASC/ASCQ 47h/00h SCSI PARITY ERROR). In such - cases, the error should be considered as an ATA bus error and - handled according to <xref linkend="excatATAbusErr"/>. - </para> - - </sect2> - - <sect2 id="excatNCQerr"> - <title>ATA device error (NCQ)</title> - - <para> - NCQ command error is indicated by cleared BSY and set ERR bit - during NCQ command phase (one or more NCQ commands - outstanding). Although STATUS and ERROR registers will - contain valid values describing the error, READ LOG EXT is - required to clear the error condition, determine which command - has failed and acquire more information. - </para> - - <para> - READ LOG EXT Log Page 10h reports which tag has failed and - taskfile register values describing the error. With this - information the failed command can be handled as a normal ATA - command error as in <xref linkend="excatDevErr"/> and all - other in-flight commands must be retried. Note that this - retry should not be counted - it's likely that commands - retried this way would have completed normally if it were not - for the failed command. - </para> - - <para> - Note that ATA bus errors can be reported as ATA device NCQ - errors. This should be handled as described in <xref - linkend="excatATAbusErr"/>. - </para> - - <para> - If READ LOG EXT Log Page 10h fails or reports NQ, we're - thoroughly screwed. This condition should be treated - according to <xref linkend="excatHSMviolation"/>. - </para> - - </sect2> - - <sect2 id="excatATAbusErr"> - <title>ATA bus error</title> - - <para> - ATA bus error means that data corruption occurred during - transmission over ATA bus (SATA or PATA). This type of errors - can be indicated by - </para> - - <itemizedlist> - - <listitem> - <para> - ICRC or ABRT error as described in <xref linkend="excatDevErr"/>. - </para> - </listitem> - - <listitem> - <para> - Controller-specific error completion with error information - indicating transmission error. - </para> - </listitem> - - <listitem> - <para> - On some controllers, command timeout. In this case, there may - be a mechanism to determine that the timeout is due to - transmission error. - </para> - </listitem> - - <listitem> - <para> - Unknown/random errors, timeouts and all sorts of weirdities. - </para> - </listitem> - - </itemizedlist> - - <para> - As described above, transmission errors can cause wide variety - of symptoms ranging from device ICRC error to random device - lockup, and, for many cases, there is no way to tell if an - error condition is due to transmission error or not; - therefore, it's necessary to employ some kind of heuristic - when dealing with errors and timeouts. For example, - encountering repetitive ABRT errors for known supported - command is likely to indicate ATA bus error. - </para> - - <para> - Once it's determined that ATA bus errors have possibly - occurred, lowering ATA bus transmission speed is one of - actions which may alleviate the problem. See <xref - linkend="exrecReconf"/> for more information. - </para> - - </sect2> - - <sect2 id="excatPCIbusErr"> - <title>PCI bus error</title> - - <para> - Data corruption or other failures during transmission over PCI - (or other system bus). For standard BMDMA, this is indicated - by Error bit in the BMDMA Status register. This type of - errors must be logged as it indicates something is very wrong - with the system. Resetting host controller is recommended. - </para> - - </sect2> - - <sect2 id="excatLateCompletion"> - <title>Late completion</title> - - <para> - This occurs when timeout occurs and the timeout handler finds - out that the timed out command has completed successfully or - with error. This is usually caused by lost interrupts. This - type of errors must be logged. Resetting host controller is - recommended. - </para> - - </sect2> - - <sect2 id="excatUnknown"> - <title>Unknown error (timeout)</title> - - <para> - This is when timeout occurs and the command is still - processing or the host and device are in unknown state. When - this occurs, HSM could be in any valid or invalid state. To - bring the device to known state and make it forget about the - timed out command, resetting is necessary. The timed out - command may be retried. - </para> - - <para> - Timeouts can also be caused by transmission errors. Refer to - <xref linkend="excatATAbusErr"/> for more details. - </para> - - </sect2> - - <sect2 id="excatHoplugPM"> - <title>Hotplug and power management exceptions</title> - - <para> - <<TODO: fill here>> - </para> - - </sect2> - - </sect1> - - <sect1 id="exrec"> - <title>EH recovery actions</title> - - <para> - This section discusses several important recovery actions. - </para> - - <sect2 id="exrecClr"> - <title>Clearing error condition</title> - - <para> - Many controllers require its error registers to be cleared by - error handler. Different controllers may have different - requirements. - </para> - - <para> - For SATA, it's strongly recommended to clear at least SError - register during error handling. - </para> - </sect2> - - <sect2 id="exrecRst"> - <title>Reset</title> - - <para> - During EH, resetting is necessary in the following cases. - </para> - - <itemizedlist> - - <listitem> - <para> - HSM is in unknown or invalid state - </para> - </listitem> - - <listitem> - <para> - HBA is in unknown or invalid state - </para> - </listitem> - - <listitem> - <para> - EH needs to make HBA/device forget about in-flight commands - </para> - </listitem> - - <listitem> - <para> - HBA/device behaves weirdly - </para> - </listitem> - - </itemizedlist> - - <para> - Resetting during EH might be a good idea regardless of error - condition to improve EH robustness. Whether to reset both or - either one of HBA and device depends on situation but the - following scheme is recommended. - </para> - - <itemizedlist> - - <listitem> - <para> - When it's known that HBA is in ready state but ATA/ATAPI - device is in unknown state, reset only device. - </para> - </listitem> - - <listitem> - <para> - If HBA is in unknown state, reset both HBA and device. - </para> - </listitem> - - </itemizedlist> - - <para> - HBA resetting is implementation specific. For a controller - complying to taskfile/BMDMA PCI IDE, stopping active DMA - transaction may be sufficient iff BMDMA state is the only HBA - context. But even mostly taskfile/BMDMA PCI IDE complying - controllers may have implementation specific requirements and - mechanism to reset themselves. This must be addressed by - specific drivers. - </para> - - <para> - OTOH, ATA/ATAPI standard describes in detail ways to reset - ATA/ATAPI devices. - </para> - - <variablelist> - - <varlistentry><term>PATA hardware reset</term> - <listitem> - <para> - This is hardware initiated device reset signalled with - asserted PATA RESET- signal. There is no standard way to - initiate hardware reset from software although some - hardware provides registers that allow driver to directly - tweak the RESET- signal. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>Software reset</term> - <listitem> - <para> - This is achieved by turning CONTROL SRST bit on for at - least 5us. Both PATA and SATA support it but, in case of - SATA, this may require controller-specific support as the - second Register FIS to clear SRST should be transmitted - while BSY bit is still set. Note that on PATA, this resets - both master and slave devices on a channel. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>EXECUTE DEVICE DIAGNOSTIC command</term> - <listitem> - <para> - Although ATA/ATAPI standard doesn't describe exactly, EDD - implies some level of resetting, possibly similar level - with software reset. Host-side EDD protocol can be handled - with normal command processing and most SATA controllers - should be able to handle EDD's just like other commands. - As in software reset, EDD affects both devices on a PATA - bus. - </para> - <para> - Although EDD does reset devices, this doesn't suit error - handling as EDD cannot be issued while BSY is set and it's - unclear how it will act when device is in unknown/weird - state. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>ATAPI DEVICE RESET command</term> - <listitem> - <para> - This is very similar to software reset except that reset - can be restricted to the selected device without affecting - the other device sharing the cable. - </para> - </listitem> - </varlistentry> - - <varlistentry><term>SATA phy reset</term> - <listitem> - <para> - This is the preferred way of resetting a SATA device. In - effect, it's identical to PATA hardware reset. Note that - this can be done with the standard SCR Control register. - As such, it's usually easier to implement than software - reset. - </para> - </listitem> - </varlistentry> - - </variablelist> - - <para> - One more thing to consider when resetting devices is that - resetting clears certain configuration parameters and they - need to be set to their previous or newly adjusted values - after reset. - </para> - - <para> - Parameters affected are. - </para> - - <itemizedlist> - - <listitem> - <para> - CHS set up with INITIALIZE DEVICE PARAMETERS (seldom used) - </para> - </listitem> - - <listitem> - <para> - Parameters set with SET FEATURES including transfer mode setting - </para> - </listitem> - - <listitem> - <para> - Block count set with SET MULTIPLE MODE - </para> - </listitem> - - <listitem> - <para> - Other parameters (SET MAX, MEDIA LOCK...) - </para> - </listitem> - - </itemizedlist> - - <para> - ATA/ATAPI standard specifies that some parameters must be - maintained across hardware or software reset, but doesn't - strictly specify all of them. Always reconfiguring needed - parameters after reset is required for robustness. Note that - this also applies when resuming from deep sleep (power-off). - </para> - - <para> - Also, ATA/ATAPI standard requires that IDENTIFY DEVICE / - IDENTIFY PACKET DEVICE is issued after any configuration - parameter is updated or a hardware reset and the result used - for further operation. OS driver is required to implement - revalidation mechanism to support this. - </para> - - </sect2> - - <sect2 id="exrecReconf"> - <title>Reconfigure transport</title> - - <para> - For both PATA and SATA, a lot of corners are cut for cheap - connectors, cables or controllers and it's quite common to see - high transmission error rate. This can be mitigated by - lowering transmission speed. - </para> - - <para> - The following is a possible scheme Jeff Garzik suggested. - </para> - - <blockquote> - <para> - If more than $N (3?) transmission errors happen in 15 minutes, - </para> - <itemizedlist> - <listitem> - <para> - if SATA, decrease SATA PHY speed. if speed cannot be decreased, - </para> - </listitem> - <listitem> - <para> - decrease UDMA xfer speed. if at UDMA0, switch to PIO4, - </para> - </listitem> - <listitem> - <para> - decrease PIO xfer speed. if at PIO3, complain, but continue - </para> - </listitem> - </itemizedlist> - </blockquote> - - </sect2> - - </sect1> - - </chapter> - - <chapter id="PiixInt"> - <title>ata_piix Internals</title> -!Idrivers/ata/ata_piix.c - </chapter> - - <chapter id="SILInt"> - <title>sata_sil Internals</title> -!Idrivers/ata/sata_sil.c - </chapter> - - <chapter id="libataThanks"> - <title>Thanks</title> - <para> - The bulk of the ATA knowledge comes thanks to long conversations with - Andre Hedrick (www.linux-ide.org), and long hours pondering the ATA - and SCSI specifications. - </para> - <para> - Thanks to Alan Cox for pointing out similarities - between SATA and SCSI, and in general for motivation to hack on - libata. - </para> - <para> - libata's device detection - method, ata_pio_devchk, and in general all the early probing was - based on extensive study of Hale Landis's probe/reset code in his - ATADRVR driver (www.ata-atapi.com). - </para> - </chapter> - -</book> diff --git a/Documentation/DocBook/librs.tmpl b/Documentation/DocBook/librs.tmpl deleted file mode 100644 index 94f21361e0ed..000000000000 --- a/Documentation/DocBook/librs.tmpl +++ /dev/null @@ -1,289 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="Reed-Solomon-Library-Guide"> - <bookinfo> - <title>Reed-Solomon Library Programming Interface</title> - - <authorgroup> - <author> - <firstname>Thomas</firstname> - <surname>Gleixner</surname> - <affiliation> - <address> - <email>tglx@linutronix.de</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2004</year> - <holder>Thomas Gleixner</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2 as published by the Free Software Foundation. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="intro"> - <title>Introduction</title> - <para> - The generic Reed-Solomon Library provides encoding, decoding - and error correction functions. - </para> - <para> - Reed-Solomon codes are used in communication and storage - applications to ensure data integrity. - </para> - <para> - This documentation is provided for developers who want to utilize - the functions provided by the library. - </para> - </chapter> - - <chapter id="bugs"> - <title>Known Bugs And Assumptions</title> - <para> - None. - </para> - </chapter> - - <chapter id="usage"> - <title>Usage</title> - <para> - This chapter provides examples of how to use the library. - </para> - <sect1> - <title>Initializing</title> - <para> - The init function init_rs returns a pointer to an - rs decoder structure, which holds the necessary - information for encoding, decoding and error correction - with the given polynomial. It either uses an existing - matching decoder or creates a new one. On creation all - the lookup tables for fast en/decoding are created. - The function may take a while, so make sure not to - call it in critical code paths. - </para> - <programlisting> -/* the Reed Solomon control structure */ -static struct rs_control *rs_decoder; - -/* Symbolsize is 10 (bits) - * Primitive polynomial is x^10+x^3+1 - * first consecutive root is 0 - * primitive element to generate roots = 1 - * generator polynomial degree (number of roots) = 6 - */ -rs_decoder = init_rs (10, 0x409, 0, 1, 6); - </programlisting> - </sect1> - <sect1> - <title>Encoding</title> - <para> - The encoder calculates the Reed-Solomon code over - the given data length and stores the result in - the parity buffer. Note that the parity buffer must - be initialized before calling the encoder. - </para> - <para> - The expanded data can be inverted on the fly by - providing a non-zero inversion mask. The expanded data is - XOR'ed with the mask. This is used e.g. for FLASH - ECC, where the all 0xFF is inverted to an all 0x00. - The Reed-Solomon code for all 0x00 is all 0x00. The - code is inverted before storing to FLASH so it is 0xFF - too. This prevents that reading from an erased FLASH - results in ECC errors. - </para> - <para> - The databytes are expanded to the given symbol size - on the fly. There is no support for encoding continuous - bitstreams with a symbol size != 8 at the moment. If - it is necessary it should be not a big deal to implement - such functionality. - </para> - <programlisting> -/* Parity buffer. Size = number of roots */ -uint16_t par[6]; -/* Initialize the parity buffer */ -memset(par, 0, sizeof(par)); -/* Encode 512 byte in data8. Store parity in buffer par */ -encode_rs8 (rs_decoder, data8, 512, par, 0); - </programlisting> - </sect1> - <sect1> - <title>Decoding</title> - <para> - The decoder calculates the syndrome over - the given data length and the received parity symbols - and corrects errors in the data. - </para> - <para> - If a syndrome is available from a hardware decoder - then the syndrome calculation is skipped. - </para> - <para> - The correction of the data buffer can be suppressed - by providing a correction pattern buffer and an error - location buffer to the decoder. The decoder stores the - calculated error location and the correction bitmask - in the given buffers. This is useful for hardware - decoders which use a weird bit ordering scheme. - </para> - <para> - The databytes are expanded to the given symbol size - on the fly. There is no support for decoding continuous - bitstreams with a symbolsize != 8 at the moment. If - it is necessary it should be not a big deal to implement - such functionality. - </para> - - <sect2> - <title> - Decoding with syndrome calculation, direct data correction - </title> - <programlisting> -/* Parity buffer. Size = number of roots */ -uint16_t par[6]; -uint8_t data[512]; -int numerr; -/* Receive data */ -..... -/* Receive parity */ -..... -/* Decode 512 byte in data8.*/ -numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL); - </programlisting> - </sect2> - - <sect2> - <title> - Decoding with syndrome given by hardware decoder, direct data correction - </title> - <programlisting> -/* Parity buffer. Size = number of roots */ -uint16_t par[6], syn[6]; -uint8_t data[512]; -int numerr; -/* Receive data */ -..... -/* Receive parity */ -..... -/* Get syndrome from hardware decoder */ -..... -/* Decode 512 byte in data8.*/ -numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL); - </programlisting> - </sect2> - - <sect2> - <title> - Decoding with syndrome given by hardware decoder, no direct data correction. - </title> - <para> - Note: It's not necessary to give data and received parity to the decoder. - </para> - <programlisting> -/* Parity buffer. Size = number of roots */ -uint16_t par[6], syn[6], corr[8]; -uint8_t data[512]; -int numerr, errpos[8]; -/* Receive data */ -..... -/* Receive parity */ -..... -/* Get syndrome from hardware decoder */ -..... -/* Decode 512 byte in data8.*/ -numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr); -for (i = 0; i < numerr; i++) { - do_error_correction_in_your_buffer(errpos[i], corr[i]); -} - </programlisting> - </sect2> - </sect1> - <sect1> - <title>Cleanup</title> - <para> - The function free_rs frees the allocated resources, - if the caller is the last user of the decoder. - </para> - <programlisting> -/* Release resources */ -free_rs(rs_decoder); - </programlisting> - </sect1> - - </chapter> - - <chapter id="structs"> - <title>Structures</title> - <para> - This chapter contains the autogenerated documentation of the structures which are - used in the Reed-Solomon Library and are relevant for a developer. - </para> -!Iinclude/linux/rslib.h - </chapter> - - <chapter id="pubfunctions"> - <title>Public Functions Provided</title> - <para> - This chapter contains the autogenerated documentation of the Reed-Solomon functions - which are exported. - </para> -!Elib/reed_solomon/reed_solomon.c - </chapter> - - <chapter id="credits"> - <title>Credits</title> - <para> - The library code for encoding and decoding was written by Phil Karn. - </para> - <programlisting> - Copyright 2002, Phil Karn, KA9Q - May be used under the terms of the GNU General Public License (GPL) - </programlisting> - <para> - The wrapper functions and interfaces are written by Thomas Gleixner. - </para> - <para> - Many users have provided bugfixes, improvements and helping hands for testing. - Thanks a lot. - </para> - <para> - The following people have contributed to this document: - </para> - <para> - Thomas Gleixner<email>tglx@linutronix.de</email> - </para> - </chapter> -</book> diff --git a/Documentation/DocBook/lsm.tmpl b/Documentation/DocBook/lsm.tmpl deleted file mode 100644 index fe7664ce9667..000000000000 --- a/Documentation/DocBook/lsm.tmpl +++ /dev/null @@ -1,265 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<article class="whitepaper" id="LinuxSecurityModule" lang="en"> - <articleinfo> - <title>Linux Security Modules: General Security Hooks for Linux</title> - <authorgroup> - <author> - <firstname>Stephen</firstname> - <surname>Smalley</surname> - <affiliation> - <orgname>NAI Labs</orgname> - <address><email>ssmalley@nai.com</email></address> - </affiliation> - </author> - <author> - <firstname>Timothy</firstname> - <surname>Fraser</surname> - <affiliation> - <orgname>NAI Labs</orgname> - <address><email>tfraser@nai.com</email></address> - </affiliation> - </author> - <author> - <firstname>Chris</firstname> - <surname>Vance</surname> - <affiliation> - <orgname>NAI Labs</orgname> - <address><email>cvance@nai.com</email></address> - </affiliation> - </author> - </authorgroup> - </articleinfo> - -<sect1 id="Introduction"><title>Introduction</title> - -<para> -In March 2001, the National Security Agency (NSA) gave a presentation -about Security-Enhanced Linux (SELinux) at the 2.5 Linux Kernel -Summit. SELinux is an implementation of flexible and fine-grained -nondiscretionary access controls in the Linux kernel, originally -implemented as its own particular kernel patch. Several other -security projects (e.g. RSBAC, Medusa) have also developed flexible -access control architectures for the Linux kernel, and various -projects have developed particular access control models for Linux -(e.g. LIDS, DTE, SubDomain). Each project has developed and -maintained its own kernel patch to support its security needs. -</para> - -<para> -In response to the NSA presentation, Linus Torvalds made a set of -remarks that described a security framework he would be willing to -consider for inclusion in the mainstream Linux kernel. He described a -general framework that would provide a set of security hooks to -control operations on kernel objects and a set of opaque security -fields in kernel data structures for maintaining security attributes. -This framework could then be used by loadable kernel modules to -implement any desired model of security. Linus also suggested the -possibility of migrating the Linux capabilities code into such a -module. -</para> - -<para> -The Linux Security Modules (LSM) project was started by WireX to -develop such a framework. LSM is a joint development effort by -several security projects, including Immunix, SELinux, SGI and Janus, -and several individuals, including Greg Kroah-Hartman and James -Morris, to develop a Linux kernel patch that implements this -framework. The patch is currently tracking the 2.4 series and is -targeted for integration into the 2.5 development series. This -technical report provides an overview of the framework and the example -capabilities security module provided by the LSM kernel patch. -</para> - -</sect1> - -<sect1 id="framework"><title>LSM Framework</title> - -<para> -The LSM kernel patch provides a general kernel framework to support -security modules. In particular, the LSM framework is primarily -focused on supporting access control modules, although future -development is likely to address other security needs such as -auditing. By itself, the framework does not provide any additional -security; it merely provides the infrastructure to support security -modules. The LSM kernel patch also moves most of the capabilities -logic into an optional security module, with the system defaulting -to the traditional superuser logic. This capabilities module -is discussed further in <xref linkend="cap"/>. -</para> - -<para> -The LSM kernel patch adds security fields to kernel data structures -and inserts calls to hook functions at critical points in the kernel -code to manage the security fields and to perform access control. It -also adds functions for registering and unregistering security -modules, and adds a general <function>security</function> system call -to support new system calls for security-aware applications. -</para> - -<para> -The LSM security fields are simply <type>void*</type> pointers. For -process and program execution security information, security fields -were added to <structname>struct task_struct</structname> and -<structname>struct linux_binprm</structname>. For filesystem security -information, a security field was added to -<structname>struct super_block</structname>. For pipe, file, and socket -security information, security fields were added to -<structname>struct inode</structname> and -<structname>struct file</structname>. For packet and network device security -information, security fields were added to -<structname>struct sk_buff</structname> and -<structname>struct net_device</structname>. For System V IPC security -information, security fields were added to -<structname>struct kern_ipc_perm</structname> and -<structname>struct msg_msg</structname>; additionally, the definitions -for <structname>struct msg_msg</structname>, <structname>struct -msg_queue</structname>, and <structname>struct -shmid_kernel</structname> were moved to header files -(<filename>include/linux/msg.h</filename> and -<filename>include/linux/shm.h</filename> as appropriate) to allow -the security modules to use these definitions. -</para> - -<para> -Each LSM hook is a function pointer in a global table, -security_ops. This table is a -<structname>security_operations</structname> structure as defined by -<filename>include/linux/security.h</filename>. Detailed documentation -for each hook is included in this header file. At present, this -structure consists of a collection of substructures that group related -hooks based on the kernel object (e.g. task, inode, file, sk_buff, -etc) as well as some top-level hook function pointers for system -operations. This structure is likely to be flattened in the future -for performance. The placement of the hook calls in the kernel code -is described by the "called:" lines in the per-hook documentation in -the header file. The hook calls can also be easily found in the -kernel code by looking for the string "security_ops->". - -</para> - -<para> -Linus mentioned per-process security hooks in his original remarks as a -possible alternative to global security hooks. However, if LSM were -to start from the perspective of per-process hooks, then the base -framework would have to deal with how to handle operations that -involve multiple processes (e.g. kill), since each process might have -its own hook for controlling the operation. This would require a -general mechanism for composing hooks in the base framework. -Additionally, LSM would still need global hooks for operations that -have no process context (e.g. network input operations). -Consequently, LSM provides global security hooks, but a security -module is free to implement per-process hooks (where that makes sense) -by storing a security_ops table in each process' security field and -then invoking these per-process hooks from the global hooks. -The problem of composition is thus deferred to the module. -</para> - -<para> -The global security_ops table is initialized to a set of hook -functions provided by a dummy security module that provides -traditional superuser logic. A <function>register_security</function> -function (in <filename>security/security.c</filename>) is provided to -allow a security module to set security_ops to refer to its own hook -functions, and an <function>unregister_security</function> function is -provided to revert security_ops to the dummy module hooks. This -mechanism is used to set the primary security module, which is -responsible for making the final decision for each hook. -</para> - -<para> -LSM also provides a simple mechanism for stacking additional security -modules with the primary security module. It defines -<function>register_security</function> and -<function>unregister_security</function> hooks in the -<structname>security_operations</structname> structure and provides -<function>mod_reg_security</function> and -<function>mod_unreg_security</function> functions that invoke these -hooks after performing some sanity checking. A security module can -call these functions in order to stack with other modules. However, -the actual details of how this stacking is handled are deferred to the -module, which can implement these hooks in any way it wishes -(including always returning an error if it does not wish to support -stacking). In this manner, LSM again defers the problem of -composition to the module. -</para> - -<para> -Although the LSM hooks are organized into substructures based on -kernel object, all of the hooks can be viewed as falling into two -major categories: hooks that are used to manage the security fields -and hooks that are used to perform access control. Examples of the -first category of hooks include the -<function>alloc_security</function> and -<function>free_security</function> hooks defined for each kernel data -structure that has a security field. These hooks are used to allocate -and free security structures for kernel objects. The first category -of hooks also includes hooks that set information in the security -field after allocation, such as the <function>post_lookup</function> -hook in <structname>struct inode_security_ops</structname>. This hook -is used to set security information for inodes after successful lookup -operations. An example of the second category of hooks is the -<function>permission</function> hook in -<structname>struct inode_security_ops</structname>. This hook checks -permission when accessing an inode. -</para> - -</sect1> - -<sect1 id="cap"><title>LSM Capabilities Module</title> - -<para> -The LSM kernel patch moves most of the existing POSIX.1e capabilities -logic into an optional security module stored in the file -<filename>security/capability.c</filename>. This change allows -users who do not want to use capabilities to omit this code entirely -from their kernel, instead using the dummy module for traditional -superuser logic or any other module that they desire. This change -also allows the developers of the capabilities logic to maintain and -enhance their code more freely, without needing to integrate patches -back into the base kernel. -</para> - -<para> -In addition to moving the capabilities logic, the LSM kernel patch -could move the capability-related fields from the kernel data -structures into the new security fields managed by the security -modules. However, at present, the LSM kernel patch leaves the -capability fields in the kernel data structures. In his original -remarks, Linus suggested that this might be preferable so that other -security modules can be easily stacked with the capabilities module -without needing to chain multiple security structures on the security field. -It also avoids imposing extra overhead on the capabilities module -to manage the security fields. However, the LSM framework could -certainly support such a move if it is determined to be desirable, -with only a few additional changes described below. -</para> - -<para> -At present, the capabilities logic for computing process capabilities -on <function>execve</function> and <function>set*uid</function>, -checking capabilities for a particular process, saving and checking -capabilities for netlink messages, and handling the -<function>capget</function> and <function>capset</function> system -calls have been moved into the capabilities module. There are still a -few locations in the base kernel where capability-related fields are -directly examined or modified, but the current version of the LSM -patch does allow a security module to completely replace the -assignment and testing of capabilities. These few locations would -need to be changed if the capability-related fields were moved into -the security field. The following is a list of known locations that -still perform such direct examination or modification of -capability-related fields: -<itemizedlist> -<listitem><para><filename>fs/open.c</filename>:<function>sys_access</function></para></listitem> -<listitem><para><filename>fs/lockd/host.c</filename>:<function>nlm_bind_host</function></para></listitem> -<listitem><para><filename>fs/nfsd/auth.c</filename>:<function>nfsd_setuser</function></para></listitem> -<listitem><para><filename>fs/proc/array.c</filename>:<function>task_cap</function></para></listitem> -</itemizedlist> -</para> - -</sect1> - -</article> diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl deleted file mode 100644 index b442921bca54..000000000000 --- a/Documentation/DocBook/mtdnand.tmpl +++ /dev/null @@ -1,1291 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="MTD-NAND-Guide"> - <bookinfo> - <title>MTD NAND Driver Programming Interface</title> - - <authorgroup> - <author> - <firstname>Thomas</firstname> - <surname>Gleixner</surname> - <affiliation> - <address> - <email>tglx@linutronix.de</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2004</year> - <holder>Thomas Gleixner</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2 as published by the Free Software Foundation. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="intro"> - <title>Introduction</title> - <para> - The generic NAND driver supports almost all NAND and AG-AND based - chips and connects them to the Memory Technology Devices (MTD) - subsystem of the Linux Kernel. - </para> - <para> - This documentation is provided for developers who want to implement - board drivers or filesystem drivers suitable for NAND devices. - </para> - </chapter> - - <chapter id="bugs"> - <title>Known Bugs And Assumptions</title> - <para> - None. - </para> - </chapter> - - <chapter id="dochints"> - <title>Documentation hints</title> - <para> - The function and structure docs are autogenerated. Each function and - struct member has a short description which is marked with an [XXX] identifier. - The following chapters explain the meaning of those identifiers. - </para> - <sect1 id="Function_identifiers_XXX"> - <title>Function identifiers [XXX]</title> - <para> - The functions are marked with [XXX] identifiers in the short - comment. The identifiers explain the usage and scope of the - functions. Following identifiers are used: - </para> - <itemizedlist> - <listitem><para> - [MTD Interface]</para><para> - These functions provide the interface to the MTD kernel API. - They are not replaceable and provide functionality - which is complete hardware independent. - </para></listitem> - <listitem><para> - [NAND Interface]</para><para> - These functions are exported and provide the interface to the NAND kernel API. - </para></listitem> - <listitem><para> - [GENERIC]</para><para> - Generic functions are not replaceable and provide functionality - which is complete hardware independent. - </para></listitem> - <listitem><para> - [DEFAULT]</para><para> - Default functions provide hardware related functionality which is suitable - for most of the implementations. These functions can be replaced by the - board driver if necessary. Those functions are called via pointers in the - NAND chip description structure. The board driver can set the functions which - should be replaced by board dependent functions before calling nand_scan(). - If the function pointer is NULL on entry to nand_scan() then the pointer - is set to the default function which is suitable for the detected chip type. - </para></listitem> - </itemizedlist> - </sect1> - <sect1 id="Struct_member_identifiers_XXX"> - <title>Struct member identifiers [XXX]</title> - <para> - The struct members are marked with [XXX] identifiers in the - comment. The identifiers explain the usage and scope of the - members. Following identifiers are used: - </para> - <itemizedlist> - <listitem><para> - [INTERN]</para><para> - These members are for NAND driver internal use only and must not be - modified. Most of these values are calculated from the chip geometry - information which is evaluated during nand_scan(). - </para></listitem> - <listitem><para> - [REPLACEABLE]</para><para> - Replaceable members hold hardware related functions which can be - provided by the board driver. The board driver can set the functions which - should be replaced by board dependent functions before calling nand_scan(). - If the function pointer is NULL on entry to nand_scan() then the pointer - is set to the default function which is suitable for the detected chip type. - </para></listitem> - <listitem><para> - [BOARDSPECIFIC]</para><para> - Board specific members hold hardware related information which must - be provided by the board driver. The board driver must set the function - pointers and datafields before calling nand_scan(). - </para></listitem> - <listitem><para> - [OPTIONAL]</para><para> - Optional members can hold information relevant for the board driver. The - generic NAND driver code does not use this information. - </para></listitem> - </itemizedlist> - </sect1> - </chapter> - - <chapter id="basicboarddriver"> - <title>Basic board driver</title> - <para> - For most boards it will be sufficient to provide just the - basic functions and fill out some really board dependent - members in the nand chip description structure. - </para> - <sect1 id="Basic_defines"> - <title>Basic defines</title> - <para> - At least you have to provide a nand_chip structure - and a storage for the ioremap'ed chip address. - You can allocate the nand_chip structure using - kmalloc or you can allocate it statically. - The NAND chip structure embeds an mtd structure - which will be registered to the MTD subsystem. - You can extract a pointer to the mtd structure - from a nand_chip pointer using the nand_to_mtd() - helper. - </para> - <para> - Kmalloc based example - </para> - <programlisting> -static struct mtd_info *board_mtd; -static void __iomem *baseaddr; - </programlisting> - <para> - Static example - </para> - <programlisting> -static struct nand_chip board_chip; -static void __iomem *baseaddr; - </programlisting> - </sect1> - <sect1 id="Partition_defines"> - <title>Partition defines</title> - <para> - If you want to divide your device into partitions, then - define a partitioning scheme suitable to your board. - </para> - <programlisting> -#define NUM_PARTITIONS 2 -static struct mtd_partition partition_info[] = { - { .name = "Flash partition 1", - .offset = 0, - .size = 8 * 1024 * 1024 }, - { .name = "Flash partition 2", - .offset = MTDPART_OFS_NEXT, - .size = MTDPART_SIZ_FULL }, -}; - </programlisting> - </sect1> - <sect1 id="Hardware_control_functions"> - <title>Hardware control function</title> - <para> - The hardware control function provides access to the - control pins of the NAND chip(s). - The access can be done by GPIO pins or by address lines. - If you use address lines, make sure that the timing - requirements are met. - </para> - <para> - <emphasis>GPIO based example</emphasis> - </para> - <programlisting> -static void board_hwcontrol(struct mtd_info *mtd, int cmd) -{ - switch(cmd){ - case NAND_CTL_SETCLE: /* Set CLE pin high */ break; - case NAND_CTL_CLRCLE: /* Set CLE pin low */ break; - case NAND_CTL_SETALE: /* Set ALE pin high */ break; - case NAND_CTL_CLRALE: /* Set ALE pin low */ break; - case NAND_CTL_SETNCE: /* Set nCE pin low */ break; - case NAND_CTL_CLRNCE: /* Set nCE pin high */ break; - } -} - </programlisting> - <para> - <emphasis>Address lines based example.</emphasis> It's assumed that the - nCE pin is driven by a chip select decoder. - </para> - <programlisting> -static void board_hwcontrol(struct mtd_info *mtd, int cmd) -{ - struct nand_chip *this = mtd_to_nand(mtd); - switch(cmd){ - case NAND_CTL_SETCLE: this->IO_ADDR_W |= CLE_ADRR_BIT; break; - case NAND_CTL_CLRCLE: this->IO_ADDR_W &= ~CLE_ADRR_BIT; break; - case NAND_CTL_SETALE: this->IO_ADDR_W |= ALE_ADRR_BIT; break; - case NAND_CTL_CLRALE: this->IO_ADDR_W &= ~ALE_ADRR_BIT; break; - } -} - </programlisting> - </sect1> - <sect1 id="Device_ready_function"> - <title>Device ready function</title> - <para> - If the hardware interface has the ready busy pin of the NAND chip connected to a - GPIO or other accessible I/O pin, this function is used to read back the state of the - pin. The function has no arguments and should return 0, if the device is busy (R/B pin - is low) and 1, if the device is ready (R/B pin is high). - If the hardware interface does not give access to the ready busy pin, then - the function must not be defined and the function pointer this->dev_ready is set to NULL. - </para> - </sect1> - <sect1 id="Init_function"> - <title>Init function</title> - <para> - The init function allocates memory and sets up all the board - specific parameters and function pointers. When everything - is set up nand_scan() is called. This function tries to - detect and identify then chip. If a chip is found all the - internal data fields are initialized accordingly. - The structure(s) have to be zeroed out first and then filled with the necessary - information about the device. - </para> - <programlisting> -static int __init board_init (void) -{ - struct nand_chip *this; - int err = 0; - - /* Allocate memory for MTD device structure and private data */ - this = kzalloc(sizeof(struct nand_chip), GFP_KERNEL); - if (!this) { - printk ("Unable to allocate NAND MTD device structure.\n"); - err = -ENOMEM; - goto out; - } - - board_mtd = nand_to_mtd(this); - - /* map physical address */ - baseaddr = ioremap(CHIP_PHYSICAL_ADDRESS, 1024); - if (!baseaddr) { - printk("Ioremap to access NAND chip failed\n"); - err = -EIO; - goto out_mtd; - } - - /* Set address of NAND IO lines */ - this->IO_ADDR_R = baseaddr; - this->IO_ADDR_W = baseaddr; - /* Reference hardware control function */ - this->hwcontrol = board_hwcontrol; - /* Set command delay time, see datasheet for correct value */ - this->chip_delay = CHIP_DEPENDEND_COMMAND_DELAY; - /* Assign the device ready function, if available */ - this->dev_ready = board_dev_ready; - this->eccmode = NAND_ECC_SOFT; - - /* Scan to find existence of the device */ - if (nand_scan (board_mtd, 1)) { - err = -ENXIO; - goto out_ior; - } - - add_mtd_partitions(board_mtd, partition_info, NUM_PARTITIONS); - goto out; - -out_ior: - iounmap(baseaddr); -out_mtd: - kfree (this); -out: - return err; -} -module_init(board_init); - </programlisting> - </sect1> - <sect1 id="Exit_function"> - <title>Exit function</title> - <para> - The exit function is only necessary if the driver is - compiled as a module. It releases all resources which - are held by the chip driver and unregisters the partitions - in the MTD layer. - </para> - <programlisting> -#ifdef MODULE -static void __exit board_cleanup (void) -{ - /* Release resources, unregister device */ - nand_release (board_mtd); - - /* unmap physical address */ - iounmap(baseaddr); - - /* Free the MTD device structure */ - kfree (mtd_to_nand(board_mtd)); -} -module_exit(board_cleanup); -#endif - </programlisting> - </sect1> - </chapter> - - <chapter id="boarddriversadvanced"> - <title>Advanced board driver functions</title> - <para> - This chapter describes the advanced functionality of the NAND - driver. For a list of functions which can be overridden by the board - driver see the documentation of the nand_chip structure. - </para> - <sect1 id="Multiple_chip_control"> - <title>Multiple chip control</title> - <para> - The nand driver can control chip arrays. Therefore the - board driver must provide an own select_chip function. This - function must (de)select the requested chip. - The function pointer in the nand_chip structure must - be set before calling nand_scan(). The maxchip parameter - of nand_scan() defines the maximum number of chips to - scan for. Make sure that the select_chip function can - handle the requested number of chips. - </para> - <para> - The nand driver concatenates the chips to one virtual - chip and provides this virtual chip to the MTD layer. - </para> - <para> - <emphasis>Note: The driver can only handle linear chip arrays - of equally sized chips. There is no support for - parallel arrays which extend the buswidth.</emphasis> - </para> - <para> - <emphasis>GPIO based example</emphasis> - </para> - <programlisting> -static void board_select_chip (struct mtd_info *mtd, int chip) -{ - /* Deselect all chips, set all nCE pins high */ - GPIO(BOARD_NAND_NCE) |= 0xff; - if (chip >= 0) - GPIO(BOARD_NAND_NCE) &= ~ (1 << chip); -} - </programlisting> - <para> - <emphasis>Address lines based example.</emphasis> - Its assumed that the nCE pins are connected to an - address decoder. - </para> - <programlisting> -static void board_select_chip (struct mtd_info *mtd, int chip) -{ - struct nand_chip *this = mtd_to_nand(mtd); - - /* Deselect all chips */ - this->IO_ADDR_R &= ~BOARD_NAND_ADDR_MASK; - this->IO_ADDR_W &= ~BOARD_NAND_ADDR_MASK; - switch (chip) { - case 0: - this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIP0; - this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIP0; - break; - .... - case n: - this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIPn; - this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIPn; - break; - } -} - </programlisting> - </sect1> - <sect1 id="Hardware_ECC_support"> - <title>Hardware ECC support</title> - <sect2 id="Functions_and_constants"> - <title>Functions and constants</title> - <para> - The nand driver supports three different types of - hardware ECC. - <itemizedlist> - <listitem><para>NAND_ECC_HW3_256</para><para> - Hardware ECC generator providing 3 bytes ECC per - 256 byte. - </para> </listitem> - <listitem><para>NAND_ECC_HW3_512</para><para> - Hardware ECC generator providing 3 bytes ECC per - 512 byte. - </para> </listitem> - <listitem><para>NAND_ECC_HW6_512</para><para> - Hardware ECC generator providing 6 bytes ECC per - 512 byte. - </para> </listitem> - <listitem><para>NAND_ECC_HW8_512</para><para> - Hardware ECC generator providing 6 bytes ECC per - 512 byte. - </para> </listitem> - </itemizedlist> - If your hardware generator has a different functionality - add it at the appropriate place in nand_base.c - </para> - <para> - The board driver must provide following functions: - <itemizedlist> - <listitem><para>enable_hwecc</para><para> - This function is called before reading / writing to - the chip. Reset or initialize the hardware generator - in this function. The function is called with an - argument which let you distinguish between read - and write operations. - </para> </listitem> - <listitem><para>calculate_ecc</para><para> - This function is called after read / write from / to - the chip. Transfer the ECC from the hardware to - the buffer. If the option NAND_HWECC_SYNDROME is set - then the function is only called on write. See below. - </para> </listitem> - <listitem><para>correct_data</para><para> - In case of an ECC error this function is called for - error detection and correction. Return 1 respectively 2 - in case the error can be corrected. If the error is - not correctable return -1. If your hardware generator - matches the default algorithm of the nand_ecc software - generator then use the correction function provided - by nand_ecc instead of implementing duplicated code. - </para> </listitem> - </itemizedlist> - </para> - </sect2> - <sect2 id="Hardware_ECC_with_syndrome_calculation"> - <title>Hardware ECC with syndrome calculation</title> - <para> - Many hardware ECC implementations provide Reed-Solomon - codes and calculate an error syndrome on read. The syndrome - must be converted to a standard Reed-Solomon syndrome - before calling the error correction code in the generic - Reed-Solomon library. - </para> - <para> - The ECC bytes must be placed immediately after the data - bytes in order to make the syndrome generator work. This - is contrary to the usual layout used by software ECC. The - separation of data and out of band area is not longer - possible. The nand driver code handles this layout and - the remaining free bytes in the oob area are managed by - the autoplacement code. Provide a matching oob-layout - in this case. See rts_from4.c and diskonchip.c for - implementation reference. In those cases we must also - use bad block tables on FLASH, because the ECC layout is - interfering with the bad block marker positions. - See bad block table support for details. - </para> - </sect2> - </sect1> - <sect1 id="Bad_Block_table_support"> - <title>Bad block table support</title> - <para> - Most NAND chips mark the bad blocks at a defined - position in the spare area. Those blocks must - not be erased under any circumstances as the bad - block information would be lost. - It is possible to check the bad block mark each - time when the blocks are accessed by reading the - spare area of the first page in the block. This - is time consuming so a bad block table is used. - </para> - <para> - The nand driver supports various types of bad block - tables. - <itemizedlist> - <listitem><para>Per device</para><para> - The bad block table contains all bad block information - of the device which can consist of multiple chips. - </para> </listitem> - <listitem><para>Per chip</para><para> - A bad block table is used per chip and contains the - bad block information for this particular chip. - </para> </listitem> - <listitem><para>Fixed offset</para><para> - The bad block table is located at a fixed offset - in the chip (device). This applies to various - DiskOnChip devices. - </para> </listitem> - <listitem><para>Automatic placed</para><para> - The bad block table is automatically placed and - detected either at the end or at the beginning - of a chip (device) - </para> </listitem> - <listitem><para>Mirrored tables</para><para> - The bad block table is mirrored on the chip (device) to - allow updates of the bad block table without data loss. - </para> </listitem> - </itemizedlist> - </para> - <para> - nand_scan() calls the function nand_default_bbt(). - nand_default_bbt() selects appropriate default - bad block table descriptors depending on the chip information - which was retrieved by nand_scan(). - </para> - <para> - The standard policy is scanning the device for bad - blocks and build a ram based bad block table which - allows faster access than always checking the - bad block information on the flash chip itself. - </para> - <sect2 id="Flash_based_tables"> - <title>Flash based tables</title> - <para> - It may be desired or necessary to keep a bad block table in FLASH. - For AG-AND chips this is mandatory, as they have no factory marked - bad blocks. They have factory marked good blocks. The marker pattern - is erased when the block is erased to be reused. So in case of - powerloss before writing the pattern back to the chip this block - would be lost and added to the bad blocks. Therefore we scan the - chip(s) when we detect them the first time for good blocks and - store this information in a bad block table before erasing any - of the blocks. - </para> - <para> - The blocks in which the tables are stored are protected against - accidental access by marking them bad in the memory bad block - table. The bad block table management functions are allowed - to circumvent this protection. - </para> - <para> - The simplest way to activate the FLASH based bad block table support - is to set the option NAND_BBT_USE_FLASH in the bbt_option field of - the nand chip structure before calling nand_scan(). For AG-AND - chips is this done by default. - This activates the default FLASH based bad block table functionality - of the NAND driver. The default bad block table options are - <itemizedlist> - <listitem><para>Store bad block table per chip</para></listitem> - <listitem><para>Use 2 bits per block</para></listitem> - <listitem><para>Automatic placement at the end of the chip</para></listitem> - <listitem><para>Use mirrored tables with version numbers</para></listitem> - <listitem><para>Reserve 4 blocks at the end of the chip</para></listitem> - </itemizedlist> - </para> - </sect2> - <sect2 id="User_defined_tables"> - <title>User defined tables</title> - <para> - User defined tables are created by filling out a - nand_bbt_descr structure and storing the pointer in the - nand_chip structure member bbt_td before calling nand_scan(). - If a mirror table is necessary a second structure must be - created and a pointer to this structure must be stored - in bbt_md inside the nand_chip structure. If the bbt_md - member is set to NULL then only the main table is used - and no scan for the mirrored table is performed. - </para> - <para> - The most important field in the nand_bbt_descr structure - is the options field. The options define most of the - table properties. Use the predefined constants from - nand.h to define the options. - <itemizedlist> - <listitem><para>Number of bits per block</para> - <para>The supported number of bits is 1, 2, 4, 8.</para></listitem> - <listitem><para>Table per chip</para> - <para>Setting the constant NAND_BBT_PERCHIP selects that - a bad block table is managed for each chip in a chip array. - If this option is not set then a per device bad block table - is used.</para></listitem> - <listitem><para>Table location is absolute</para> - <para>Use the option constant NAND_BBT_ABSPAGE and - define the absolute page number where the bad block - table starts in the field pages. If you have selected bad block - tables per chip and you have a multi chip array then the start page - must be given for each chip in the chip array. Note: there is no scan - for a table ident pattern performed, so the fields - pattern, veroffs, offs, len can be left uninitialized</para></listitem> - <listitem><para>Table location is automatically detected</para> - <para>The table can either be located in the first or the last good - blocks of the chip (device). Set NAND_BBT_LASTBLOCK to place - the bad block table at the end of the chip (device). The - bad block tables are marked and identified by a pattern which - is stored in the spare area of the first page in the block which - holds the bad block table. Store a pointer to the pattern - in the pattern field. Further the length of the pattern has to be - stored in len and the offset in the spare area must be given - in the offs member of the nand_bbt_descr structure. For mirrored - bad block tables different patterns are mandatory.</para></listitem> - <listitem><para>Table creation</para> - <para>Set the option NAND_BBT_CREATE to enable the table creation - if no table can be found during the scan. Usually this is done only - once if a new chip is found. </para></listitem> - <listitem><para>Table write support</para> - <para>Set the option NAND_BBT_WRITE to enable the table write support. - This allows the update of the bad block table(s) in case a block has - to be marked bad due to wear. The MTD interface function block_markbad - is calling the update function of the bad block table. If the write - support is enabled then the table is updated on FLASH.</para> - <para> - Note: Write support should only be enabled for mirrored tables with - version control. - </para></listitem> - <listitem><para>Table version control</para> - <para>Set the option NAND_BBT_VERSION to enable the table version control. - It's highly recommended to enable this for mirrored tables with write - support. It makes sure that the risk of losing the bad block - table information is reduced to the loss of the information about the - one worn out block which should be marked bad. The version is stored in - 4 consecutive bytes in the spare area of the device. The position of - the version number is defined by the member veroffs in the bad block table - descriptor.</para></listitem> - <listitem><para>Save block contents on write</para> - <para> - In case that the block which holds the bad block table does contain - other useful information, set the option NAND_BBT_SAVECONTENT. When - the bad block table is written then the whole block is read the bad - block table is updated and the block is erased and everything is - written back. If this option is not set only the bad block table - is written and everything else in the block is ignored and erased. - </para></listitem> - <listitem><para>Number of reserved blocks</para> - <para> - For automatic placement some blocks must be reserved for - bad block table storage. The number of reserved blocks is defined - in the maxblocks member of the bad block table description structure. - Reserving 4 blocks for mirrored tables should be a reasonable number. - This also limits the number of blocks which are scanned for the bad - block table ident pattern. - </para></listitem> - </itemizedlist> - </para> - </sect2> - </sect1> - <sect1 id="Spare_area_placement"> - <title>Spare area (auto)placement</title> - <para> - The nand driver implements different possibilities for - placement of filesystem data in the spare area, - <itemizedlist> - <listitem><para>Placement defined by fs driver</para></listitem> - <listitem><para>Automatic placement</para></listitem> - </itemizedlist> - The default placement function is automatic placement. The - nand driver has built in default placement schemes for the - various chiptypes. If due to hardware ECC functionality the - default placement does not fit then the board driver can - provide a own placement scheme. - </para> - <para> - File system drivers can provide a own placement scheme which - is used instead of the default placement scheme. - </para> - <para> - Placement schemes are defined by a nand_oobinfo structure - <programlisting> -struct nand_oobinfo { - int useecc; - int eccbytes; - int eccpos[24]; - int oobfree[8][2]; -}; - </programlisting> - <itemizedlist> - <listitem><para>useecc</para><para> - The useecc member controls the ecc and placement function. The header - file include/mtd/mtd-abi.h contains constants to select ecc and - placement. MTD_NANDECC_OFF switches off the ecc complete. This is - not recommended and available for testing and diagnosis only. - MTD_NANDECC_PLACE selects caller defined placement, MTD_NANDECC_AUTOPLACE - selects automatic placement. - </para></listitem> - <listitem><para>eccbytes</para><para> - The eccbytes member defines the number of ecc bytes per page. - </para></listitem> - <listitem><para>eccpos</para><para> - The eccpos array holds the byte offsets in the spare area where - the ecc codes are placed. - </para></listitem> - <listitem><para>oobfree</para><para> - The oobfree array defines the areas in the spare area which can be - used for automatic placement. The information is given in the format - {offset, size}. offset defines the start of the usable area, size the - length in bytes. More than one area can be defined. The list is terminated - by an {0, 0} entry. - </para></listitem> - </itemizedlist> - </para> - <sect2 id="Placement_defined_by_fs_driver"> - <title>Placement defined by fs driver</title> - <para> - The calling function provides a pointer to a nand_oobinfo - structure which defines the ecc placement. For writes the - caller must provide a spare area buffer along with the - data buffer. The spare area buffer size is (number of pages) * - (size of spare area). For reads the buffer size is - (number of pages) * ((size of spare area) + (number of ecc - steps per page) * sizeof (int)). The driver stores the - result of the ecc check for each tuple in the spare buffer. - The storage sequence is - </para> - <para> - <spare data page 0><ecc result 0>...<ecc result n> - </para> - <para> - ... - </para> - <para> - <spare data page n><ecc result 0>...<ecc result n> - </para> - <para> - This is a legacy mode used by YAFFS1. - </para> - <para> - If the spare area buffer is NULL then only the ECC placement is - done according to the given scheme in the nand_oobinfo structure. - </para> - </sect2> - <sect2 id="Automatic_placement"> - <title>Automatic placement</title> - <para> - Automatic placement uses the built in defaults to place the - ecc bytes in the spare area. If filesystem data have to be stored / - read into the spare area then the calling function must provide a - buffer. The buffer size per page is determined by the oobfree array in - the nand_oobinfo structure. - </para> - <para> - If the spare area buffer is NULL then only the ECC placement is - done according to the default builtin scheme. - </para> - </sect2> - </sect1> - <sect1 id="Spare_area_autoplacement_default"> - <title>Spare area autoplacement default schemes</title> - <sect2 id="pagesize_256"> - <title>256 byte pagesize</title> -<informaltable><tgroup cols="3"><tbody> -<row> -<entry>Offset</entry> -<entry>Content</entry> -<entry>Comment</entry> -</row> -<row> -<entry>0x00</entry> -<entry>ECC byte 0</entry> -<entry>Error correction code byte 0</entry> -</row> -<row> -<entry>0x01</entry> -<entry>ECC byte 1</entry> -<entry>Error correction code byte 1</entry> -</row> -<row> -<entry>0x02</entry> -<entry>ECC byte 2</entry> -<entry>Error correction code byte 2</entry> -</row> -<row> -<entry>0x03</entry> -<entry>Autoplace 0</entry> -<entry></entry> -</row> -<row> -<entry>0x04</entry> -<entry>Autoplace 1</entry> -<entry></entry> -</row> -<row> -<entry>0x05</entry> -<entry>Bad block marker</entry> -<entry>If any bit in this byte is zero, then this block is bad. -This applies only to the first page in a block. In the remaining -pages this byte is reserved</entry> -</row> -<row> -<entry>0x06</entry> -<entry>Autoplace 2</entry> -<entry></entry> -</row> -<row> -<entry>0x07</entry> -<entry>Autoplace 3</entry> -<entry></entry> -</row> -</tbody></tgroup></informaltable> - </sect2> - <sect2 id="pagesize_512"> - <title>512 byte pagesize</title> -<informaltable><tgroup cols="3"><tbody> -<row> -<entry>Offset</entry> -<entry>Content</entry> -<entry>Comment</entry> -</row> -<row> -<entry>0x00</entry> -<entry>ECC byte 0</entry> -<entry>Error correction code byte 0 of the lower 256 Byte data in -this page</entry> -</row> -<row> -<entry>0x01</entry> -<entry>ECC byte 1</entry> -<entry>Error correction code byte 1 of the lower 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x02</entry> -<entry>ECC byte 2</entry> -<entry>Error correction code byte 2 of the lower 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x03</entry> -<entry>ECC byte 3</entry> -<entry>Error correction code byte 0 of the upper 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x04</entry> -<entry>reserved</entry> -<entry>reserved</entry> -</row> -<row> -<entry>0x05</entry> -<entry>Bad block marker</entry> -<entry>If any bit in this byte is zero, then this block is bad. -This applies only to the first page in a block. In the remaining -pages this byte is reserved</entry> -</row> -<row> -<entry>0x06</entry> -<entry>ECC byte 4</entry> -<entry>Error correction code byte 1 of the upper 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x07</entry> -<entry>ECC byte 5</entry> -<entry>Error correction code byte 2 of the upper 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x08 - 0x0F</entry> -<entry>Autoplace 0 - 7</entry> -<entry></entry> -</row> -</tbody></tgroup></informaltable> - </sect2> - <sect2 id="pagesize_2048"> - <title>2048 byte pagesize</title> -<informaltable><tgroup cols="3"><tbody> -<row> -<entry>Offset</entry> -<entry>Content</entry> -<entry>Comment</entry> -</row> -<row> -<entry>0x00</entry> -<entry>Bad block marker</entry> -<entry>If any bit in this byte is zero, then this block is bad. -This applies only to the first page in a block. In the remaining -pages this byte is reserved</entry> -</row> -<row> -<entry>0x01</entry> -<entry>Reserved</entry> -<entry>Reserved</entry> -</row> -<row> -<entry>0x02-0x27</entry> -<entry>Autoplace 0 - 37</entry> -<entry></entry> -</row> -<row> -<entry>0x28</entry> -<entry>ECC byte 0</entry> -<entry>Error correction code byte 0 of the first 256 Byte data in -this page</entry> -</row> -<row> -<entry>0x29</entry> -<entry>ECC byte 1</entry> -<entry>Error correction code byte 1 of the first 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x2A</entry> -<entry>ECC byte 2</entry> -<entry>Error correction code byte 2 of the first 256 Bytes data in -this page</entry> -</row> -<row> -<entry>0x2B</entry> -<entry>ECC byte 3</entry> -<entry>Error correction code byte 0 of the second 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x2C</entry> -<entry>ECC byte 4</entry> -<entry>Error correction code byte 1 of the second 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x2D</entry> -<entry>ECC byte 5</entry> -<entry>Error correction code byte 2 of the second 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x2E</entry> -<entry>ECC byte 6</entry> -<entry>Error correction code byte 0 of the third 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x2F</entry> -<entry>ECC byte 7</entry> -<entry>Error correction code byte 1 of the third 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x30</entry> -<entry>ECC byte 8</entry> -<entry>Error correction code byte 2 of the third 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x31</entry> -<entry>ECC byte 9</entry> -<entry>Error correction code byte 0 of the fourth 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x32</entry> -<entry>ECC byte 10</entry> -<entry>Error correction code byte 1 of the fourth 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x33</entry> -<entry>ECC byte 11</entry> -<entry>Error correction code byte 2 of the fourth 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x34</entry> -<entry>ECC byte 12</entry> -<entry>Error correction code byte 0 of the fifth 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x35</entry> -<entry>ECC byte 13</entry> -<entry>Error correction code byte 1 of the fifth 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x36</entry> -<entry>ECC byte 14</entry> -<entry>Error correction code byte 2 of the fifth 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x37</entry> -<entry>ECC byte 15</entry> -<entry>Error correction code byte 0 of the sixt 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x38</entry> -<entry>ECC byte 16</entry> -<entry>Error correction code byte 1 of the sixt 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x39</entry> -<entry>ECC byte 17</entry> -<entry>Error correction code byte 2 of the sixt 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x3A</entry> -<entry>ECC byte 18</entry> -<entry>Error correction code byte 0 of the seventh 256 Bytes of -data in this page</entry> -</row> -<row> -<entry>0x3B</entry> -<entry>ECC byte 19</entry> -<entry>Error correction code byte 1 of the seventh 256 Bytes of -data in this page</entry> -</row> -<row> -<entry>0x3C</entry> -<entry>ECC byte 20</entry> -<entry>Error correction code byte 2 of the seventh 256 Bytes of -data in this page</entry> -</row> -<row> -<entry>0x3D</entry> -<entry>ECC byte 21</entry> -<entry>Error correction code byte 0 of the eighth 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x3E</entry> -<entry>ECC byte 22</entry> -<entry>Error correction code byte 1 of the eighth 256 Bytes of data -in this page</entry> -</row> -<row> -<entry>0x3F</entry> -<entry>ECC byte 23</entry> -<entry>Error correction code byte 2 of the eighth 256 Bytes of data -in this page</entry> -</row> -</tbody></tgroup></informaltable> - </sect2> - </sect1> - </chapter> - - <chapter id="filesystems"> - <title>Filesystem support</title> - <para> - The NAND driver provides all necessary functions for a - filesystem via the MTD interface. - </para> - <para> - Filesystems must be aware of the NAND peculiarities and - restrictions. One major restrictions of NAND Flash is, that you cannot - write as often as you want to a page. The consecutive writes to a page, - before erasing it again, are restricted to 1-3 writes, depending on the - manufacturers specifications. This applies similar to the spare area. - </para> - <para> - Therefore NAND aware filesystems must either write in page size chunks - or hold a writebuffer to collect smaller writes until they sum up to - pagesize. Available NAND aware filesystems: JFFS2, YAFFS. - </para> - <para> - The spare area usage to store filesystem data is controlled by - the spare area placement functionality which is described in one - of the earlier chapters. - </para> - </chapter> - <chapter id="tools"> - <title>Tools</title> - <para> - The MTD project provides a couple of helpful tools to handle NAND Flash. - <itemizedlist> - <listitem><para>flasherase, flasheraseall: Erase and format FLASH partitions</para></listitem> - <listitem><para>nandwrite: write filesystem images to NAND FLASH</para></listitem> - <listitem><para>nanddump: dump the contents of a NAND FLASH partitions</para></listitem> - </itemizedlist> - </para> - <para> - These tools are aware of the NAND restrictions. Please use those tools - instead of complaining about errors which are caused by non NAND aware - access methods. - </para> - </chapter> - - <chapter id="defines"> - <title>Constants</title> - <para> - This chapter describes the constants which might be relevant for a driver developer. - </para> - <sect1 id="Chip_option_constants"> - <title>Chip option constants</title> - <sect2 id="Constants_for_chip_id_table"> - <title>Constants for chip id table</title> - <para> - These constants are defined in nand.h. They are ored together to describe - the chip functionality. - <programlisting> -/* Buswitdh is 16 bit */ -#define NAND_BUSWIDTH_16 0x00000002 -/* Device supports partial programming without padding */ -#define NAND_NO_PADDING 0x00000004 -/* Chip has cache program function */ -#define NAND_CACHEPRG 0x00000008 -/* Chip has copy back function */ -#define NAND_COPYBACK 0x00000010 -/* AND Chip which has 4 banks and a confusing page / block - * assignment. See Renesas datasheet for further information */ -#define NAND_IS_AND 0x00000020 -/* Chip has a array of 4 pages which can be read without - * additional ready /busy waits */ -#define NAND_4PAGE_ARRAY 0x00000040 - </programlisting> - </para> - </sect2> - <sect2 id="Constants_for_runtime_options"> - <title>Constants for runtime options</title> - <para> - These constants are defined in nand.h. They are ored together to describe - the functionality. - <programlisting> -/* The hw ecc generator provides a syndrome instead a ecc value on read - * This can only work if we have the ecc bytes directly behind the - * data bytes. Applies for DOC and AG-AND Renesas HW Reed Solomon generators */ -#define NAND_HWECC_SYNDROME 0x00020000 - </programlisting> - </para> - </sect2> - </sect1> - - <sect1 id="EEC_selection_constants"> - <title>ECC selection constants</title> - <para> - Use these constants to select the ECC algorithm. - <programlisting> -/* No ECC. Usage is not recommended ! */ -#define NAND_ECC_NONE 0 -/* Software ECC 3 byte ECC per 256 Byte data */ -#define NAND_ECC_SOFT 1 -/* Hardware ECC 3 byte ECC per 256 Byte data */ -#define NAND_ECC_HW3_256 2 -/* Hardware ECC 3 byte ECC per 512 Byte data */ -#define NAND_ECC_HW3_512 3 -/* Hardware ECC 6 byte ECC per 512 Byte data */ -#define NAND_ECC_HW6_512 4 -/* Hardware ECC 6 byte ECC per 512 Byte data */ -#define NAND_ECC_HW8_512 6 - </programlisting> - </para> - </sect1> - - <sect1 id="Hardware_control_related_constants"> - <title>Hardware control related constants</title> - <para> - These constants describe the requested hardware access function when - the boardspecific hardware control function is called - <programlisting> -/* Select the chip by setting nCE to low */ -#define NAND_CTL_SETNCE 1 -/* Deselect the chip by setting nCE to high */ -#define NAND_CTL_CLRNCE 2 -/* Select the command latch by setting CLE to high */ -#define NAND_CTL_SETCLE 3 -/* Deselect the command latch by setting CLE to low */ -#define NAND_CTL_CLRCLE 4 -/* Select the address latch by setting ALE to high */ -#define NAND_CTL_SETALE 5 -/* Deselect the address latch by setting ALE to low */ -#define NAND_CTL_CLRALE 6 -/* Set write protection by setting WP to high. Not used! */ -#define NAND_CTL_SETWP 7 -/* Clear write protection by setting WP to low. Not used! */ -#define NAND_CTL_CLRWP 8 - </programlisting> - </para> - </sect1> - - <sect1 id="Bad_block_table_constants"> - <title>Bad block table related constants</title> - <para> - These constants describe the options used for bad block - table descriptors. - <programlisting> -/* Options for the bad block table descriptors */ - -/* The number of bits used per block in the bbt on the device */ -#define NAND_BBT_NRBITS_MSK 0x0000000F -#define NAND_BBT_1BIT 0x00000001 -#define NAND_BBT_2BIT 0x00000002 -#define NAND_BBT_4BIT 0x00000004 -#define NAND_BBT_8BIT 0x00000008 -/* The bad block table is in the last good block of the device */ -#define NAND_BBT_LASTBLOCK 0x00000010 -/* The bbt is at the given page, else we must scan for the bbt */ -#define NAND_BBT_ABSPAGE 0x00000020 -/* bbt is stored per chip on multichip devices */ -#define NAND_BBT_PERCHIP 0x00000080 -/* bbt has a version counter at offset veroffs */ -#define NAND_BBT_VERSION 0x00000100 -/* Create a bbt if none axists */ -#define NAND_BBT_CREATE 0x00000200 -/* Write bbt if necessary */ -#define NAND_BBT_WRITE 0x00001000 -/* Read and write back block contents when writing bbt */ -#define NAND_BBT_SAVECONTENT 0x00002000 - </programlisting> - </para> - </sect1> - - </chapter> - - <chapter id="structs"> - <title>Structures</title> - <para> - This chapter contains the autogenerated documentation of the structures which are - used in the NAND driver and might be relevant for a driver developer. Each - struct member has a short description which is marked with an [XXX] identifier. - See the chapter "Documentation hints" for an explanation. - </para> -!Iinclude/linux/mtd/nand.h - </chapter> - - <chapter id="pubfunctions"> - <title>Public Functions Provided</title> - <para> - This chapter contains the autogenerated documentation of the NAND kernel API functions - which are exported. Each function has a short description which is marked with an [XXX] identifier. - See the chapter "Documentation hints" for an explanation. - </para> -!Edrivers/mtd/nand/nand_base.c -!Edrivers/mtd/nand/nand_bbt.c -!Edrivers/mtd/nand/nand_ecc.c - </chapter> - - <chapter id="intfunctions"> - <title>Internal Functions Provided</title> - <para> - This chapter contains the autogenerated documentation of the NAND driver internal functions. - Each function has a short description which is marked with an [XXX] identifier. - See the chapter "Documentation hints" for an explanation. - The functions marked with [DEFAULT] might be relevant for a board driver developer. - </para> -!Idrivers/mtd/nand/nand_base.c -!Idrivers/mtd/nand/nand_bbt.c -<!-- No internal functions for kernel-doc: -X!Idrivers/mtd/nand/nand_ecc.c ---> - </chapter> - - <chapter id="credits"> - <title>Credits</title> - <para> - The following people have contributed to the NAND driver: - <orderedlist> - <listitem><para>Steven J. Hill<email>sjhill@realitydiluted.com</email></para></listitem> - <listitem><para>David Woodhouse<email>dwmw2@infradead.org</email></para></listitem> - <listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem> - </orderedlist> - A lot of users have provided bugfixes, improvements and helping hands for testing. - Thanks a lot. - </para> - <para> - The following people have contributed to this document: - <orderedlist> - <listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem> - </orderedlist> - </para> - </chapter> -</book> diff --git a/Documentation/DocBook/networking.tmpl b/Documentation/DocBook/networking.tmpl deleted file mode 100644 index 29df25016c7c..000000000000 --- a/Documentation/DocBook/networking.tmpl +++ /dev/null @@ -1,111 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="LinuxNetworking"> - <bookinfo> - <title>Linux Networking and Network Devices APIs</title> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="netcore"> - <title>Linux Networking</title> - <sect1><title>Networking Base Types</title> -!Iinclude/linux/net.h - </sect1> - <sect1><title>Socket Buffer Functions</title> -!Iinclude/linux/skbuff.h -!Iinclude/net/sock.h -!Enet/socket.c -!Enet/core/skbuff.c -!Enet/core/sock.c -!Enet/core/datagram.c -!Enet/core/stream.c - </sect1> - <sect1><title>Socket Filter</title> -!Enet/core/filter.c - </sect1> - <sect1><title>Generic Network Statistics</title> -!Iinclude/uapi/linux/gen_stats.h -!Enet/core/gen_stats.c -!Enet/core/gen_estimator.c - </sect1> - <sect1><title>SUN RPC subsystem</title> -<!-- The !D functionality is not perfect, garbage has to be protected by comments -!Dnet/sunrpc/sunrpc_syms.c ---> -!Enet/sunrpc/xdr.c -!Enet/sunrpc/svc_xprt.c -!Enet/sunrpc/xprt.c -!Enet/sunrpc/sched.c -!Enet/sunrpc/socklib.c -!Enet/sunrpc/stats.c -!Enet/sunrpc/rpc_pipe.c -!Enet/sunrpc/rpcb_clnt.c -!Enet/sunrpc/clnt.c - </sect1> - <sect1><title>WiMAX</title> -!Enet/wimax/op-msg.c -!Enet/wimax/op-reset.c -!Enet/wimax/op-rfkill.c -!Enet/wimax/stack.c -!Iinclude/net/wimax.h -!Iinclude/uapi/linux/wimax.h - </sect1> - </chapter> - - <chapter id="netdev"> - <title>Network device support</title> - <sect1><title>Driver Support</title> -!Enet/core/dev.c -!Enet/ethernet/eth.c -!Enet/sched/sch_generic.c -!Iinclude/linux/etherdevice.h -!Iinclude/linux/netdevice.h - </sect1> - <sect1><title>PHY Support</title> -!Edrivers/net/phy/phy.c -!Idrivers/net/phy/phy.c -!Edrivers/net/phy/phy_device.c -!Idrivers/net/phy/phy_device.c -!Edrivers/net/phy/mdio_bus.c -!Idrivers/net/phy/mdio_bus.c - </sect1> -<!-- FIXME: Removed for now since no structured comments in source - <sect1><title>Wireless</title> -X!Enet/core/wireless.c - </sect1> ---> - </chapter> - -</book> diff --git a/Documentation/DocBook/rapidio.tmpl b/Documentation/DocBook/rapidio.tmpl deleted file mode 100644 index ac3cca3399a1..000000000000 --- a/Documentation/DocBook/rapidio.tmpl +++ /dev/null @@ -1,155 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" [ - <!ENTITY rapidio SYSTEM "rapidio.xml"> - ]> - -<book id="RapidIO-Guide"> - <bookinfo> - <title>RapidIO Subsystem Guide</title> - - <authorgroup> - <author> - <firstname>Matt</firstname> - <surname>Porter</surname> - <affiliation> - <address> - <email>mporter@kernel.crashing.org</email> - <email>mporter@mvista.com</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2005</year> - <holder>MontaVista Software, Inc.</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2 as published by the Free Software Foundation. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="intro"> - <title>Introduction</title> - <para> - RapidIO is a high speed switched fabric interconnect with - features aimed at the embedded market. RapidIO provides - support for memory-mapped I/O as well as message-based - transactions over the switched fabric network. RapidIO has - a standardized discovery mechanism not unlike the PCI bus - standard that allows simple detection of devices in a - network. - </para> - <para> - This documentation is provided for developers intending - to support RapidIO on new architectures, write new drivers, - or to understand the subsystem internals. - </para> - </chapter> - - <chapter id="bugs"> - <title>Known Bugs and Limitations</title> - - <sect1 id="known_bugs"> - <title>Bugs</title> - <para>None. ;)</para> - </sect1> - <sect1 id="Limitations"> - <title>Limitations</title> - <para> - <orderedlist> - <listitem><para>Access/management of RapidIO memory regions is not supported</para></listitem> - <listitem><para>Multiple host enumeration is not supported</para></listitem> - </orderedlist> - </para> - </sect1> - </chapter> - - <chapter id="drivers"> - <title>RapidIO driver interface</title> - <para> - Drivers are provided a set of calls in order - to interface with the subsystem to gather info - on devices, request/map memory region resources, - and manage mailboxes/doorbells. - </para> - <sect1 id="Functions"> - <title>Functions</title> -!Iinclude/linux/rio_drv.h -!Edrivers/rapidio/rio-driver.c -!Edrivers/rapidio/rio.c - </sect1> - </chapter> - - <chapter id="internals"> - <title>Internals</title> - - <para> - This chapter contains the autogenerated documentation of the RapidIO - subsystem. - </para> - - <sect1 id="Structures"><title>Structures</title> -!Iinclude/linux/rio.h - </sect1> - <sect1 id="Enumeration_and_Discovery"><title>Enumeration and Discovery</title> -!Idrivers/rapidio/rio-scan.c - </sect1> - <sect1 id="Driver_functionality"><title>Driver functionality</title> -!Idrivers/rapidio/rio.c -!Idrivers/rapidio/rio-access.c - </sect1> - <sect1 id="Device_model_support"><title>Device model support</title> -!Idrivers/rapidio/rio-driver.c - </sect1> - <sect1 id="PPC32_support"><title>PPC32 support</title> -!Iarch/powerpc/sysdev/fsl_rio.c - </sect1> - </chapter> - - <chapter id="credits"> - <title>Credits</title> - <para> - The following people have contributed to the RapidIO - subsystem directly or indirectly: - <orderedlist> - <listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem> - <listitem><para>Randy Vinson<email>rvinson@mvista.com</email></para></listitem> - <listitem><para>Dan Malek<email>dan@embeddedalley.com</email></para></listitem> - </orderedlist> - </para> - <para> - The following people have contributed to this document: - <orderedlist> - <listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem> - </orderedlist> - </para> - </chapter> -</book> diff --git a/Documentation/DocBook/s390-drivers.tmpl b/Documentation/DocBook/s390-drivers.tmpl deleted file mode 100644 index 95bfc12e5439..000000000000 --- a/Documentation/DocBook/s390-drivers.tmpl +++ /dev/null @@ -1,161 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="s390drivers"> - <bookinfo> - <title>Writing s390 channel device drivers</title> - - <authorgroup> - <author> - <firstname>Cornelia</firstname> - <surname>Huck</surname> - <affiliation> - <address> - <email>cornelia.huck@de.ibm.com</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2007</year> - <holder>IBM Corp.</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="intro"> - <title>Introduction</title> - <para> - This document describes the interfaces available for device drivers that - drive s390 based channel attached I/O devices. This includes interfaces for - interaction with the hardware and interfaces for interacting with the - common driver core. Those interfaces are provided by the s390 common I/O - layer. - </para> - <para> - The document assumes a familarity with the technical terms associated - with the s390 channel I/O architecture. For a description of this - architecture, please refer to the "z/Architecture: Principles of - Operation", IBM publication no. SA22-7832. - </para> - <para> - While most I/O devices on a s390 system are typically driven through the - channel I/O mechanism described here, there are various other methods - (like the diag interface). These are out of the scope of this document. - </para> - <para> - Some additional information can also be found in the kernel source - under Documentation/s390/driver-model.txt. - </para> - </chapter> - <chapter id="ccw"> - <title>The ccw bus</title> - <para> - The ccw bus typically contains the majority of devices available to - a s390 system. Named after the channel command word (ccw), the basic - command structure used to address its devices, the ccw bus contains - so-called channel attached devices. They are addressed via I/O - subchannels, visible on the css bus. A device driver for - channel-attached devices, however, will never interact with the - subchannel directly, but only via the I/O device on the ccw bus, - the ccw device. - </para> - <sect1 id="channelIO"> - <title>I/O functions for channel-attached devices</title> - <para> - Some hardware structures have been translated into C structures for use - by the common I/O layer and device drivers. For more information on - the hardware structures represented here, please consult the Principles - of Operation. - </para> -!Iarch/s390/include/asm/cio.h - </sect1> - <sect1 id="ccwdev"> - <title>ccw devices</title> - <para> - Devices that want to initiate channel I/O need to attach to the ccw bus. - Interaction with the driver core is done via the common I/O layer, which - provides the abstractions of ccw devices and ccw device drivers. - </para> - <para> - The functions that initiate or terminate channel I/O all act upon a - ccw device structure. Device drivers must not bypass those functions - or strange side effects may happen. - </para> -!Iarch/s390/include/asm/ccwdev.h -!Edrivers/s390/cio/device.c -!Edrivers/s390/cio/device_ops.c - </sect1> - <sect1 id="cmf"> - <title>The channel-measurement facility</title> - <para> - The channel-measurement facility provides a means to collect - measurement data which is made available by the channel subsystem - for each channel attached device. - </para> -!Iarch/s390/include/asm/cmb.h -!Edrivers/s390/cio/cmf.c - </sect1> - </chapter> - - <chapter id="ccwgroup"> - <title>The ccwgroup bus</title> - <para> - The ccwgroup bus only contains artificial devices, created by the user. - Many networking devices (e.g. qeth) are in fact composed of several - ccw devices (like read, write and data channel for qeth). The - ccwgroup bus provides a mechanism to create a meta-device which - contains those ccw devices as slave devices and can be associated - with the netdevice. - </para> - <sect1 id="ccwgroupdevices"> - <title>ccw group devices</title> -!Iarch/s390/include/asm/ccwgroup.h -!Edrivers/s390/cio/ccwgroup.c - </sect1> - </chapter> - - <chapter id="genericinterfaces"> - <title>Generic interfaces</title> - <para> - Some interfaces are available to other drivers that do not necessarily - have anything to do with the busses described above, but still are - indirectly using basic infrastructure in the common I/O layer. - One example is the support for adapter interrupts. - </para> -!Edrivers/s390/cio/airq.c - </chapter> - -</book> diff --git a/Documentation/DocBook/scsi.tmpl b/Documentation/DocBook/scsi.tmpl deleted file mode 100644 index 4b9b9b286cea..000000000000 --- a/Documentation/DocBook/scsi.tmpl +++ /dev/null @@ -1,409 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="scsimid"> - <bookinfo> - <title>SCSI Interfaces Guide</title> - - <authorgroup> - <author> - <firstname>James</firstname> - <surname>Bottomley</surname> - <affiliation> - <address> - <email>James.Bottomley@hansenpartnership.com</email> - </address> - </affiliation> - </author> - - <author> - <firstname>Rob</firstname> - <surname>Landley</surname> - <affiliation> - <address> - <email>rob@landley.net</email> - </address> - </affiliation> - </author> - - </authorgroup> - - <copyright> - <year>2007</year> - <holder>Linux Foundation</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - - <toc></toc> - - <chapter id="intro"> - <title>Introduction</title> - <sect1 id="protocol_vs_bus"> - <title>Protocol vs bus</title> - <para> - Once upon a time, the Small Computer Systems Interface defined both - a parallel I/O bus and a data protocol to connect a wide variety of - peripherals (disk drives, tape drives, modems, printers, scanners, - optical drives, test equipment, and medical devices) to a host - computer. - </para> - <para> - Although the old parallel (fast/wide/ultra) SCSI bus has largely - fallen out of use, the SCSI command set is more widely used than ever - to communicate with devices over a number of different busses. - </para> - <para> - The <ulink url='http://www.t10.org/scsi-3.htm'>SCSI protocol</ulink> - is a big-endian peer-to-peer packet based protocol. SCSI commands - are 6, 10, 12, or 16 bytes long, often followed by an associated data - payload. - </para> - <para> - SCSI commands can be transported over just about any kind of bus, and - are the default protocol for storage devices attached to USB, SATA, - SAS, Fibre Channel, FireWire, and ATAPI devices. SCSI packets are - also commonly exchanged over Infiniband, - <ulink url='http://i2o.shadowconnect.com/faq.php'>I20</ulink>, TCP/IP - (<ulink url='https://en.wikipedia.org/wiki/ISCSI'>iSCSI</ulink>), even - <ulink url='http://cyberelk.net/tim/parport/parscsi.html'>Parallel - ports</ulink>. - </para> - </sect1> - <sect1 id="subsystem_design"> - <title>Design of the Linux SCSI subsystem</title> - <para> - The SCSI subsystem uses a three layer design, with upper, mid, and low - layers. Every operation involving the SCSI subsystem (such as reading - a sector from a disk) uses one driver at each of the 3 levels: one - upper layer driver, one lower layer driver, and the SCSI midlayer. - </para> - <para> - The SCSI upper layer provides the interface between userspace and the - kernel, in the form of block and char device nodes for I/O and - ioctl(). The SCSI lower layer contains drivers for specific hardware - devices. - </para> - <para> - In between is the SCSI mid-layer, analogous to a network routing - layer such as the IPv4 stack. The SCSI mid-layer routes a packet - based data protocol between the upper layer's /dev nodes and the - corresponding devices in the lower layer. It manages command queues, - provides error handling and power management functions, and responds - to ioctl() requests. - </para> - </sect1> - </chapter> - - <chapter id="upper_layer"> - <title>SCSI upper layer</title> - <para> - The upper layer supports the user-kernel interface by providing - device nodes. - </para> - <sect1 id="sd"> - <title>sd (SCSI Disk)</title> - <para>sd (sd_mod.o)</para> -<!-- !Idrivers/scsi/sd.c --> - </sect1> - <sect1 id="sr"> - <title>sr (SCSI CD-ROM)</title> - <para>sr (sr_mod.o)</para> - </sect1> - <sect1 id="st"> - <title>st (SCSI Tape)</title> - <para>st (st.o)</para> - </sect1> - <sect1 id="sg"> - <title>sg (SCSI Generic)</title> - <para>sg (sg.o)</para> - </sect1> - <sect1 id="ch"> - <title>ch (SCSI Media Changer)</title> - <para>ch (ch.c)</para> - </sect1> - </chapter> - - <chapter id="mid_layer"> - <title>SCSI mid layer</title> - - <sect1 id="midlayer_implementation"> - <title>SCSI midlayer implementation</title> - <sect2 id="scsi_device.h"> - <title>include/scsi/scsi_device.h</title> - <para> - </para> -!Iinclude/scsi/scsi_device.h - </sect2> - - <sect2 id="scsi.c"> - <title>drivers/scsi/scsi.c</title> - <para>Main file for the SCSI midlayer.</para> -!Edrivers/scsi/scsi.c - </sect2> - <sect2 id="scsicam.c"> - <title>drivers/scsi/scsicam.c</title> - <para> - <ulink url='http://www.t10.org/ftp/t10/drafts/cam/cam-r12b.pdf'>SCSI - Common Access Method</ulink> support functions, for use with - HDIO_GETGEO, etc. - </para> -!Edrivers/scsi/scsicam.c - </sect2> - <sect2 id="scsi_error.c"> - <title>drivers/scsi/scsi_error.c</title> - <para>Common SCSI error/timeout handling routines.</para> -!Edrivers/scsi/scsi_error.c - </sect2> - <sect2 id="scsi_devinfo.c"> - <title>drivers/scsi/scsi_devinfo.c</title> - <para> - Manage scsi_dev_info_list, which tracks blacklisted and whitelisted - devices. - </para> -!Idrivers/scsi/scsi_devinfo.c - </sect2> - <sect2 id="scsi_ioctl.c"> - <title>drivers/scsi/scsi_ioctl.c</title> - <para> - Handle ioctl() calls for SCSI devices. - </para> -!Edrivers/scsi/scsi_ioctl.c - </sect2> - <sect2 id="scsi_lib.c"> - <title>drivers/scsi/scsi_lib.c</title> - <para> - SCSI queuing library. - </para> -!Edrivers/scsi/scsi_lib.c - </sect2> - <sect2 id="scsi_lib_dma.c"> - <title>drivers/scsi/scsi_lib_dma.c</title> - <para> - SCSI library functions depending on DMA - (map and unmap scatter-gather lists). - </para> -!Edrivers/scsi/scsi_lib_dma.c - </sect2> - <sect2 id="scsi_module.c"> - <title>drivers/scsi/scsi_module.c</title> - <para> - The file drivers/scsi/scsi_module.c contains legacy support for - old-style host templates. It should never be used by any new driver. - </para> - </sect2> - <sect2 id="scsi_proc.c"> - <title>drivers/scsi/scsi_proc.c</title> - <para> - The functions in this file provide an interface between - the PROC file system and the SCSI device drivers - It is mainly used for debugging, statistics and to pass - information directly to the lowlevel driver. - - I.E. plumbing to manage /proc/scsi/* - </para> -!Idrivers/scsi/scsi_proc.c - </sect2> - <sect2 id="scsi_netlink.c"> - <title>drivers/scsi/scsi_netlink.c</title> - <para> - Infrastructure to provide async events from transports to userspace - via netlink, using a single NETLINK_SCSITRANSPORT protocol for all - transports. - - See <ulink url='http://marc.info/?l=linux-scsi&m=115507374832500&w=2'>the - original patch submission</ulink> for more details. - </para> -!Idrivers/scsi/scsi_netlink.c - </sect2> - <sect2 id="scsi_scan.c"> - <title>drivers/scsi/scsi_scan.c</title> - <para> - Scan a host to determine which (if any) devices are attached. - - The general scanning/probing algorithm is as follows, exceptions are - made to it depending on device specific flags, compilation options, - and global variable (boot or module load time) settings. - - A specific LUN is scanned via an INQUIRY command; if the LUN has a - device attached, a scsi_device is allocated and setup for it. - - For every id of every channel on the given host, start by scanning - LUN 0. Skip hosts that don't respond at all to a scan of LUN 0. - Otherwise, if LUN 0 has a device attached, allocate and setup a - scsi_device for it. If target is SCSI-3 or up, issue a REPORT LUN, - and scan all of the LUNs returned by the REPORT LUN; else, - sequentially scan LUNs up until some maximum is reached, or a LUN is - seen that cannot have a device attached to it. - </para> -!Idrivers/scsi/scsi_scan.c - </sect2> - <sect2 id="scsi_sysctl.c"> - <title>drivers/scsi/scsi_sysctl.c</title> - <para> - Set up the sysctl entry: "/dev/scsi/logging_level" - (DEV_SCSI_LOGGING_LEVEL) which sets/returns scsi_logging_level. - </para> - </sect2> - <sect2 id="scsi_sysfs.c"> - <title>drivers/scsi/scsi_sysfs.c</title> - <para> - SCSI sysfs interface routines. - </para> -!Edrivers/scsi/scsi_sysfs.c - </sect2> - <sect2 id="hosts.c"> - <title>drivers/scsi/hosts.c</title> - <para> - mid to lowlevel SCSI driver interface - </para> -!Edrivers/scsi/hosts.c - </sect2> - <sect2 id="constants.c"> - <title>drivers/scsi/constants.c</title> - <para> - mid to lowlevel SCSI driver interface - </para> -!Edrivers/scsi/constants.c - </sect2> - </sect1> - - <sect1 id="Transport_classes"> - <title>Transport classes</title> - <para> - Transport classes are service libraries for drivers in the SCSI - lower layer, which expose transport attributes in sysfs. - </para> - <sect2 id="Fibre_Channel_transport"> - <title>Fibre Channel transport</title> - <para> - The file drivers/scsi/scsi_transport_fc.c defines transport attributes - for Fibre Channel. - </para> -!Edrivers/scsi/scsi_transport_fc.c - </sect2> - <sect2 id="iSCSI_transport"> - <title>iSCSI transport class</title> - <para> - The file drivers/scsi/scsi_transport_iscsi.c defines transport - attributes for the iSCSI class, which sends SCSI packets over TCP/IP - connections. - </para> -!Edrivers/scsi/scsi_transport_iscsi.c - </sect2> - <sect2 id="SAS_transport"> - <title>Serial Attached SCSI (SAS) transport class</title> - <para> - The file drivers/scsi/scsi_transport_sas.c defines transport - attributes for Serial Attached SCSI, a variant of SATA aimed at - large high-end systems. - </para> - <para> - The SAS transport class contains common code to deal with SAS HBAs, - an aproximated representation of SAS topologies in the driver model, - and various sysfs attributes to expose these topologies and management - interfaces to userspace. - </para> - <para> - In addition to the basic SCSI core objects this transport class - introduces two additional intermediate objects: The SAS PHY - as represented by struct sas_phy defines an "outgoing" PHY on - a SAS HBA or Expander, and the SAS remote PHY represented by - struct sas_rphy defines an "incoming" PHY on a SAS Expander or - end device. Note that this is purely a software concept, the - underlying hardware for a PHY and a remote PHY is the exactly - the same. - </para> - <para> - There is no concept of a SAS port in this code, users can see - what PHYs form a wide port based on the port_identifier attribute, - which is the same for all PHYs in a port. - </para> -!Edrivers/scsi/scsi_transport_sas.c - </sect2> - <sect2 id="SATA_transport"> - <title>SATA transport class</title> - <para> - The SATA transport is handled by libata, which has its own book of - documentation in this directory. - </para> - </sect2> - <sect2 id="SPI_transport"> - <title>Parallel SCSI (SPI) transport class</title> - <para> - The file drivers/scsi/scsi_transport_spi.c defines transport - attributes for traditional (fast/wide/ultra) SCSI busses. - </para> -!Edrivers/scsi/scsi_transport_spi.c - </sect2> - <sect2 id="SRP_transport"> - <title>SCSI RDMA (SRP) transport class</title> - <para> - The file drivers/scsi/scsi_transport_srp.c defines transport - attributes for SCSI over Remote Direct Memory Access. - </para> -!Edrivers/scsi/scsi_transport_srp.c - </sect2> - </sect1> - - </chapter> - - <chapter id="lower_layer"> - <title>SCSI lower layer</title> - <sect1 id="hba_drivers"> - <title>Host Bus Adapter transport types</title> - <para> - Many modern device controllers use the SCSI command set as a protocol to - communicate with their devices through many different types of physical - connections. - </para> - <para> - In SCSI language a bus capable of carrying SCSI commands is - called a "transport", and a controller connecting to such a bus is - called a "host bus adapter" (HBA). - </para> - <sect2 id="scsi_debug.c"> - <title>Debug transport</title> - <para> - The file drivers/scsi/scsi_debug.c simulates a host adapter with a - variable number of disks (or disk like devices) attached, sharing a - common amount of RAM. Does a lot of checking to make sure that we are - not getting blocks mixed up, and panics the kernel if anything out of - the ordinary is seen. - </para> - <para> - To be more realistic, the simulated devices have the transport - attributes of SAS disks. - </para> - <para> - For documentation see - <ulink url='http://sg.danny.cz/sg/sdebug26.html'>http://sg.danny.cz/sg/sdebug26.html</ulink> - </para> -<!-- !Edrivers/scsi/scsi_debug.c --> - </sect2> - <sect2 id="todo"> - <title>todo</title> - <para>Parallel (fast/wide/ultra) SCSI, USB, SATA, - SAS, Fibre Channel, FireWire, ATAPI devices, Infiniband, - I20, iSCSI, Parallel ports, netlink... - </para> - </sect2> - </sect1> - </chapter> -</book> diff --git a/Documentation/DocBook/sh.tmpl b/Documentation/DocBook/sh.tmpl deleted file mode 100644 index 4a38f604fa66..000000000000 --- a/Documentation/DocBook/sh.tmpl +++ /dev/null @@ -1,105 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="sh-drivers"> - <bookinfo> - <title>SuperH Interfaces Guide</title> - - <authorgroup> - <author> - <firstname>Paul</firstname> - <surname>Mundt</surname> - <affiliation> - <address> - <email>lethal@linux-sh.org</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2008-2010</year> - <holder>Paul Mundt</holder> - </copyright> - <copyright> - <year>2008-2010</year> - <holder>Renesas Technology Corp.</holder> - </copyright> - <copyright> - <year>2010</year> - <holder>Renesas Electronics Corp.</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2 as published by the Free Software Foundation. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="mm"> - <title>Memory Management</title> - <sect1 id="sh4"> - <title>SH-4</title> - <sect2 id="sq"> - <title>Store Queue API</title> -!Earch/sh/kernel/cpu/sh4/sq.c - </sect2> - </sect1> - <sect1 id="sh5"> - <title>SH-5</title> - <sect2 id="tlb"> - <title>TLB Interfaces</title> -!Iarch/sh/mm/tlb-sh5.c -!Iarch/sh/include/asm/tlb_64.h - </sect2> - </sect1> - </chapter> - <chapter id="mach"> - <title>Machine Specific Interfaces</title> - <sect1 id="dreamcast"> - <title>mach-dreamcast</title> -!Iarch/sh/boards/mach-dreamcast/rtc.c - </sect1> - <sect1 id="x3proto"> - <title>mach-x3proto</title> -!Earch/sh/boards/mach-x3proto/ilsel.c - </sect1> - </chapter> - <chapter id="busses"> - <title>Busses</title> - <sect1 id="superhyway"> - <title>SuperHyway</title> -!Edrivers/sh/superhyway/superhyway.c - </sect1> - - <sect1 id="maple"> - <title>Maple</title> -!Edrivers/sh/maple/maple.c - </sect1> - </chapter> -</book> diff --git a/Documentation/DocBook/stylesheet.xsl b/Documentation/DocBook/stylesheet.xsl deleted file mode 100644 index 3bf4ecf3d760..000000000000 --- a/Documentation/DocBook/stylesheet.xsl +++ /dev/null @@ -1,11 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<stylesheet xmlns="http://www.w3.org/1999/XSL/Transform" version="1.0"> -<param name="chunk.quietly">1</param> -<param name="funcsynopsis.style">ansi</param> -<param name="funcsynopsis.tabular.threshold">80</param> -<param name="callout.graphics">0</param> -<!-- <param name="paper.type">A4</param> --> -<param name="generate.consistent.ids">1</param> -<param name="generate.section.toc.level">2</param> -<param name="use.id.as.filename">1</param> -</stylesheet> diff --git a/Documentation/DocBook/w1.tmpl b/Documentation/DocBook/w1.tmpl deleted file mode 100644 index b0228d4c81bb..000000000000 --- a/Documentation/DocBook/w1.tmpl +++ /dev/null @@ -1,101 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="w1id"> - <bookinfo> - <title>W1: Dallas' 1-wire bus</title> - - <authorgroup> - <author> - <firstname>David</firstname> - <surname>Fries</surname> - <affiliation> - <address> - <email>David@Fries.net</email> - </address> - </affiliation> - </author> - - </authorgroup> - - <copyright> - <year>2013</year> - <!-- - <holder></holder> - --> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - - <toc></toc> - - <chapter id="w1_internal"> - <title>W1 API internal to the kernel</title> - - <sect1 id="w1_internal_api"> - <title>W1 API internal to the kernel</title> - <sect2 id="w1.h"> - <title>drivers/w1/w1.h</title> - <para>W1 core functions.</para> -!Idrivers/w1/w1.h - </sect2> - - <sect2 id="w1.c"> - <title>drivers/w1/w1.c</title> - <para>W1 core functions.</para> -!Idrivers/w1/w1.c - </sect2> - - <sect2 id="w1_family.h"> - <title>drivers/w1/w1_family.h</title> - <para>Allows registering device family operations.</para> -!Idrivers/w1/w1_family.h - </sect2> - - <sect2 id="w1_family.c"> - <title>drivers/w1/w1_family.c</title> - <para>Allows registering device family operations.</para> -!Edrivers/w1/w1_family.c - </sect2> - - <sect2 id="w1_int.c"> - <title>drivers/w1/w1_int.c</title> - <para>W1 internal initialization for master devices.</para> -!Edrivers/w1/w1_int.c - </sect2> - - <sect2 id="w1_netlink.h"> - <title>drivers/w1/w1_netlink.h</title> - <para>W1 external netlink API structures and commands.</para> -!Idrivers/w1/w1_netlink.h - </sect2> - - <sect2 id="w1_io.c"> - <title>drivers/w1/w1_io.c</title> - <para>W1 input/output.</para> -!Edrivers/w1/w1_io.c -!Idrivers/w1/w1_io.c - </sect2> - - </sect1> - - - </chapter> - -</book> diff --git a/Documentation/DocBook/z8530book.tmpl b/Documentation/DocBook/z8530book.tmpl deleted file mode 100644 index 6f3883be877e..000000000000 --- a/Documentation/DocBook/z8530book.tmpl +++ /dev/null @@ -1,371 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="Z85230Guide"> - <bookinfo> - <title>Z8530 Programming Guide</title> - - <authorgroup> - <author> - <firstname>Alan</firstname> - <surname>Cox</surname> - <affiliation> - <address> - <email>alan@lxorguk.ukuu.org.uk</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2000</year> - <holder>Alan Cox</holder> - </copyright> - - <legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="intro"> - <title>Introduction</title> - <para> - The Z85x30 family synchronous/asynchronous controller chips are - used on a large number of cheap network interface cards. The - kernel provides a core interface layer that is designed to make - it easy to provide WAN services using this chip. - </para> - <para> - The current driver only support synchronous operation. Merging the - asynchronous driver support into this code to allow any Z85x30 - device to be used as both a tty interface and as a synchronous - controller is a project for Linux post the 2.4 release - </para> - </chapter> - - <chapter id="Driver_Modes"> - <title>Driver Modes</title> - <para> - The Z85230 driver layer can drive Z8530, Z85C30 and Z85230 devices - in three different modes. Each mode can be applied to an individual - channel on the chip (each chip has two channels). - </para> - <para> - The PIO synchronous mode supports the most common Z8530 wiring. Here - the chip is interface to the I/O and interrupt facilities of the - host machine but not to the DMA subsystem. When running PIO the - Z8530 has extremely tight timing requirements. Doing high speeds, - even with a Z85230 will be tricky. Typically you should expect to - achieve at best 9600 baud with a Z8C530 and 64Kbits with a Z85230. - </para> - <para> - The DMA mode supports the chip when it is configured to use dual DMA - channels on an ISA bus. The better cards tend to support this mode - of operation for a single channel. With DMA running the Z85230 tops - out when it starts to hit ISA DMA constraints at about 512Kbits. It - is worth noting here that many PC machines hang or crash when the - chip is driven fast enough to hold the ISA bus solid. - </para> - <para> - Transmit DMA mode uses a single DMA channel. The DMA channel is used - for transmission as the transmit FIFO is smaller than the receive - FIFO. it gives better performance than pure PIO mode but is nowhere - near as ideal as pure DMA mode. - </para> - </chapter> - - <chapter id="Using_the_Z85230_driver"> - <title>Using the Z85230 driver</title> - <para> - The Z85230 driver provides the back end interface to your board. To - configure a Z8530 interface you need to detect the board and to - identify its ports and interrupt resources. It is also your problem - to verify the resources are available. - </para> - <para> - Having identified the chip you need to fill in a struct z8530_dev, - which describes each chip. This object must exist until you finally - shutdown the board. Firstly zero the active field. This ensures - nothing goes off without you intending it. The irq field should - be set to the interrupt number of the chip. (Each chip has a single - interrupt source rather than each channel). You are responsible - for allocating the interrupt line. The interrupt handler should be - set to <function>z8530_interrupt</function>. The device id should - be set to the z8530_dev structure pointer. Whether the interrupt can - be shared or not is board dependent, and up to you to initialise. - </para> - <para> - The structure holds two channel structures. - Initialise chanA.ctrlio and chanA.dataio with the address of the - control and data ports. You can or this with Z8530_PORT_SLEEP to - indicate your interface needs the 5uS delay for chip settling done - in software. The PORT_SLEEP option is architecture specific. Other - flags may become available on future platforms, eg for MMIO. - Initialise the chanA.irqs to &z8530_nop to start the chip up - as disabled and discarding interrupt events. This ensures that - stray interrupts will be mopped up and not hang the bus. Set - chanA.dev to point to the device structure itself. The - private and name field you may use as you wish. The private field - is unused by the Z85230 layer. The name is used for error reporting - and it may thus make sense to make it match the network name. - </para> - <para> - Repeat the same operation with the B channel if your chip has - both channels wired to something useful. This isn't always the - case. If it is not wired then the I/O values do not matter, but - you must initialise chanB.dev. - </para> - <para> - If your board has DMA facilities then initialise the txdma and - rxdma fields for the relevant channels. You must also allocate the - ISA DMA channels and do any necessary board level initialisation - to configure them. The low level driver will do the Z8530 and - DMA controller programming but not board specific magic. - </para> - <para> - Having initialised the device you can then call - <function>z8530_init</function>. This will probe the chip and - reset it into a known state. An identification sequence is then - run to identify the chip type. If the checks fail to pass the - function returns a non zero error code. Typically this indicates - that the port given is not valid. After this call the - type field of the z8530_dev structure is initialised to either - Z8530, Z85C30 or Z85230 according to the chip found. - </para> - <para> - Once you have called z8530_init you can also make use of the utility - function <function>z8530_describe</function>. This provides a - consistent reporting format for the Z8530 devices, and allows all - the drivers to provide consistent reporting. - </para> - </chapter> - - <chapter id="Attaching_Network_Interfaces"> - <title>Attaching Network Interfaces</title> - <para> - If you wish to use the network interface facilities of the driver, - then you need to attach a network device to each channel that is - present and in use. In addition to use the generic HDLC - you need to follow some additional plumbing rules. They may seem - complex but a look at the example hostess_sv11 driver should - reassure you. - </para> - <para> - The network device used for each channel should be pointed to by - the netdevice field of each channel. The hdlc-> priv field of the - network device points to your private data - you will need to be - able to find your private data from this. - </para> - <para> - The way most drivers approach this particular problem is to - create a structure holding the Z8530 device definition and - put that into the private field of the network device. The - network device fields of the channels then point back to the - network devices. - </para> - <para> - If you wish to use the generic HDLC then you need to register - the HDLC device. - </para> - <para> - Before you register your network device you will also need to - provide suitable handlers for most of the network device callbacks. - See the network device documentation for more details on this. - </para> - </chapter> - - <chapter id="Configuring_And_Activating_The_Port"> - <title>Configuring And Activating The Port</title> - <para> - The Z85230 driver provides helper functions and tables to load the - port registers on the Z8530 chips. When programming the register - settings for a channel be aware that the documentation recommends - initialisation orders. Strange things happen when these are not - followed. - </para> - <para> - <function>z8530_channel_load</function> takes an array of - pairs of initialisation values in an array of u8 type. The first - value is the Z8530 register number. Add 16 to indicate the alternate - register bank on the later chips. The array is terminated by a 255. - </para> - <para> - The driver provides a pair of public tables. The - z8530_hdlc_kilostream table is for the UK 'Kilostream' service and - also happens to cover most other end host configurations. The - z8530_hdlc_kilostream_85230 table is the same configuration using - the enhancements of the 85230 chip. The configuration loaded is - standard NRZ encoded synchronous data with HDLC bitstuffing. All - of the timing is taken from the other end of the link. - </para> - <para> - When writing your own tables be aware that the driver internally - tracks register values. It may need to reload values. You should - therefore be sure to set registers 1-7, 9-11, 14 and 15 in all - configurations. Where the register settings depend on DMA selection - the driver will update the bits itself when you open or close. - Loading a new table with the interface open is not recommended. - </para> - <para> - There are three standard configurations supported by the core - code. In PIO mode the interface is programmed up to use - interrupt driven PIO. This places high demands on the host processor - to avoid latency. The driver is written to take account of latency - issues but it cannot avoid latencies caused by other drivers, - notably IDE in PIO mode. Because the drivers allocate buffers you - must also prevent MTU changes while the port is open. - </para> - <para> - Once the port is open it will call the rx_function of each channel - whenever a completed packet arrived. This is invoked from - interrupt context and passes you the channel and a network - buffer (struct sk_buff) holding the data. The data includes - the CRC bytes so most users will want to trim the last two - bytes before processing the data. This function is very timing - critical. When you wish to simply discard data the support - code provides the function <function>z8530_null_rx</function> - to discard the data. - </para> - <para> - To active PIO mode sending and receiving the <function> - z8530_sync_open</function> is called. This expects to be passed - the network device and the channel. Typically this is called from - your network device open callback. On a failure a non zero error - status is returned. The <function>z8530_sync_close</function> - function shuts down a PIO channel. This must be done before the - channel is opened again and before the driver shuts down - and unloads. - </para> - <para> - The ideal mode of operation is dual channel DMA mode. Here the - kernel driver will configure the board for DMA in both directions. - The driver also handles ISA DMA issues such as controller - programming and the memory range limit for you. This mode is - activated by calling the <function>z8530_sync_dma_open</function> - function. On failure a non zero error value is returned. - Once this mode is activated it can be shut down by calling the - <function>z8530_sync_dma_close</function>. You must call the close - function matching the open mode you used. - </para> - <para> - The final supported mode uses a single DMA channel to drive the - transmit side. As the Z85C30 has a larger FIFO on the receive - channel this tends to increase the maximum speed a little. - This is activated by calling the <function>z8530_sync_txdma_open - </function>. This returns a non zero error code on failure. The - <function>z8530_sync_txdma_close</function> function closes down - the Z8530 interface from this mode. - </para> - </chapter> - - <chapter id="Network_Layer_Functions"> - <title>Network Layer Functions</title> - <para> - The Z8530 layer provides functions to queue packets for - transmission. The driver internally buffers the frame currently - being transmitted and one further frame (in order to keep back - to back transmission running). Any further buffering is up to - the caller. - </para> - <para> - The function <function>z8530_queue_xmit</function> takes a network - buffer in sk_buff format and queues it for transmission. The - caller must provide the entire packet with the exception of the - bitstuffing and CRC. This is normally done by the caller via - the generic HDLC interface layer. It returns 0 if the buffer has been - queued and non zero values for queue full. If the function accepts - the buffer it becomes property of the Z8530 layer and the caller - should not free it. - </para> - <para> - The function <function>z8530_get_stats</function> returns a pointer - to an internally maintained per interface statistics block. This - provides most of the interface code needed to implement the network - layer get_stats callback. - </para> - </chapter> - - <chapter id="Porting_The_Z8530_Driver"> - <title>Porting The Z8530 Driver</title> - <para> - The Z8530 driver is written to be portable. In DMA mode it makes - assumptions about the use of ISA DMA. These are probably warranted - in most cases as the Z85230 in particular was designed to glue to PC - type machines. The PIO mode makes no real assumptions. - </para> - <para> - Should you need to retarget the Z8530 driver to another architecture - the only code that should need changing are the port I/O functions. - At the moment these assume PC I/O port accesses. This may not be - appropriate for all platforms. Replacing - <function>z8530_read_port</function> and <function>z8530_write_port - </function> is intended to be all that is required to port this - driver layer. - </para> - </chapter> - - <chapter id="bugs"> - <title>Known Bugs And Assumptions</title> - <para> - <variablelist> - <varlistentry><term>Interrupt Locking</term> - <listitem> - <para> - The locking in the driver is done via the global cli/sti lock. This - makes for relatively poor SMP performance. Switching this to use a - per device spin lock would probably materially improve performance. - </para> - </listitem></varlistentry> - - <varlistentry><term>Occasional Failures</term> - <listitem> - <para> - We have reports of occasional failures when run for very long - periods of time and the driver starts to receive junk frames. At - the moment the cause of this is not clear. - </para> - </listitem></varlistentry> - </variablelist> - - </para> - </chapter> - - <chapter id="pubfunctions"> - <title>Public Functions Provided</title> -!Edrivers/net/wan/z85230.c - </chapter> - - <chapter id="intfunctions"> - <title>Internal Functions</title> -!Idrivers/net/wan/z85230.c - </chapter> - -</book> diff --git a/Documentation/Makefile b/Documentation/Makefile index c2a469112c37..a42320385df3 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -1 +1,126 @@ +# -*- makefile -*- +# Makefile for Sphinx documentation +# + subdir-y := + +# You can set these variables from the command line. +SPHINXBUILD = sphinx-build +SPHINXOPTS = +SPHINXDIRS = . +_SPHINXDIRS = $(patsubst $(srctree)/Documentation/%/conf.py,%,$(wildcard $(srctree)/Documentation/*/conf.py)) +SPHINX_CONF = conf.py +PAPER = +BUILDDIR = $(obj)/output +PDFLATEX = xelatex +LATEXOPTS = -interaction=batchmode + +# User-friendly check for sphinx-build +HAVE_SPHINX := $(shell if which $(SPHINXBUILD) >/dev/null 2>&1; then echo 1; else echo 0; fi) + +ifeq ($(HAVE_SPHINX),0) + +.DEFAULT: + $(warning The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed and in PATH, or set the SPHINXBUILD make variable to point to the full path of the '$(SPHINXBUILD)' executable.) + @echo " SKIP Sphinx $@ target." + +else # HAVE_SPHINX + +# User-friendly check for pdflatex +HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi) + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +KERNELDOC = $(srctree)/scripts/kernel-doc +KERNELDOC_CONF = -D kerneldoc_srctree=$(srctree) -D kerneldoc_bin=$(KERNELDOC) +ALLSPHINXOPTS = $(KERNELDOC_CONF) $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +# commands; the 'cmd' from scripts/Kbuild.include is not *loopable* +loop_cmd = $(echo-cmd) $(cmd_$(1)) || exit; + +# $2 sphinx builder e.g. "html" +# $3 name of the build subfolder / e.g. "media", used as: +# * dest folder relative to $(BUILDDIR) and +# * cache folder relative to $(BUILDDIR)/.doctrees +# $4 dest subfolder e.g. "man" for man pages at media/man +# $5 reST source folder relative to $(srctree)/$(src), +# e.g. "media" for the linux-tv book-set at ./Documentation/media + +quiet_cmd_sphinx = SPHINX $@ --> file://$(abspath $(BUILDDIR)/$3/$4) + cmd_sphinx = $(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media $2 && \ + PYTHONDONTWRITEBYTECODE=1 \ + BUILDDIR=$(abspath $(BUILDDIR)) SPHINX_CONF=$(abspath $(srctree)/$(src)/$5/$(SPHINX_CONF)) \ + $(SPHINXBUILD) \ + -b $2 \ + -c $(abspath $(srctree)/$(src)) \ + -d $(abspath $(BUILDDIR)/.doctrees/$3) \ + -D version=$(KERNELVERSION) -D release=$(KERNELRELEASE) \ + $(ALLSPHINXOPTS) \ + $(abspath $(srctree)/$(src)/$5) \ + $(abspath $(BUILDDIR)/$3/$4) + +htmldocs: + @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var))) + +linkcheckdocs: + @$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,linkcheck,$(var),,$(var))) + +latexdocs: + @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,latex,$(var),latex,$(var))) + +ifeq ($(HAVE_PDFLATEX),0) + +pdfdocs: + $(warning The '$(PDFLATEX)' command was not found. Make sure you have it installed and in PATH to produce PDF output.) + @echo " SKIP Sphinx $@ target." + +else # HAVE_PDFLATEX + +pdfdocs: latexdocs + $(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX=$(PDFLATEX) LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;) + +endif # HAVE_PDFLATEX + +epubdocs: + @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,epub,$(var),epub,$(var))) + +xmldocs: + @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,xml,$(var),xml,$(var))) + +endif # HAVE_SPHINX + +# The following targets are independent of HAVE_SPHINX, and the rules should +# work or silently pass without Sphinx. + +# no-ops for the Sphinx toolchain +sgmldocs: + @: +psdocs: + @: +mandocs: + @: +installmandocs: + @: + +cleandocs: + $(Q)rm -rf $(BUILDDIR) + $(Q)$(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media clean + +dochelp: + @echo ' Linux kernel internal documentation in different formats from ReST:' + @echo ' htmldocs - HTML' + @echo ' latexdocs - LaTeX' + @echo ' pdfdocs - PDF' + @echo ' epubdocs - EPUB' + @echo ' xmldocs - XML' + @echo ' linkcheckdocs - check for broken external links (will connect to external hosts)' + @echo ' cleandocs - clean all generated files' + @echo + @echo ' make SPHINXDIRS="s1 s2" [target] Generate only docs of folder s1, s2' + @echo ' valid values for SPHINXDIRS are: $(_SPHINXDIRS)' + @echo + @echo ' make SPHINX_CONF={conf-file} [target] use *additional* sphinx-build' + @echo ' configuration. This is e.g. useful to build with nit-picking config.' diff --git a/Documentation/Makefile.sphinx b/Documentation/Makefile.sphinx deleted file mode 100644 index bcf529f6cf9b..000000000000 --- a/Documentation/Makefile.sphinx +++ /dev/null @@ -1,130 +0,0 @@ -# -*- makefile -*- -# Makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXBUILD = sphinx-build -SPHINXOPTS = -SPHINXDIRS = . -_SPHINXDIRS = $(patsubst $(srctree)/Documentation/%/conf.py,%,$(wildcard $(srctree)/Documentation/*/conf.py)) -SPHINX_CONF = conf.py -PAPER = -BUILDDIR = $(obj)/output -PDFLATEX = xelatex -LATEXOPTS = -interaction=batchmode - -# User-friendly check for sphinx-build -HAVE_SPHINX := $(shell if which $(SPHINXBUILD) >/dev/null 2>&1; then echo 1; else echo 0; fi) - -ifeq ($(HAVE_SPHINX),0) - -.DEFAULT: - $(warning The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed and in PATH, or set the SPHINXBUILD make variable to point to the full path of the '$(SPHINXBUILD)' executable.) - @echo " SKIP Sphinx $@ target." - -else ifneq ($(DOCBOOKS),) - -# Skip Sphinx build if the user explicitly requested DOCBOOKS. -.DEFAULT: - @echo " SKIP Sphinx $@ target (DOCBOOKS specified)." - -else # HAVE_SPHINX - -# User-friendly check for pdflatex -HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi) - -# Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -KERNELDOC = $(srctree)/scripts/kernel-doc -KERNELDOC_CONF = -D kerneldoc_srctree=$(srctree) -D kerneldoc_bin=$(KERNELDOC) -ALLSPHINXOPTS = $(KERNELDOC_CONF) $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) -# the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . - -# commands; the 'cmd' from scripts/Kbuild.include is not *loopable* -loop_cmd = $(echo-cmd) $(cmd_$(1)) || exit; - -# $2 sphinx builder e.g. "html" -# $3 name of the build subfolder / e.g. "media", used as: -# * dest folder relative to $(BUILDDIR) and -# * cache folder relative to $(BUILDDIR)/.doctrees -# $4 dest subfolder e.g. "man" for man pages at media/man -# $5 reST source folder relative to $(srctree)/$(src), -# e.g. "media" for the linux-tv book-set at ./Documentation/media - -quiet_cmd_sphinx = SPHINX $@ --> file://$(abspath $(BUILDDIR)/$3/$4) - cmd_sphinx = $(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media $2 && \ - PYTHONDONTWRITEBYTECODE=1 \ - BUILDDIR=$(abspath $(BUILDDIR)) SPHINX_CONF=$(abspath $(srctree)/$(src)/$5/$(SPHINX_CONF)) \ - $(SPHINXBUILD) \ - -b $2 \ - -c $(abspath $(srctree)/$(src)) \ - -d $(abspath $(BUILDDIR)/.doctrees/$3) \ - -D version=$(KERNELVERSION) -D release=$(KERNELRELEASE) \ - $(ALLSPHINXOPTS) \ - $(abspath $(srctree)/$(src)/$5) \ - $(abspath $(BUILDDIR)/$3/$4) - -htmldocs: - @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var))) - -linkcheckdocs: - @$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,linkcheck,$(var),,$(var))) - -latexdocs: - @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,latex,$(var),latex,$(var))) - -ifeq ($(HAVE_PDFLATEX),0) - -pdfdocs: - $(warning The '$(PDFLATEX)' command was not found. Make sure you have it installed and in PATH to produce PDF output.) - @echo " SKIP Sphinx $@ target." - -else # HAVE_PDFLATEX - -pdfdocs: latexdocs - $(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX=$(PDFLATEX) LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;) - -endif # HAVE_PDFLATEX - -epubdocs: - @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,epub,$(var),epub,$(var))) - -xmldocs: - @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,xml,$(var),xml,$(var))) - -endif # HAVE_SPHINX - -# The following targets are independent of HAVE_SPHINX, and the rules should -# work or silently pass without Sphinx. - -# no-ops for the Sphinx toolchain -sgmldocs: - @: -psdocs: - @: -mandocs: - @: -installmandocs: - @: - -cleandocs: - $(Q)rm -rf $(BUILDDIR) - $(Q)$(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media clean - -dochelp: - @echo ' Linux kernel internal documentation in different formats (Sphinx):' - @echo ' htmldocs - HTML' - @echo ' latexdocs - LaTeX' - @echo ' pdfdocs - PDF' - @echo ' epubdocs - EPUB' - @echo ' xmldocs - XML' - @echo ' linkcheckdocs - check for broken external links (will connect to external hosts)' - @echo ' cleandocs - clean all generated files' - @echo - @echo ' make SPHINXDIRS="s1 s2" [target] Generate only docs of folder s1, s2' - @echo ' valid values for SPHINXDIRS are: $(_SPHINXDIRS)' - @echo - @echo ' make SPHINX_CONF={conf-file} [target] use *additional* sphinx-build' - @echo ' configuration. This is e.g. useful to build with nit-picking config.' diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 1e37138027a3..618e13d5e276 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -186,7 +186,7 @@ must disable interrupts while the lock is held. If the device sends a different interrupt, the driver will deadlock trying to recursively acquire the spinlock. Such deadlocks can be avoided by using spin_lock_irqsave() or spin_lock_irq() which disable local interrupts -and acquire the lock (see Documentation/DocBook/kernel-locking). +and acquire the lock (see Documentation/kernel-hacking/locking.rst). 4.5 How to tell whether MSI/MSI-X is enabled on a device diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst index b96e80f79e85..b5343c5aa224 100644 --- a/Documentation/admin-guide/README.rst +++ b/Documentation/admin-guide/README.rst @@ -55,12 +55,6 @@ Documentation contains information about the problems, which may result by upgrading your kernel. - - The Documentation/DocBook/ subdirectory contains several guides for - kernel developers and users. These guides can be rendered in a - number of formats: PostScript (.ps), PDF, HTML, & man-pages, among others. - After installation, ``make psdocs``, ``make pdfdocs``, ``make htmldocs``, - or ``make mandocs`` will render the documentation in the requested format. - Installing the kernel source ---------------------------- diff --git a/Documentation/conf.py b/Documentation/conf.py index bacf9d337c89..77d47bb1df1d 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -281,6 +281,7 @@ latex_elements = { \\definecolor{NoteColor}{RGB}{204,255,255} \\definecolor{WarningColor}{RGB}{255,204,204} \\definecolor{AttentionColor}{RGB}{255,255,204} + \\definecolor{ImportantColor}{RGB}{192,255,204} \\definecolor{OtherColor}{RGB}{204,204,204} \\newlength{\\mynoticelength} \\makeatletter\\newenvironment{coloredbox}[1]{% @@ -301,7 +302,12 @@ latex_elements = { \\ifthenelse% {\\equal{\\py@noticetype}{attention}}% {\\colorbox{AttentionColor}{\\usebox{\\@tempboxa}}}% - {\\colorbox{OtherColor}{\\usebox{\\@tempboxa}}}% + {% + \\ifthenelse% + {\\equal{\\py@noticetype}{important}}% + {\\colorbox{ImportantColor}{\\usebox{\\@tempboxa}}}% + {\\colorbox{OtherColor}{\\usebox{\\@tempboxa}}}% + }% }% }% }\\makeatother @@ -339,27 +345,42 @@ if major == 1 and minor > 3: # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). +# Sorted in alphabetical order latex_documents = [ - ('doc-guide/index', 'kernel-doc-guide.tex', 'Linux Kernel Documentation Guide', - 'The kernel development community', 'manual'), ('admin-guide/index', 'linux-user.tex', 'Linux Kernel User Documentation', 'The kernel development community', 'manual'), ('core-api/index', 'core-api.tex', 'The kernel core API manual', 'The kernel development community', 'manual'), - ('driver-api/index', 'driver-api.tex', 'The kernel driver API manual', + ('crypto/index', 'crypto-api.tex', 'Linux Kernel Crypto API manual', 'The kernel development community', 'manual'), - ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', + ('dev-tools/index', 'dev-tools.tex', 'Development tools for the Kernel', 'The kernel development community', 'manual'), - ('kernel-documentation', 'kernel-documentation.tex', 'The Linux Kernel Documentation', + ('doc-guide/index', 'kernel-doc-guide.tex', 'Linux Kernel Documentation Guide', 'The kernel development community', 'manual'), - ('process/index', 'development-process.tex', 'Linux Kernel Development Documentation', + ('driver-api/index', 'driver-api.tex', 'The kernel driver API manual', + 'The kernel development community', 'manual'), + ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API', 'The kernel development community', 'manual'), ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide', 'The kernel development community', 'manual'), + ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', + 'The kernel development community', 'manual'), + ('kernel-hacking/index', 'kernel-hacking.tex', 'Unreliable Guide To Hacking The Linux Kernel', + 'The kernel development community', 'manual'), ('media/index', 'media.tex', 'Linux Media Subsystem Documentation', 'The kernel development community', 'manual'), + ('networking/index', 'networking.tex', 'Linux Networking Documentation', + 'The kernel development community', 'manual'), + ('process/index', 'development-process.tex', 'Linux Kernel Development Documentation', + 'The kernel development community', 'manual'), ('security/index', 'security.tex', 'The kernel security subsystem manual', 'The kernel development community', 'manual'), + ('sh/index', 'sh.tex', 'SuperH architecture implementation manual', + 'The kernel development community', 'manual'), + ('sound/index', 'sound.tex', 'Linux Sound Subsystem Documentation', + 'The kernel development community', 'manual'), + ('userspace-api/index', 'userspace-api.tex', 'The Linux kernel user-space API guide', + 'The kernel development community', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 62abd36bfffb..0606be3a3111 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -19,6 +19,7 @@ Core utilities workqueue genericirq flexible-arrays + librs Interfaces for kernel debugging =============================== diff --git a/Documentation/core-api/librs.rst b/Documentation/core-api/librs.rst new file mode 100644 index 000000000000..6010f5bc5bf9 --- /dev/null +++ b/Documentation/core-api/librs.rst @@ -0,0 +1,212 @@ +========================================== +Reed-Solomon Library Programming Interface +========================================== + +:Author: Thomas Gleixner + +Introduction +============ + +The generic Reed-Solomon Library provides encoding, decoding and error +correction functions. + +Reed-Solomon codes are used in communication and storage applications to +ensure data integrity. + +This documentation is provided for developers who want to utilize the +functions provided by the library. + +Known Bugs And Assumptions +========================== + +None. + +Usage +===== + +This chapter provides examples of how to use the library. + +Initializing +------------ + +The init function init_rs returns a pointer to an rs decoder structure, +which holds the necessary information for encoding, decoding and error +correction with the given polynomial. It either uses an existing +matching decoder or creates a new one. On creation all the lookup tables +for fast en/decoding are created. The function may take a while, so make +sure not to call it in critical code paths. + +:: + + /* the Reed Solomon control structure */ + static struct rs_control *rs_decoder; + + /* Symbolsize is 10 (bits) + * Primitive polynomial is x^10+x^3+1 + * first consecutive root is 0 + * primitive element to generate roots = 1 + * generator polynomial degree (number of roots) = 6 + */ + rs_decoder = init_rs (10, 0x409, 0, 1, 6); + + +Encoding +-------- + +The encoder calculates the Reed-Solomon code over the given data length +and stores the result in the parity buffer. Note that the parity buffer +must be initialized before calling the encoder. + +The expanded data can be inverted on the fly by providing a non-zero +inversion mask. The expanded data is XOR'ed with the mask. This is used +e.g. for FLASH ECC, where the all 0xFF is inverted to an all 0x00. The +Reed-Solomon code for all 0x00 is all 0x00. The code is inverted before +storing to FLASH so it is 0xFF too. This prevents that reading from an +erased FLASH results in ECC errors. + +The databytes are expanded to the given symbol size on the fly. There is +no support for encoding continuous bitstreams with a symbol size != 8 at +the moment. If it is necessary it should be not a big deal to implement +such functionality. + +:: + + /* Parity buffer. Size = number of roots */ + uint16_t par[6]; + /* Initialize the parity buffer */ + memset(par, 0, sizeof(par)); + /* Encode 512 byte in data8. Store parity in buffer par */ + encode_rs8 (rs_decoder, data8, 512, par, 0); + + +Decoding +-------- + +The decoder calculates the syndrome over the given data length and the +received parity symbols and corrects errors in the data. + +If a syndrome is available from a hardware decoder then the syndrome +calculation is skipped. + +The correction of the data buffer can be suppressed by providing a +correction pattern buffer and an error location buffer to the decoder. +The decoder stores the calculated error location and the correction +bitmask in the given buffers. This is useful for hardware decoders which +use a weird bit ordering scheme. + +The databytes are expanded to the given symbol size on the fly. There is +no support for decoding continuous bitstreams with a symbolsize != 8 at +the moment. If it is necessary it should be not a big deal to implement +such functionality. + +Decoding with syndrome calculation, direct data correction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:: + + /* Parity buffer. Size = number of roots */ + uint16_t par[6]; + uint8_t data[512]; + int numerr; + /* Receive data */ + ..... + /* Receive parity */ + ..... + /* Decode 512 byte in data8.*/ + numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL); + + +Decoding with syndrome given by hardware decoder, direct data correction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:: + + /* Parity buffer. Size = number of roots */ + uint16_t par[6], syn[6]; + uint8_t data[512]; + int numerr; + /* Receive data */ + ..... + /* Receive parity */ + ..... + /* Get syndrome from hardware decoder */ + ..... + /* Decode 512 byte in data8.*/ + numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL); + + +Decoding with syndrome given by hardware decoder, no direct data correction. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Note: It's not necessary to give data and received parity to the +decoder. + +:: + + /* Parity buffer. Size = number of roots */ + uint16_t par[6], syn[6], corr[8]; + uint8_t data[512]; + int numerr, errpos[8]; + /* Receive data */ + ..... + /* Receive parity */ + ..... + /* Get syndrome from hardware decoder */ + ..... + /* Decode 512 byte in data8.*/ + numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr); + for (i = 0; i < numerr; i++) { + do_error_correction_in_your_buffer(errpos[i], corr[i]); + } + + +Cleanup +------- + +The function free_rs frees the allocated resources, if the caller is +the last user of the decoder. + +:: + + /* Release resources */ + free_rs(rs_decoder); + + +Structures +========== + +This chapter contains the autogenerated documentation of the structures +which are used in the Reed-Solomon Library and are relevant for a +developer. + +.. kernel-doc:: include/linux/rslib.h + :internal: + +Public Functions Provided +========================= + +This chapter contains the autogenerated documentation of the +Reed-Solomon functions which are exported. + +.. kernel-doc:: lib/reed_solomon/reed_solomon.c + :export: + +Credits +======= + +The library code for encoding and decoding was written by Phil Karn. + +:: + + Copyright 2002, Phil Karn, KA9Q + May be used under the terms of the GNU General Public License (GPL) + + +The wrapper functions and interfaces are written by Thomas Gleixner. + +Many users have provided bugfixes, improvements and helping hands for +testing. Thanks a lot. + +The following people have contributed to this document: + +Thomas Gleixner\ tglx@linutronix.de diff --git a/Documentation/crypto/conf.py b/Documentation/crypto/conf.py new file mode 100644 index 000000000000..4335d251ddf3 --- /dev/null +++ b/Documentation/crypto/conf.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8; mode: python -*- + +project = 'Linux Kernel Crypto API' + +tags.add("subproject") + +latex_documents = [ + ('index', 'crypto-api.tex', 'Linux Kernel Crypto API manual', + 'The kernel development community', 'manual'), +] diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst index 07d881147ef3..4ac991dbddb7 100644 --- a/Documentation/dev-tools/index.rst +++ b/Documentation/dev-tools/index.rst @@ -23,6 +23,7 @@ whole; patches welcome! kmemleak kmemcheck gdb-kernel-debugging + kgdb .. only:: subproject and html diff --git a/Documentation/dev-tools/kgdb.rst b/Documentation/dev-tools/kgdb.rst new file mode 100644 index 000000000000..75273203a35a --- /dev/null +++ b/Documentation/dev-tools/kgdb.rst @@ -0,0 +1,907 @@ +================================================= +Using kgdb, kdb and the kernel debugger internals +================================================= + +:Author: Jason Wessel + +Introduction +============ + +The kernel has two different debugger front ends (kdb and kgdb) which +interface to the debug core. It is possible to use either of the +debugger front ends and dynamically transition between them if you +configure the kernel properly at compile and runtime. + +Kdb is simplistic shell-style interface which you can use on a system +console with a keyboard or serial console. You can use it to inspect +memory, registers, process lists, dmesg, and even set breakpoints to +stop in a certain location. Kdb is not a source level debugger, although +you can set breakpoints and execute some basic kernel run control. Kdb +is mainly aimed at doing some analysis to aid in development or +diagnosing kernel problems. You can access some symbols by name in +kernel built-ins or in kernel modules if the code was built with +``CONFIG_KALLSYMS``. + +Kgdb is intended to be used as a source level debugger for the Linux +kernel. It is used along with gdb to debug a Linux kernel. The +expectation is that gdb can be used to "break in" to the kernel to +inspect memory, variables and look through call stack information +similar to the way an application developer would use gdb to debug an +application. It is possible to place breakpoints in kernel code and +perform some limited execution stepping. + +Two machines are required for using kgdb. One of these machines is a +development machine and the other is the target machine. The kernel to +be debugged runs on the target machine. The development machine runs an +instance of gdb against the vmlinux file which contains the symbols (not +a boot image such as bzImage, zImage, uImage...). In gdb the developer +specifies the connection parameters and connects to kgdb. The type of +connection a developer makes with gdb depends on the availability of +kgdb I/O modules compiled as built-ins or loadable kernel modules in the +test machine's kernel. + +Compiling a kernel +================== + +- In order to enable compilation of kdb, you must first enable kgdb. + +- The kgdb test compile options are described in the kgdb test suite + chapter. + +Kernel config options for kgdb +------------------------------ + +To enable ``CONFIG_KGDB`` you should look under +:menuselection:`Kernel hacking --> Kernel debugging` and select +:menuselection:`KGDB: kernel debugger`. + +While it is not a hard requirement that you have symbols in your vmlinux +file, gdb tends not to be very useful without the symbolic data, so you +will want to turn on ``CONFIG_DEBUG_INFO`` which is called +:menuselection:`Compile the kernel with debug info` in the config menu. + +It is advised, but not required, that you turn on the +``CONFIG_FRAME_POINTER`` kernel option which is called :menuselection:`Compile +the kernel with frame pointers` in the config menu. This option inserts code +to into the compiled executable which saves the frame information in +registers or on the stack at different points which allows a debugger +such as gdb to more accurately construct stack back traces while +debugging the kernel. + +If the architecture that you are using supports the kernel option +``CONFIG_STRICT_KERNEL_RWX``, you should consider turning it off. This +option will prevent the use of software breakpoints because it marks +certain regions of the kernel's memory space as read-only. If kgdb +supports it for the architecture you are using, you can use hardware +breakpoints if you desire to run with the ``CONFIG_STRICT_KERNEL_RWX`` +option turned on, else you need to turn off this option. + +Next you should choose one of more I/O drivers to interconnect debugging +host and debugged target. Early boot debugging requires a KGDB I/O +driver that supports early debugging and the driver must be built into +the kernel directly. Kgdb I/O driver configuration takes place via +kernel or module parameters which you can learn more about in the in the +section that describes the parameter kgdboc. + +Here is an example set of ``.config`` symbols to enable or disable for kgdb:: + + # CONFIG_STRICT_KERNEL_RWX is not set + CONFIG_FRAME_POINTER=y + CONFIG_KGDB=y + CONFIG_KGDB_SERIAL_CONSOLE=y + +Kernel config options for kdb +----------------------------- + +Kdb is quite a bit more complex than the simple gdbstub sitting on top +of the kernel's debug core. Kdb must implement a shell, and also adds +some helper functions in other parts of the kernel, responsible for +printing out interesting data such as what you would see if you ran +``lsmod``, or ``ps``. In order to build kdb into the kernel you follow the +same steps as you would for kgdb. + +The main config option for kdb is ``CONFIG_KGDB_KDB`` which is called +:menuselection:`KGDB_KDB: include kdb frontend for kgdb` in the config menu. +In theory you would have already also selected an I/O driver such as the +``CONFIG_KGDB_SERIAL_CONSOLE`` interface if you plan on using kdb on a +serial port, when you were configuring kgdb. + +If you want to use a PS/2-style keyboard with kdb, you would select +``CONFIG_KDB_KEYBOARD`` which is called :menuselection:`KGDB_KDB: keyboard as +input device` in the config menu. The ``CONFIG_KDB_KEYBOARD`` option is not +used for anything in the gdb interface to kgdb. The ``CONFIG_KDB_KEYBOARD`` +option only works with kdb. + +Here is an example set of ``.config`` symbols to enable/disable kdb:: + + # CONFIG_STRICT_KERNEL_RWX is not set + CONFIG_FRAME_POINTER=y + CONFIG_KGDB=y + CONFIG_KGDB_SERIAL_CONSOLE=y + CONFIG_KGDB_KDB=y + CONFIG_KDB_KEYBOARD=y + +Kernel Debugger Boot Arguments +============================== + +This section describes the various runtime kernel parameters that affect +the configuration of the kernel debugger. The following chapter covers +using kdb and kgdb as well as providing some examples of the +configuration parameters. + +Kernel parameter: kgdboc +------------------------ + +The kgdboc driver was originally an abbreviation meant to stand for +"kgdb over console". Today it is the primary mechanism to configure how +to communicate from gdb to kgdb as well as the devices you want to use +to interact with the kdb shell. + +For kgdb/gdb, kgdboc is designed to work with a single serial port. It +is intended to cover the circumstance where you want to use a serial +console as your primary console as well as using it to perform kernel +debugging. It is also possible to use kgdb on a serial port which is not +designated as a system console. Kgdboc may be configured as a kernel +built-in or a kernel loadable module. You can only make use of +``kgdbwait`` and early debugging if you build kgdboc into the kernel as +a built-in. + +Optionally you can elect to activate kms (Kernel Mode Setting) +integration. When you use kms with kgdboc and you have a video driver +that has atomic mode setting hooks, it is possible to enter the debugger +on the graphics console. When the kernel execution is resumed, the +previous graphics mode will be restored. This integration can serve as a +useful tool to aid in diagnosing crashes or doing analysis of memory +with kdb while allowing the full graphics console applications to run. + +kgdboc arguments +~~~~~~~~~~~~~~~~ + +Usage:: + + kgdboc=[kms][[,]kbd][[,]serial_device][,baud] + +The order listed above must be observed if you use any of the optional +configurations together. + +Abbreviations: + +- kms = Kernel Mode Setting + +- kbd = Keyboard + +You can configure kgdboc to use the keyboard, and/or a serial device +depending on if you are using kdb and/or kgdb, in one of the following +scenarios. The order listed above must be observed if you use any of the +optional configurations together. Using kms + only gdb is generally not +a useful combination. + +Using loadable module or built-in +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. As a kernel built-in: + + Use the kernel boot argument:: + + kgdboc=<tty-device>,[baud] + +2. As a kernel loadable module: + + Use the command:: + + modprobe kgdboc kgdboc=<tty-device>,[baud] + + Here are two examples of how you might format the kgdboc string. The + first is for an x86 target using the first serial port. The second + example is for the ARM Versatile AB using the second serial port. + + 1. ``kgdboc=ttyS0,115200`` + + 2. ``kgdboc=ttyAMA1,115200`` + +Configure kgdboc at runtime with sysfs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +At run time you can enable or disable kgdboc by echoing a parameters +into the sysfs. Here are two examples: + +1. Enable kgdboc on ttyS0:: + + echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc + +2. Disable kgdboc:: + + echo "" > /sys/module/kgdboc/parameters/kgdboc + +.. note:: + + You do not need to specify the baud if you are configuring the + console on tty which is already configured or open. + +More examples +^^^^^^^^^^^^^ + +You can configure kgdboc to use the keyboard, and/or a serial device +depending on if you are using kdb and/or kgdb, in one of the following +scenarios. + +1. kdb and kgdb over only a serial port:: + + kgdboc=<serial_device>[,baud] + + Example:: + + kgdboc=ttyS0,115200 + +2. kdb and kgdb with keyboard and a serial port:: + + kgdboc=kbd,<serial_device>[,baud] + + Example:: + + kgdboc=kbd,ttyS0,115200 + +3. kdb with a keyboard:: + + kgdboc=kbd + +4. kdb with kernel mode setting:: + + kgdboc=kms,kbd + +5. kdb with kernel mode setting and kgdb over a serial port:: + + kgdboc=kms,kbd,ttyS0,115200 + +.. note:: + + Kgdboc does not support interrupting the target via the gdb remote + protocol. You must manually send a :kbd:`SysRq-G` unless you have a proxy + that splits console output to a terminal program. A console proxy has a + separate TCP port for the debugger and a separate TCP port for the + "human" console. The proxy can take care of sending the :kbd:`SysRq-G` + for you. + +When using kgdboc with no debugger proxy, you can end up connecting the +debugger at one of two entry points. If an exception occurs after you +have loaded kgdboc, a message should print on the console stating it is +waiting for the debugger. In this case you disconnect your terminal +program and then connect the debugger in its place. If you want to +interrupt the target system and forcibly enter a debug session you have +to issue a :kbd:`Sysrq` sequence and then type the letter :kbd:`g`. Then you +disconnect the terminal session and connect gdb. Your options if you +don't like this are to hack gdb to send the :kbd:`SysRq-G` for you as well as +on the initial connect, or to use a debugger proxy that allows an +unmodified gdb to do the debugging. + +Kernel parameter: ``kgdbwait`` +------------------------------ + +The Kernel command line option ``kgdbwait`` makes kgdb wait for a +debugger connection during booting of a kernel. You can only use this +option if you compiled a kgdb I/O driver into the kernel and you +specified the I/O driver configuration as a kernel command line option. +The kgdbwait parameter should always follow the configuration parameter +for the kgdb I/O driver in the kernel command line else the I/O driver +will not be configured prior to asking the kernel to use it to wait. + +The kernel will stop and wait as early as the I/O driver and +architecture allows when you use this option. If you build the kgdb I/O +driver as a loadable kernel module kgdbwait will not do anything. + +Kernel parameter: ``kgdbcon`` +----------------------------- + +The ``kgdbcon`` feature allows you to see :c:func:`printk` messages inside gdb +while gdb is connected to the kernel. Kdb does not make use of the kgdbcon +feature. + +Kgdb supports using the gdb serial protocol to send console messages to +the debugger when the debugger is connected and running. There are two +ways to activate this feature. + +1. Activate with the kernel command line option:: + + kgdbcon + +2. Use sysfs before configuring an I/O driver:: + + echo 1 > /sys/module/kgdb/parameters/kgdb_use_con + +.. note:: + + If you do this after you configure the kgdb I/O driver, the + setting will not take effect until the next point the I/O is + reconfigured. + +.. important:: + + You cannot use kgdboc + kgdbcon on a tty that is an + active system console. An example of incorrect usage is:: + + console=ttyS0,115200 kgdboc=ttyS0 kgdbcon + +It is possible to use this option with kgdboc on a tty that is not a +system console. + +Run time parameter: ``kgdbreboot`` +---------------------------------- + +The kgdbreboot feature allows you to change how the debugger deals with +the reboot notification. You have 3 choices for the behavior. The +default behavior is always set to 0. + +.. tabularcolumns:: |p{0.4cm}|p{11.5cm}|p{5.6cm}| + +.. flat-table:: + :widths: 1 10 8 + + * - 1 + - ``echo -1 > /sys/module/debug_core/parameters/kgdbreboot`` + - Ignore the reboot notification entirely. + + * - 2 + - ``echo 0 > /sys/module/debug_core/parameters/kgdbreboot`` + - Send the detach message to any attached debugger client. + + * - 3 + - ``echo 1 > /sys/module/debug_core/parameters/kgdbreboot`` + - Enter the debugger on reboot notify. + +Using kdb +========= + +Quick start for kdb on a serial port +------------------------------------ + +This is a quick example of how to use kdb. + +1. Configure kgdboc at boot using kernel parameters:: + + console=ttyS0,115200 kgdboc=ttyS0,115200 + + OR + + Configure kgdboc after the kernel has booted; assuming you are using + a serial port console:: + + echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc + +2. Enter the kernel debugger manually or by waiting for an oops or + fault. There are several ways you can enter the kernel debugger + manually; all involve using the :kbd:`SysRq-G`, which means you must have + enabled ``CONFIG_MAGIC_SysRq=y`` in your kernel config. + + - When logged in as root or with a super user session you can run:: + + echo g > /proc/sysrq-trigger + + - Example using minicom 2.2 + + Press: :kbd:`CTRL-A` :kbd:`f` :kbd:`g` + + - When you have telneted to a terminal server that supports sending + a remote break + + Press: :kbd:`CTRL-]` + + Type in: ``send break`` + + Press: :kbd:`Enter` :kbd:`g` + +3. From the kdb prompt you can run the ``help`` command to see a complete + list of the commands that are available. + + Some useful commands in kdb include: + + =========== ================================================================= + ``lsmod`` Shows where kernel modules are loaded + ``ps`` Displays only the active processes + ``ps A`` Shows all the processes + ``summary`` Shows kernel version info and memory usage + ``bt`` Get a backtrace of the current process using :c:func:`dump_stack` + ``dmesg`` View the kernel syslog buffer + ``go`` Continue the system + =========== ================================================================= + +4. When you are done using kdb you need to consider rebooting the system + or using the ``go`` command to resuming normal kernel execution. If you + have paused the kernel for a lengthy period of time, applications + that rely on timely networking or anything to do with real wall clock + time could be adversely affected, so you should take this into + consideration when using the kernel debugger. + +Quick start for kdb using a keyboard connected console +------------------------------------------------------ + +This is a quick example of how to use kdb with a keyboard. + +1. Configure kgdboc at boot using kernel parameters:: + + kgdboc=kbd + + OR + + Configure kgdboc after the kernel has booted:: + + echo kbd > /sys/module/kgdboc/parameters/kgdboc + +2. Enter the kernel debugger manually or by waiting for an oops or + fault. There are several ways you can enter the kernel debugger + manually; all involve using the :kbd:`SysRq-G`, which means you must have + enabled ``CONFIG_MAGIC_SysRq=y`` in your kernel config. + + - When logged in as root or with a super user session you can run:: + + echo g > /proc/sysrq-trigger + + - Example using a laptop keyboard: + + Press and hold down: :kbd:`Alt` + + Press and hold down: :kbd:`Fn` + + Press and release the key with the label: :kbd:`SysRq` + + Release: :kbd:`Fn` + + Press and release: :kbd:`g` + + Release: :kbd:`Alt` + + - Example using a PS/2 101-key keyboard + + Press and hold down: :kbd:`Alt` + + Press and release the key with the label: :kbd:`SysRq` + + Press and release: :kbd:`g` + + Release: :kbd:`Alt` + +3. Now type in a kdb command such as ``help``, ``dmesg``, ``bt`` or ``go`` to + continue kernel execution. + +Using kgdb / gdb +================ + +In order to use kgdb you must activate it by passing configuration +information to one of the kgdb I/O drivers. If you do not pass any +configuration information kgdb will not do anything at all. Kgdb will +only actively hook up to the kernel trap hooks if a kgdb I/O driver is +loaded and configured. If you unconfigure a kgdb I/O driver, kgdb will +unregister all the kernel hook points. + +All kgdb I/O drivers can be reconfigured at run time, if +``CONFIG_SYSFS`` and ``CONFIG_MODULES`` are enabled, by echo'ing a new +config string to ``/sys/module/<driver>/parameter/<option>``. The driver +can be unconfigured by passing an empty string. You cannot change the +configuration while the debugger is attached. Make sure to detach the +debugger with the ``detach`` command prior to trying to unconfigure a +kgdb I/O driver. + +Connecting with gdb to a serial port +------------------------------------ + +1. Configure kgdboc + + Configure kgdboc at boot using kernel parameters:: + + kgdboc=ttyS0,115200 + + OR + + Configure kgdboc after the kernel has booted:: + + echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc + +2. Stop kernel execution (break into the debugger) + + In order to connect to gdb via kgdboc, the kernel must first be + stopped. There are several ways to stop the kernel which include + using kgdbwait as a boot argument, via a :kbd:`SysRq-G`, or running the + kernel until it takes an exception where it waits for the debugger to + attach. + + - When logged in as root or with a super user session you can run:: + + echo g > /proc/sysrq-trigger + + - Example using minicom 2.2 + + Press: :kbd:`CTRL-A` :kbd:`f` :kbd:`g` + + - When you have telneted to a terminal server that supports sending + a remote break + + Press: :kbd:`CTRL-]` + + Type in: ``send break`` + + Press: :kbd:`Enter` :kbd:`g` + +3. Connect from gdb + + Example (using a directly connected port):: + + % gdb ./vmlinux + (gdb) set remotebaud 115200 + (gdb) target remote /dev/ttyS0 + + + Example (kgdb to a terminal server on TCP port 2012):: + + % gdb ./vmlinux + (gdb) target remote 192.168.2.2:2012 + + + Once connected, you can debug a kernel the way you would debug an + application program. + + If you are having problems connecting or something is going seriously + wrong while debugging, it will most often be the case that you want + to enable gdb to be verbose about its target communications. You do + this prior to issuing the ``target remote`` command by typing in:: + + set debug remote 1 + +Remember if you continue in gdb, and need to "break in" again, you need +to issue an other :kbd:`SysRq-G`. It is easy to create a simple entry point by +putting a breakpoint at ``sys_sync`` and then you can run ``sync`` from a +shell or script to break into the debugger. + +kgdb and kdb interoperability +============================= + +It is possible to transition between kdb and kgdb dynamically. The debug +core will remember which you used the last time and automatically start +in the same mode. + +Switching between kdb and kgdb +------------------------------ + +Switching from kgdb to kdb +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There are two ways to switch from kgdb to kdb: you can use gdb to issue +a maintenance packet, or you can blindly type the command ``$3#33``. +Whenever the kernel debugger stops in kgdb mode it will print the +message ``KGDB or $3#33 for KDB``. It is important to note that you have +to type the sequence correctly in one pass. You cannot type a backspace +or delete because kgdb will interpret that as part of the debug stream. + +1. Change from kgdb to kdb by blindly typing:: + + $3#33 + +2. Change from kgdb to kdb with gdb:: + + maintenance packet 3 + + .. note:: + + Now you must kill gdb. Typically you press :kbd:`CTRL-Z` and issue + the command:: + + kill -9 % + +Change from kdb to kgdb +~~~~~~~~~~~~~~~~~~~~~~~ + +There are two ways you can change from kdb to kgdb. You can manually +enter kgdb mode by issuing the kgdb command from the kdb shell prompt, +or you can connect gdb while the kdb shell prompt is active. The kdb +shell looks for the typical first commands that gdb would issue with the +gdb remote protocol and if it sees one of those commands it +automatically changes into kgdb mode. + +1. From kdb issue the command:: + + kgdb + + Now disconnect your terminal program and connect gdb in its place + +2. At the kdb prompt, disconnect the terminal program and connect gdb in + its place. + +Running kdb commands from gdb +----------------------------- + +It is possible to run a limited set of kdb commands from gdb, using the +gdb monitor command. You don't want to execute any of the run control or +breakpoint operations, because it can disrupt the state of the kernel +debugger. You should be using gdb for breakpoints and run control +operations if you have gdb connected. The more useful commands to run +are things like lsmod, dmesg, ps or possibly some of the memory +information commands. To see all the kdb commands you can run +``monitor help``. + +Example:: + + (gdb) monitor ps + 1 idle process (state I) and + 27 sleeping system daemon (state M) processes suppressed, + use 'ps A' to see all. + Task Addr Pid Parent [*] cpu State Thread Command + + 0xc78291d0 1 0 0 0 S 0xc7829404 init + 0xc7954150 942 1 0 0 S 0xc7954384 dropbear + 0xc78789c0 944 1 0 0 S 0xc7878bf4 sh + (gdb) + +kgdb Test Suite +=============== + +When kgdb is enabled in the kernel config you can also elect to enable +the config parameter ``KGDB_TESTS``. Turning this on will enable a special +kgdb I/O module which is designed to test the kgdb internal functions. + +The kgdb tests are mainly intended for developers to test the kgdb +internals as well as a tool for developing a new kgdb architecture +specific implementation. These tests are not really for end users of the +Linux kernel. The primary source of documentation would be to look in +the ``drivers/misc/kgdbts.c`` file. + +The kgdb test suite can also be configured at compile time to run the +core set of tests by setting the kernel config parameter +``KGDB_TESTS_ON_BOOT``. This particular option is aimed at automated +regression testing and does not require modifying the kernel boot config +arguments. If this is turned on, the kgdb test suite can be disabled by +specifying ``kgdbts=`` as a kernel boot argument. + +Kernel Debugger Internals +========================= + +Architecture Specifics +---------------------- + +The kernel debugger is organized into a number of components: + +1. The debug core + + The debug core is found in ``kernel/debugger/debug_core.c``. It + contains: + + - A generic OS exception handler which includes sync'ing the + processors into a stopped state on an multi-CPU system. + + - The API to talk to the kgdb I/O drivers + + - The API to make calls to the arch-specific kgdb implementation + + - The logic to perform safe memory reads and writes to memory while + using the debugger + + - A full implementation for software breakpoints unless overridden + by the arch + + - The API to invoke either the kdb or kgdb frontend to the debug + core. + + - The structures and callback API for atomic kernel mode setting. + + .. note:: kgdboc is where the kms callbacks are invoked. + +2. kgdb arch-specific implementation + + This implementation is generally found in ``arch/*/kernel/kgdb.c``. As + an example, ``arch/x86/kernel/kgdb.c`` contains the specifics to + implement HW breakpoint as well as the initialization to dynamically + register and unregister for the trap handlers on this architecture. + The arch-specific portion implements: + + - contains an arch-specific trap catcher which invokes + :c:func:`kgdb_handle_exception` to start kgdb about doing its work + + - translation to and from gdb specific packet format to :c:type:`pt_regs` + + - Registration and unregistration of architecture specific trap + hooks + + - Any special exception handling and cleanup + + - NMI exception handling and cleanup + + - (optional) HW breakpoints + +3. gdbstub frontend (aka kgdb) + + The gdbstub is located in ``kernel/debug/gdbstub.c``. It contains: + + - All the logic to implement the gdb serial protocol + +4. kdb frontend + + The kdb debugger shell is broken down into a number of components. + The kdb core is located in kernel/debug/kdb. There are a number of + helper functions in some of the other kernel components to make it + possible for kdb to examine and report information about the kernel + without taking locks that could cause a kernel deadlock. The kdb core + contains implements the following functionality. + + - A simple shell + + - The kdb core command set + + - A registration API to register additional kdb shell commands. + + - A good example of a self-contained kdb module is the ``ftdump`` + command for dumping the ftrace buffer. See: + ``kernel/trace/trace_kdb.c`` + + - For an example of how to dynamically register a new kdb command + you can build the kdb_hello.ko kernel module from + ``samples/kdb/kdb_hello.c``. To build this example you can set + ``CONFIG_SAMPLES=y`` and ``CONFIG_SAMPLE_KDB=m`` in your kernel + config. Later run ``modprobe kdb_hello`` and the next time you + enter the kdb shell, you can run the ``hello`` command. + + - The implementation for :c:func:`kdb_printf` which emits messages directly + to I/O drivers, bypassing the kernel log. + + - SW / HW breakpoint management for the kdb shell + +5. kgdb I/O driver + + Each kgdb I/O driver has to provide an implementation for the + following: + + - configuration via built-in or module + + - dynamic configuration and kgdb hook registration calls + + - read and write character interface + + - A cleanup handler for unconfiguring from the kgdb core + + - (optional) Early debug methodology + + Any given kgdb I/O driver has to operate very closely with the + hardware and must do it in such a way that does not enable interrupts + or change other parts of the system context without completely + restoring them. The kgdb core will repeatedly "poll" a kgdb I/O + driver for characters when it needs input. The I/O driver is expected + to return immediately if there is no data available. Doing so allows + for the future possibility to touch watchdog hardware in such a way + as to have a target system not reset when these are enabled. + +If you are intent on adding kgdb architecture specific support for a new +architecture, the architecture should define ``HAVE_ARCH_KGDB`` in the +architecture specific Kconfig file. This will enable kgdb for the +architecture, and at that point you must create an architecture specific +kgdb implementation. + +There are a few flags which must be set on every architecture in their +``asm/kgdb.h`` file. These are: + +- ``NUMREGBYTES``: + The size in bytes of all of the registers, so that we + can ensure they will all fit into a packet. + +- ``BUFMAX``: + The size in bytes of the buffer GDB will read into. This must + be larger than NUMREGBYTES. + +- ``CACHE_FLUSH_IS_SAFE``: + Set to 1 if it is always safe to call + flush_cache_range or flush_icache_range. On some architectures, + these functions may not be safe to call on SMP since we keep other + CPUs in a holding pattern. + +There are also the following functions for the common backend, found in +``kernel/kgdb.c``, that must be supplied by the architecture-specific +backend unless marked as (optional), in which case a default function +maybe used if the architecture does not need to provide a specific +implementation. + +.. kernel-doc:: include/linux/kgdb.h + :internal: + +kgdboc internals +---------------- + +kgdboc and uarts +~~~~~~~~~~~~~~~~ + +The kgdboc driver is actually a very thin driver that relies on the +underlying low level to the hardware driver having "polling hooks" to +which the tty driver is attached. In the initial implementation of +kgdboc the serial_core was changed to expose a low level UART hook for +doing polled mode reading and writing of a single character while in an +atomic context. When kgdb makes an I/O request to the debugger, kgdboc +invokes a callback in the serial core which in turn uses the callback in +the UART driver. + +When using kgdboc with a UART, the UART driver must implement two +callbacks in the :c:type:`struct uart_ops <uart_ops>`. +Example from ``drivers/8250.c``:: + + + #ifdef CONFIG_CONSOLE_POLL + .poll_get_char = serial8250_get_poll_char, + .poll_put_char = serial8250_put_poll_char, + #endif + + +Any implementation specifics around creating a polling driver use the +``#ifdef CONFIG_CONSOLE_POLL``, as shown above. Keep in mind that +polling hooks have to be implemented in such a way that they can be +called from an atomic context and have to restore the state of the UART +chip on return such that the system can return to normal when the +debugger detaches. You need to be very careful with any kind of lock you +consider, because failing here is most likely going to mean pressing the +reset button. + +kgdboc and keyboards +~~~~~~~~~~~~~~~~~~~~~~~~ + +The kgdboc driver contains logic to configure communications with an +attached keyboard. The keyboard infrastructure is only compiled into the +kernel when ``CONFIG_KDB_KEYBOARD=y`` is set in the kernel configuration. + +The core polled keyboard driver driver for PS/2 type keyboards is in +``drivers/char/kdb_keyboard.c``. This driver is hooked into the debug core +when kgdboc populates the callback in the array called +:c:type:`kdb_poll_funcs[]`. The :c:func:`kdb_get_kbd_char` is the top-level +function which polls hardware for single character input. + +kgdboc and kms +~~~~~~~~~~~~~~~~~~ + +The kgdboc driver contains logic to request the graphics display to +switch to a text context when you are using ``kgdboc=kms,kbd``, provided +that you have a video driver which has a frame buffer console and atomic +kernel mode setting support. + +Every time the kernel debugger is entered it calls +:c:func:`kgdboc_pre_exp_handler` which in turn calls :c:func:`con_debug_enter` +in the virtual console layer. On resuming kernel execution, the kernel +debugger calls :c:func:`kgdboc_post_exp_handler` which in turn calls +:c:func:`con_debug_leave`. + +Any video driver that wants to be compatible with the kernel debugger +and the atomic kms callbacks must implement the ``mode_set_base_atomic``, +``fb_debug_enter`` and ``fb_debug_leave operations``. For the +``fb_debug_enter`` and ``fb_debug_leave`` the option exists to use the +generic drm fb helper functions or implement something custom for the +hardware. The following example shows the initialization of the +.mode_set_base_atomic operation in +drivers/gpu/drm/i915/intel_display.c:: + + + static const struct drm_crtc_helper_funcs intel_helper_funcs = { + [...] + .mode_set_base_atomic = intel_pipe_set_base_atomic, + [...] + }; + + +Here is an example of how the i915 driver initializes the +fb_debug_enter and fb_debug_leave functions to use the generic drm +helpers in ``drivers/gpu/drm/i915/intel_fb.c``:: + + + static struct fb_ops intelfb_ops = { + [...] + .fb_debug_enter = drm_fb_helper_debug_enter, + .fb_debug_leave = drm_fb_helper_debug_leave, + [...] + }; + + +Credits +======= + +The following people have contributed to this document: + +1. Amit Kale <amitkale@linsyssoft.com> + +2. Tom Rini <trini@kernel.crashing.org> + +In March 2008 this document was completely rewritten by: + +- Jason Wessel <jason.wessel@windriver.com> + +In Jan 2010 this document was updated to include kdb. + +- Jason Wessel <jason.wessel@windriver.com> diff --git a/Documentation/doc-guide/docbook.rst b/Documentation/doc-guide/docbook.rst deleted file mode 100644 index d8bf04308b43..000000000000 --- a/Documentation/doc-guide/docbook.rst +++ /dev/null @@ -1,90 +0,0 @@ -DocBook XML [DEPRECATED] -======================== - -.. attention:: - - This section describes the deprecated DocBook XML toolchain. Please do not - create new DocBook XML template files. Please consider converting existing - DocBook XML templates files to Sphinx/reStructuredText. - -Converting DocBook to Sphinx ----------------------------- - -Over time, we expect all of the documents under ``Documentation/DocBook`` to be -converted to Sphinx and reStructuredText. For most DocBook XML documents, a good -enough solution is to use the simple ``Documentation/sphinx/tmplcvt`` script, -which uses ``pandoc`` under the hood. For example:: - - $ cd Documentation/sphinx - $ ./tmplcvt ../DocBook/in.tmpl ../out.rst - -Then edit the resulting rst files to fix any remaining issues, and add the -document in the ``toctree`` in ``Documentation/index.rst``. - -Components of the kernel-doc system ------------------------------------ - -Many places in the source tree have extractable documentation in the form of -block comments above functions. The components of this system are: - -- ``scripts/kernel-doc`` - - This is a perl script that hunts for the block comments and can mark them up - directly into reStructuredText, DocBook, man, text, and HTML. (No, not - texinfo.) - -- ``Documentation/DocBook/*.tmpl`` - - These are XML template files, which are normal XML files with special - place-holders for where the extracted documentation should go. - -- ``scripts/docproc.c`` - - This is a program for converting XML template files into XML files. When a - file is referenced it is searched for symbols exported (EXPORT_SYMBOL), to be - able to distinguish between internal and external functions. - - It invokes kernel-doc, giving it the list of functions that are to be - documented. - - Additionally it is used to scan the XML template files to locate all the files - referenced herein. This is used to generate dependency information as used by - make. - -- ``Makefile`` - - The targets 'xmldocs', 'psdocs', 'pdfdocs', and 'htmldocs' are used to build - DocBook XML files, PostScript files, PDF files, and html files in - Documentation/DocBook. The older target 'sgmldocs' is equivalent to 'xmldocs'. - -- ``Documentation/DocBook/Makefile`` - - This is where C files are associated with SGML templates. - -How to use kernel-doc comments in DocBook XML template files ------------------------------------------------------------- - -DocBook XML template files (\*.tmpl) are like normal XML files, except that they -can contain escape sequences where extracted documentation should be inserted. - -``!E<filename>`` is replaced by the documentation, in ``<filename>``, for -functions that are exported using ``EXPORT_SYMBOL``: the function list is -collected from files listed in ``Documentation/DocBook/Makefile``. - -``!I<filename>`` is replaced by the documentation for functions that are **not** -exported using ``EXPORT_SYMBOL``. - -``!D<filename>`` is used to name additional files to search for functions -exported using ``EXPORT_SYMBOL``. - -``!F<filename> <function [functions...]>`` is replaced by the documentation, in -``<filename>``, for the functions listed. - -``!P<filename> <section title>`` is replaced by the contents of the ``DOC:`` -section titled ``<section title>`` from ``<filename>``. Spaces are allowed in -``<section title>``; do not quote the ``<section title>``. - -``!C<filename>`` is replaced by nothing, but makes the tools check that all DOC: -sections and documented functions, symbols, etc. are used. This makes sense to -use when you use ``!F`` or ``!P`` only and want to verify that all documentation -is included. diff --git a/Documentation/doc-guide/index.rst b/Documentation/doc-guide/index.rst index 6fff4024606e..a7f95d7d3a63 100644 --- a/Documentation/doc-guide/index.rst +++ b/Documentation/doc-guide/index.rst @@ -10,7 +10,6 @@ How to write kernel documentation sphinx.rst kernel-doc.rst parse-headers.rst - docbook.rst .. only:: subproject and html diff --git a/Documentation/doc-guide/kernel-doc.rst b/Documentation/doc-guide/kernel-doc.rst index b32e4813ff6f..b24854b5d6be 100644 --- a/Documentation/doc-guide/kernel-doc.rst +++ b/Documentation/doc-guide/kernel-doc.rst @@ -149,6 +149,16 @@ Domain`_ references. ``%CONST`` Name of a constant. (No cross-referencing, just formatting.) +````literal```` + A literal block that should be handled as-is. The output will use a + ``monospaced font``. + + Useful if you need to use special characters that would otherwise have some + meaning either by kernel-doc script of by reStructuredText. + + This is particularly useful if you need to use things like ``%ph`` inside + a function description. + ``$ENVVAR`` Name of an environment variable. (No cross-referencing, just formatting.) diff --git a/Documentation/doc-guide/sphinx.rst b/Documentation/doc-guide/sphinx.rst index 731334de3efd..84e8e8a9cbdb 100644 --- a/Documentation/doc-guide/sphinx.rst +++ b/Documentation/doc-guide/sphinx.rst @@ -15,11 +15,6 @@ are used to describe the functions and types and design of the code. The kernel-doc comments have some special structure and formatting, but beyond that they are also treated as reStructuredText. -There is also the deprecated DocBook toolchain to generate documentation from -DocBook XML template files under ``Documentation/DocBook``. The DocBook files -are to be converted to reStructuredText, and the toolchain is slated to be -removed. - Finally, there are thousands of plain text documentation files scattered around ``Documentation``. Some of these will likely be converted to reStructuredText over time, but the bulk of them will remain in plain text. diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index 8058a87c1c74..3cf1acebc4ee 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -32,7 +32,13 @@ available subsections can be seen below. i2c hsi edac + scsi + libata + mtdnand miscellaneous + w1 + rapidio + s390-drivers vme 80211/index uio-howto diff --git a/Documentation/driver-api/libata.rst b/Documentation/driver-api/libata.rst new file mode 100644 index 000000000000..4adc056f7635 --- /dev/null +++ b/Documentation/driver-api/libata.rst @@ -0,0 +1,1031 @@ +======================== +libATA Developer's Guide +======================== + +:Author: Jeff Garzik + +Introduction +============ + +libATA is a library used inside the Linux kernel to support ATA host +controllers and devices. libATA provides an ATA driver API, class +transports for ATA and ATAPI devices, and SCSI<->ATA translation for ATA +devices according to the T10 SAT specification. + +This Guide documents the libATA driver API, library functions, library +internals, and a couple sample ATA low-level drivers. + +libata Driver API +================= + +:c:type:`struct ata_port_operations <ata_port_operations>` +is defined for every low-level libata +hardware driver, and it controls how the low-level driver interfaces +with the ATA and SCSI layers. + +FIS-based drivers will hook into the system with ``->qc_prep()`` and +``->qc_issue()`` high-level hooks. Hardware which behaves in a manner +similar to PCI IDE hardware may utilize several generic helpers, +defining at a bare minimum the bus I/O addresses of the ATA shadow +register blocks. + +:c:type:`struct ata_port_operations <ata_port_operations>` +---------------------------------------------------------- + +Disable ATA port +~~~~~~~~~~~~~~~~ + +:: + + void (*port_disable) (struct ata_port *); + + +Called from :c:func:`ata_bus_probe` error path, as well as when unregistering +from the SCSI module (rmmod, hot unplug). This function should do +whatever needs to be done to take the port out of use. In most cases, +:c:func:`ata_port_disable` can be used as this hook. + +Called from :c:func:`ata_bus_probe` on a failed probe. Called from +:c:func:`ata_scsi_release`. + +Post-IDENTIFY device configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:: + + void (*dev_config) (struct ata_port *, struct ata_device *); + + +Called after IDENTIFY [PACKET] DEVICE is issued to each device found. +Typically used to apply device-specific fixups prior to issue of SET +FEATURES - XFER MODE, and prior to operation. + +This entry may be specified as NULL in ata_port_operations. + +Set PIO/DMA mode +~~~~~~~~~~~~~~~~ + +:: + + void (*set_piomode) (struct ata_port *, struct ata_device *); + void (*set_dmamode) (struct ata_port *, struct ata_device *); + void (*post_set_mode) (struct ata_port *); + unsigned int (*mode_filter) (struct ata_port *, struct ata_device *, unsigned int); + + +Hooks called prior to the issue of SET FEATURES - XFER MODE command. The +optional ``->mode_filter()`` hook is called when libata has built a mask of +the possible modes. This is passed to the ``->mode_filter()`` function +which should return a mask of valid modes after filtering those +unsuitable due to hardware limits. It is not valid to use this interface +to add modes. + +``dev->pio_mode`` and ``dev->dma_mode`` are guaranteed to be valid when +``->set_piomode()`` and when ``->set_dmamode()`` is called. The timings for +any other drive sharing the cable will also be valid at this point. That +is the library records the decisions for the modes of each drive on a +channel before it attempts to set any of them. + +``->post_set_mode()`` is called unconditionally, after the SET FEATURES - +XFER MODE command completes successfully. + +``->set_piomode()`` is always called (if present), but ``->set_dma_mode()`` +is only called if DMA is possible. + +Taskfile read/write +~~~~~~~~~~~~~~~~~~~ + +:: + + void (*sff_tf_load) (struct ata_port *ap, struct ata_taskfile *tf); + void (*sff_tf_read) (struct ata_port *ap, struct ata_taskfile *tf); + + +``->tf_load()`` is called to load the given taskfile into hardware +registers / DMA buffers. ``->tf_read()`` is called to read the hardware +registers / DMA buffers, to obtain the current set of taskfile register +values. Most drivers for taskfile-based hardware (PIO or MMIO) use +:c:func:`ata_sff_tf_load` and :c:func:`ata_sff_tf_read` for these hooks. + +PIO data read/write +~~~~~~~~~~~~~~~~~~~ + +:: + + void (*sff_data_xfer) (struct ata_device *, unsigned char *, unsigned int, int); + + +All bmdma-style drivers must implement this hook. This is the low-level +operation that actually copies the data bytes during a PIO data +transfer. Typically the driver will choose one of +:c:func:`ata_sff_data_xfer_noirq`, :c:func:`ata_sff_data_xfer`, or +:c:func:`ata_sff_data_xfer32`. + +ATA command execute +~~~~~~~~~~~~~~~~~~~ + +:: + + void (*sff_exec_command)(struct ata_port *ap, struct ata_taskfile *tf); + + +causes an ATA command, previously loaded with ``->tf_load()``, to be +initiated in hardware. Most drivers for taskfile-based hardware use +:c:func:`ata_sff_exec_command` for this hook. + +Per-cmd ATAPI DMA capabilities filter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:: + + int (*check_atapi_dma) (struct ata_queued_cmd *qc); + + +Allow low-level driver to filter ATA PACKET commands, returning a status +indicating whether or not it is OK to use DMA for the supplied PACKET +command. + +This hook may be specified as NULL, in which case libata will assume +that atapi dma can be supported. + +Read specific ATA shadow registers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:: + + u8 (*sff_check_status)(struct ata_port *ap); + u8 (*sff_check_altstatus)(struct ata_port *ap); + + +Reads the Status/AltStatus ATA shadow register from hardware. On some +hardware, reading the Status register has the side effect of clearing +the interrupt condition. Most drivers for taskfile-based hardware use +:c:func:`ata_sff_check_status` for this hook. + +Write specific ATA shadow register +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:: + + void (*sff_set_devctl)(struct ata_port *ap, u8 ctl); + + +Write the device control ATA shadow register to the hardware. Most +drivers don't need to define this. + +Select ATA device on bus +~~~~~~~~~~~~~~~~~~~~~~~~ + +:: + + void (*sff_dev_select)(struct ata_port *ap, unsigned int device); + + +Issues the low-level hardware command(s) that causes one of N hardware +devices to be considered 'selected' (active and available for use) on +the ATA bus. This generally has no meaning on FIS-based devices. + +Most drivers for taskfile-based hardware use :c:func:`ata_sff_dev_select` for +this hook. + +Private tuning method +~~~~~~~~~~~~~~~~~~~~~ + +:: + + void (*set_mode) (struct ata_port *ap); + + +By default libata performs drive and controller tuning in accordance +with the ATA timing rules and also applies blacklists and cable limits. +Some controllers need special handling and have custom tuning rules, +typically raid controllers that use ATA commands but do not actually do +drive timing. + + **Warning** + + This hook should not be used to replace the standard controller + tuning logic when a controller has quirks. Replacing the default + tuning logic in that case would bypass handling for drive and bridge + quirks that may be important to data reliability. If a controller + needs to filter the mode selection it should use the mode_filter + hook instead. + +Control PCI IDE BMDMA engine +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:: + + void (*bmdma_setup) (struct ata_queued_cmd *qc); + void (*bmdma_start) (struct ata_queued_cmd *qc); + void (*bmdma_stop) (struct ata_port *ap); + u8 (*bmdma_status) (struct ata_port *ap); + + +When setting up an IDE BMDMA transaction, these hooks arm +(``->bmdma_setup``), fire (``->bmdma_start``), and halt (``->bmdma_stop``) the +hardware's DMA engine. ``->bmdma_status`` is used to read the standard PCI +IDE DMA Status register. + +These hooks are typically either no-ops, or simply not implemented, in +FIS-based drivers. + +Most legacy IDE drivers use :c:func:`ata_bmdma_setup` for the +:c:func:`bmdma_setup` hook. :c:func:`ata_bmdma_setup` will write the pointer +to the PRD table to the IDE PRD Table Address register, enable DMA in the DMA +Command register, and call :c:func:`exec_command` to begin the transfer. + +Most legacy IDE drivers use :c:func:`ata_bmdma_start` for the +:c:func:`bmdma_start` hook. :c:func:`ata_bmdma_start` will write the +ATA_DMA_START flag to the DMA Command register. + +Many legacy IDE drivers use :c:func:`ata_bmdma_stop` for the +:c:func:`bmdma_stop` hook. :c:func:`ata_bmdma_stop` clears the ATA_DMA_START +flag in the DMA command register. + +Many legacy IDE drivers use :c:func:`ata_bmdma_status` as the +:c:func:`bmdma_status` hook. + +High-level taskfile hooks +~~~~~~~~~~~~~~~~~~~~~~~~~ + +:: + + void (*qc_prep) (struct ata_queued_cmd *qc); + int (*qc_issue) (struct ata_queued_cmd *qc); + + +Higher-level hooks, these two hooks can potentially supercede several of +the above taskfile/DMA engine hooks. ``->qc_prep`` is called after the +buffers have been DMA-mapped, and is typically used to populate the +hardware's DMA scatter-gather table. Most drivers use the standard +:c:func:`ata_qc_prep` helper function, but more advanced drivers roll their +own. + +``->qc_issue`` is used to make a command active, once the hardware and S/G +tables have been prepared. IDE BMDMA drivers use the helper function +:c:func:`ata_qc_issue_prot` for taskfile protocol-based dispatch. More +advanced drivers implement their own ``->qc_issue``. + +:c:func:`ata_qc_issue_prot` calls ``->tf_load()``, ``->bmdma_setup()``, and +``->bmdma_start()`` as necessary to initiate a transfer. + +Exception and probe handling (EH) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:: + + void (*eng_timeout) (struct ata_port *ap); + void (*phy_reset) (struct ata_port *ap); + + +Deprecated. Use ``->error_handler()`` instead. + +:: + + void (*freeze) (struct ata_port *ap); + void (*thaw) (struct ata_port *ap); + + +:c:func:`ata_port_freeze` is called when HSM violations or some other +condition disrupts normal operation of the port. A frozen port is not +allowed to perform any operation until the port is thawed, which usually +follows a successful reset. + +The optional ``->freeze()`` callback can be used for freezing the port +hardware-wise (e.g. mask interrupt and stop DMA engine). If a port +cannot be frozen hardware-wise, the interrupt handler must ack and clear +interrupts unconditionally while the port is frozen. + +The optional ``->thaw()`` callback is called to perform the opposite of +``->freeze()``: prepare the port for normal operation once again. Unmask +interrupts, start DMA engine, etc. + +:: + + void (*error_handler) (struct ata_port *ap); + + +``->error_handler()`` is a driver's hook into probe, hotplug, and recovery +and other exceptional conditions. The primary responsibility of an +implementation is to call :c:func:`ata_do_eh` or :c:func:`ata_bmdma_drive_eh` +with a set of EH hooks as arguments: + +'prereset' hook (may be NULL) is called during an EH reset, before any +other actions are taken. + +'postreset' hook (may be NULL) is called after the EH reset is +performed. Based on existing conditions, severity of the problem, and +hardware capabilities, + +Either 'softreset' (may be NULL) or 'hardreset' (may be NULL) will be +called to perform the low-level EH reset. + +:: + + void (*post_internal_cmd) (struct ata_queued_cmd *qc); + + +Perform any hardware-specific actions necessary to finish processing +after executing a probe-time or EH-time command via +:c:func:`ata_exec_internal`. + +Hardware interrupt handling +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:: + + irqreturn_t (*irq_handler)(int, void *, struct pt_regs *); + void (*irq_clear) (struct ata_port *); + + +``->irq_handler`` is the interrupt handling routine registered with the +system, by libata. ``->irq_clear`` is called during probe just before the +interrupt handler is registered, to be sure hardware is quiet. + +The second argument, dev_instance, should be cast to a pointer to +:c:type:`struct ata_host_set <ata_host_set>`. + +Most legacy IDE drivers use :c:func:`ata_sff_interrupt` for the irq_handler +hook, which scans all ports in the host_set, determines which queued +command was active (if any), and calls ata_sff_host_intr(ap,qc). + +Most legacy IDE drivers use :c:func:`ata_sff_irq_clear` for the +:c:func:`irq_clear` hook, which simply clears the interrupt and error flags +in the DMA status register. + +SATA phy read/write +~~~~~~~~~~~~~~~~~~~ + +:: + + int (*scr_read) (struct ata_port *ap, unsigned int sc_reg, + u32 *val); + int (*scr_write) (struct ata_port *ap, unsigned int sc_reg, + u32 val); + + +Read and write standard SATA phy registers. Currently only used if +``->phy_reset`` hook called the :c:func:`sata_phy_reset` helper function. +sc_reg is one of SCR_STATUS, SCR_CONTROL, SCR_ERROR, or SCR_ACTIVE. + +Init and shutdown +~~~~~~~~~~~~~~~~~ + +:: + + int (*port_start) (struct ata_port *ap); + void (*port_stop) (struct ata_port *ap); + void (*host_stop) (struct ata_host_set *host_set); + + +``->port_start()`` is called just after the data structures for each port +are initialized. Typically this is used to alloc per-port DMA buffers / +tables / rings, enable DMA engines, and similar tasks. Some drivers also +use this entry point as a chance to allocate driver-private memory for +``ap->private_data``. + +Many drivers use :c:func:`ata_port_start` as this hook or call it from their +own :c:func:`port_start` hooks. :c:func:`ata_port_start` allocates space for +a legacy IDE PRD table and returns. + +``->port_stop()`` is called after ``->host_stop()``. Its sole function is to +release DMA/memory resources, now that they are no longer actively being +used. Many drivers also free driver-private data from port at this time. + +``->host_stop()`` is called after all ``->port_stop()`` calls have completed. +The hook must finalize hardware shutdown, release DMA and other +resources, etc. This hook may be specified as NULL, in which case it is +not called. + +Error handling +============== + +This chapter describes how errors are handled under libata. Readers are +advised to read SCSI EH (Documentation/scsi/scsi_eh.txt) and ATA +exceptions doc first. + +Origins of commands +------------------- + +In libata, a command is represented with +:c:type:`struct ata_queued_cmd <ata_queued_cmd>` or qc. +qc's are preallocated during port initialization and repetitively used +for command executions. Currently only one qc is allocated per port but +yet-to-be-merged NCQ branch allocates one for each tag and maps each qc +to NCQ tag 1-to-1. + +libata commands can originate from two sources - libata itself and SCSI +midlayer. libata internal commands are used for initialization and error +handling. All normal blk requests and commands for SCSI emulation are +passed as SCSI commands through queuecommand callback of SCSI host +template. + +How commands are issued +----------------------- + +Internal commands + First, qc is allocated and initialized using :c:func:`ata_qc_new_init`. + Although :c:func:`ata_qc_new_init` doesn't implement any wait or retry + mechanism when qc is not available, internal commands are currently + issued only during initialization and error recovery, so no other + command is active and allocation is guaranteed to succeed. + + Once allocated qc's taskfile is initialized for the command to be + executed. qc currently has two mechanisms to notify completion. One + is via ``qc->complete_fn()`` callback and the other is completion + ``qc->waiting``. ``qc->complete_fn()`` callback is the asynchronous path + used by normal SCSI translated commands and ``qc->waiting`` is the + synchronous (issuer sleeps in process context) path used by internal + commands. + + Once initialization is complete, host_set lock is acquired and the + qc is issued. + +SCSI commands + All libata drivers use :c:func:`ata_scsi_queuecmd` as + ``hostt->queuecommand`` callback. scmds can either be simulated or + translated. No qc is involved in processing a simulated scmd. The + result is computed right away and the scmd is completed. + + For a translated scmd, :c:func:`ata_qc_new_init` is invoked to allocate a + qc and the scmd is translated into the qc. SCSI midlayer's + completion notification function pointer is stored into + ``qc->scsidone``. + + ``qc->complete_fn()`` callback is used for completion notification. ATA + commands use :c:func:`ata_scsi_qc_complete` while ATAPI commands use + :c:func:`atapi_qc_complete`. Both functions end up calling ``qc->scsidone`` + to notify upper layer when the qc is finished. After translation is + completed, the qc is issued with :c:func:`ata_qc_issue`. + + Note that SCSI midlayer invokes hostt->queuecommand while holding + host_set lock, so all above occur while holding host_set lock. + +How commands are processed +-------------------------- + +Depending on which protocol and which controller are used, commands are +processed differently. For the purpose of discussion, a controller which +uses taskfile interface and all standard callbacks is assumed. + +Currently 6 ATA command protocols are used. They can be sorted into the +following four categories according to how they are processed. + +ATA NO DATA or DMA + ATA_PROT_NODATA and ATA_PROT_DMA fall into this category. These + types of commands don't require any software intervention once + issued. Device will raise interrupt on completion. + +ATA PIO + ATA_PROT_PIO is in this category. libata currently implements PIO + with polling. ATA_NIEN bit is set to turn off interrupt and + pio_task on ata_wq performs polling and IO. + +ATAPI NODATA or DMA + ATA_PROT_ATAPI_NODATA and ATA_PROT_ATAPI_DMA are in this + category. packet_task is used to poll BSY bit after issuing PACKET + command. Once BSY is turned off by the device, packet_task + transfers CDB and hands off processing to interrupt handler. + +ATAPI PIO + ATA_PROT_ATAPI is in this category. ATA_NIEN bit is set and, as + in ATAPI NODATA or DMA, packet_task submits cdb. However, after + submitting cdb, further processing (data transfer) is handed off to + pio_task. + +How commands are completed +-------------------------- + +Once issued, all qc's are either completed with :c:func:`ata_qc_complete` or +time out. For commands which are handled by interrupts, +:c:func:`ata_host_intr` invokes :c:func:`ata_qc_complete`, and, for PIO tasks, +pio_task invokes :c:func:`ata_qc_complete`. In error cases, packet_task may +also complete commands. + +:c:func:`ata_qc_complete` does the following. + +1. DMA memory is unmapped. + +2. ATA_QCFLAG_ACTIVE is cleared from qc->flags. + +3. :c:func:`qc->complete_fn` callback is invoked. If the return value of the + callback is not zero. Completion is short circuited and + :c:func:`ata_qc_complete` returns. + +4. :c:func:`__ata_qc_complete` is called, which does + + 1. ``qc->flags`` is cleared to zero. + + 2. ``ap->active_tag`` and ``qc->tag`` are poisoned. + + 3. ``qc->waiting`` is cleared & completed (in that order). + + 4. qc is deallocated by clearing appropriate bit in ``ap->qactive``. + +So, it basically notifies upper layer and deallocates qc. One exception +is short-circuit path in #3 which is used by :c:func:`atapi_qc_complete`. + +For all non-ATAPI commands, whether it fails or not, almost the same +code path is taken and very little error handling takes place. A qc is +completed with success status if it succeeded, with failed status +otherwise. + +However, failed ATAPI commands require more handling as REQUEST SENSE is +needed to acquire sense data. If an ATAPI command fails, +:c:func:`ata_qc_complete` is invoked with error status, which in turn invokes +:c:func:`atapi_qc_complete` via ``qc->complete_fn()`` callback. + +This makes :c:func:`atapi_qc_complete` set ``scmd->result`` to +SAM_STAT_CHECK_CONDITION, complete the scmd and return 1. As the +sense data is empty but ``scmd->result`` is CHECK CONDITION, SCSI midlayer +will invoke EH for the scmd, and returning 1 makes :c:func:`ata_qc_complete` +to return without deallocating the qc. This leads us to +:c:func:`ata_scsi_error` with partially completed qc. + +:c:func:`ata_scsi_error` +------------------------ + +:c:func:`ata_scsi_error` is the current ``transportt->eh_strategy_handler()`` +for libata. As discussed above, this will be entered in two cases - +timeout and ATAPI error completion. This function calls low level libata +driver's :c:func:`eng_timeout` callback, the standard callback for which is +:c:func:`ata_eng_timeout`. It checks if a qc is active and calls +:c:func:`ata_qc_timeout` on the qc if so. Actual error handling occurs in +:c:func:`ata_qc_timeout`. + +If EH is invoked for timeout, :c:func:`ata_qc_timeout` stops BMDMA and +completes the qc. Note that as we're currently in EH, we cannot call +scsi_done. As described in SCSI EH doc, a recovered scmd should be +either retried with :c:func:`scsi_queue_insert` or finished with +:c:func:`scsi_finish_command`. Here, we override ``qc->scsidone`` with +:c:func:`scsi_finish_command` and calls :c:func:`ata_qc_complete`. + +If EH is invoked due to a failed ATAPI qc, the qc here is completed but +not deallocated. The purpose of this half-completion is to use the qc as +place holder to make EH code reach this place. This is a bit hackish, +but it works. + +Once control reaches here, the qc is deallocated by invoking +:c:func:`__ata_qc_complete` explicitly. Then, internal qc for REQUEST SENSE +is issued. Once sense data is acquired, scmd is finished by directly +invoking :c:func:`scsi_finish_command` on the scmd. Note that as we already +have completed and deallocated the qc which was associated with the +scmd, we don't need to/cannot call :c:func:`ata_qc_complete` again. + +Problems with the current EH +---------------------------- + +- Error representation is too crude. Currently any and all error + conditions are represented with ATA STATUS and ERROR registers. + Errors which aren't ATA device errors are treated as ATA device + errors by setting ATA_ERR bit. Better error descriptor which can + properly represent ATA and other errors/exceptions is needed. + +- When handling timeouts, no action is taken to make device forget + about the timed out command and ready for new commands. + +- EH handling via :c:func:`ata_scsi_error` is not properly protected from + usual command processing. On EH entrance, the device is not in + quiescent state. Timed out commands may succeed or fail any time. + pio_task and atapi_task may still be running. + +- Too weak error recovery. Devices / controllers causing HSM mismatch + errors and other errors quite often require reset to return to known + state. Also, advanced error handling is necessary to support features + like NCQ and hotplug. + +- ATA errors are directly handled in the interrupt handler and PIO + errors in pio_task. This is problematic for advanced error handling + for the following reasons. + + First, advanced error handling often requires context and internal qc + execution. + + Second, even a simple failure (say, CRC error) needs information + gathering and could trigger complex error handling (say, resetting & + reconfiguring). Having multiple code paths to gather information, + enter EH and trigger actions makes life painful. + + Third, scattered EH code makes implementing low level drivers + difficult. Low level drivers override libata callbacks. If EH is + scattered over several places, each affected callbacks should perform + its part of error handling. This can be error prone and painful. + +libata Library +============== + +.. kernel-doc:: drivers/ata/libata-core.c + :export: + +libata Core Internals +===================== + +.. kernel-doc:: drivers/ata/libata-core.c + :internal: + +.. kernel-doc:: drivers/ata/libata-eh.c + +libata SCSI translation/emulation +================================= + +.. kernel-doc:: drivers/ata/libata-scsi.c + :export: + +.. kernel-doc:: drivers/ata/libata-scsi.c + :internal: + +ATA errors and exceptions +========================= + +This chapter tries to identify what error/exception conditions exist for +ATA/ATAPI devices and describe how they should be handled in +implementation-neutral way. + +The term 'error' is used to describe conditions where either an explicit +error condition is reported from device or a command has timed out. + +The term 'exception' is either used to describe exceptional conditions +which are not errors (say, power or hotplug events), or to describe both +errors and non-error exceptional conditions. Where explicit distinction +between error and exception is necessary, the term 'non-error exception' +is used. + +Exception categories +-------------------- + +Exceptions are described primarily with respect to legacy taskfile + bus +master IDE interface. If a controller provides other better mechanism +for error reporting, mapping those into categories described below +shouldn't be difficult. + +In the following sections, two recovery actions - reset and +reconfiguring transport - are mentioned. These are described further in +`EH recovery actions <#exrec>`__. + +HSM violation +~~~~~~~~~~~~~ + +This error is indicated when STATUS value doesn't match HSM requirement +during issuing or execution any ATA/ATAPI command. + +- ATA_STATUS doesn't contain !BSY && DRDY && !DRQ while trying to + issue a command. + +- !BSY && !DRQ during PIO data transfer. + +- DRQ on command completion. + +- !BSY && ERR after CDB transfer starts but before the last byte of CDB + is transferred. ATA/ATAPI standard states that "The device shall not + terminate the PACKET command with an error before the last byte of + the command packet has been written" in the error outputs description + of PACKET command and the state diagram doesn't include such + transitions. + +In these cases, HSM is violated and not much information regarding the +error can be acquired from STATUS or ERROR register. IOW, this error can +be anything - driver bug, faulty device, controller and/or cable. + +As HSM is violated, reset is necessary to restore known state. +Reconfiguring transport for lower speed might be helpful too as +transmission errors sometimes cause this kind of errors. + +ATA/ATAPI device error (non-NCQ / non-CHECK CONDITION) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +These are errors detected and reported by ATA/ATAPI devices indicating +device problems. For this type of errors, STATUS and ERROR register +values are valid and describe error condition. Note that some of ATA bus +errors are detected by ATA/ATAPI devices and reported using the same +mechanism as device errors. Those cases are described later in this +section. + +For ATA commands, this type of errors are indicated by !BSY && ERR +during command execution and on completion. + +For ATAPI commands, + +- !BSY && ERR && ABRT right after issuing PACKET indicates that PACKET + command is not supported and falls in this category. + +- !BSY && ERR(==CHK) && !ABRT after the last byte of CDB is transferred + indicates CHECK CONDITION and doesn't fall in this category. + +- !BSY && ERR(==CHK) && ABRT after the last byte of CDB is transferred + \*probably\* indicates CHECK CONDITION and doesn't fall in this + category. + +Of errors detected as above, the following are not ATA/ATAPI device +errors but ATA bus errors and should be handled according to +`ATA bus error <#excatATAbusErr>`__. + +CRC error during data transfer + This is indicated by ICRC bit in the ERROR register and means that + corruption occurred during data transfer. Up to ATA/ATAPI-7, the + standard specifies that this bit is only applicable to UDMA + transfers but ATA/ATAPI-8 draft revision 1f says that the bit may be + applicable to multiword DMA and PIO. + +ABRT error during data transfer or on completion + Up to ATA/ATAPI-7, the standard specifies that ABRT could be set on + ICRC errors and on cases where a device is not able to complete a + command. Combined with the fact that MWDMA and PIO transfer errors + aren't allowed to use ICRC bit up to ATA/ATAPI-7, it seems to imply + that ABRT bit alone could indicate transfer errors. + + However, ATA/ATAPI-8 draft revision 1f removes the part that ICRC + errors can turn on ABRT. So, this is kind of gray area. Some + heuristics are needed here. + +ATA/ATAPI device errors can be further categorized as follows. + +Media errors + This is indicated by UNC bit in the ERROR register. ATA devices + reports UNC error only after certain number of retries cannot + recover the data, so there's nothing much else to do other than + notifying upper layer. + + READ and WRITE commands report CHS or LBA of the first failed sector + but ATA/ATAPI standard specifies that the amount of transferred data + on error completion is indeterminate, so we cannot assume that + sectors preceding the failed sector have been transferred and thus + cannot complete those sectors successfully as SCSI does. + +Media changed / media change requested error + <<TODO: fill here>> + +Address error + This is indicated by IDNF bit in the ERROR register. Report to upper + layer. + +Other errors + This can be invalid command or parameter indicated by ABRT ERROR bit + or some other error condition. Note that ABRT bit can indicate a lot + of things including ICRC and Address errors. Heuristics needed. + +Depending on commands, not all STATUS/ERROR bits are applicable. These +non-applicable bits are marked with "na" in the output descriptions but +up to ATA/ATAPI-7 no definition of "na" can be found. However, +ATA/ATAPI-8 draft revision 1f describes "N/A" as follows. + + 3.2.3.3a N/A + A keyword the indicates a field has no defined value in this + standard and should not be checked by the host or device. N/A + fields should be cleared to zero. + +So, it seems reasonable to assume that "na" bits are cleared to zero by +devices and thus need no explicit masking. + +ATAPI device CHECK CONDITION +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +ATAPI device CHECK CONDITION error is indicated by set CHK bit (ERR bit) +in the STATUS register after the last byte of CDB is transferred for a +PACKET command. For this kind of errors, sense data should be acquired +to gather information regarding the errors. REQUEST SENSE packet command +should be used to acquire sense data. + +Once sense data is acquired, this type of errors can be handled +similarly to other SCSI errors. Note that sense data may indicate ATA +bus error (e.g. Sense Key 04h HARDWARE ERROR && ASC/ASCQ 47h/00h SCSI +PARITY ERROR). In such cases, the error should be considered as an ATA +bus error and handled according to `ATA bus error <#excatATAbusErr>`__. + +ATA device error (NCQ) +~~~~~~~~~~~~~~~~~~~~~~ + +NCQ command error is indicated by cleared BSY and set ERR bit during NCQ +command phase (one or more NCQ commands outstanding). Although STATUS +and ERROR registers will contain valid values describing the error, READ +LOG EXT is required to clear the error condition, determine which +command has failed and acquire more information. + +READ LOG EXT Log Page 10h reports which tag has failed and taskfile +register values describing the error. With this information the failed +command can be handled as a normal ATA command error as in +`ATA/ATAPI device error (non-NCQ / non-CHECK CONDITION) <#excatDevErr>`__ +and all other in-flight commands must be retried. Note that this retry +should not be counted - it's likely that commands retried this way would +have completed normally if it were not for the failed command. + +Note that ATA bus errors can be reported as ATA device NCQ errors. This +should be handled as described in `ATA bus error <#excatATAbusErr>`__. + +If READ LOG EXT Log Page 10h fails or reports NQ, we're thoroughly +screwed. This condition should be treated according to +`HSM violation <#excatHSMviolation>`__. + +ATA bus error +~~~~~~~~~~~~~ + +ATA bus error means that data corruption occurred during transmission +over ATA bus (SATA or PATA). This type of errors can be indicated by + +- ICRC or ABRT error as described in + `ATA/ATAPI device error (non-NCQ / non-CHECK CONDITION) <#excatDevErr>`__. + +- Controller-specific error completion with error information + indicating transmission error. + +- On some controllers, command timeout. In this case, there may be a + mechanism to determine that the timeout is due to transmission error. + +- Unknown/random errors, timeouts and all sorts of weirdities. + +As described above, transmission errors can cause wide variety of +symptoms ranging from device ICRC error to random device lockup, and, +for many cases, there is no way to tell if an error condition is due to +transmission error or not; therefore, it's necessary to employ some kind +of heuristic when dealing with errors and timeouts. For example, +encountering repetitive ABRT errors for known supported command is +likely to indicate ATA bus error. + +Once it's determined that ATA bus errors have possibly occurred, +lowering ATA bus transmission speed is one of actions which may +alleviate the problem. See `Reconfigure transport <#exrecReconf>`__ for +more information. + +PCI bus error +~~~~~~~~~~~~~ + +Data corruption or other failures during transmission over PCI (or other +system bus). For standard BMDMA, this is indicated by Error bit in the +BMDMA Status register. This type of errors must be logged as it +indicates something is very wrong with the system. Resetting host +controller is recommended. + +Late completion +~~~~~~~~~~~~~~~ + +This occurs when timeout occurs and the timeout handler finds out that +the timed out command has completed successfully or with error. This is +usually caused by lost interrupts. This type of errors must be logged. +Resetting host controller is recommended. + +Unknown error (timeout) +~~~~~~~~~~~~~~~~~~~~~~~ + +This is when timeout occurs and the command is still processing or the +host and device are in unknown state. When this occurs, HSM could be in +any valid or invalid state. To bring the device to known state and make +it forget about the timed out command, resetting is necessary. The timed +out command may be retried. + +Timeouts can also be caused by transmission errors. Refer to +`ATA bus error <#excatATAbusErr>`__ for more details. + +Hotplug and power management exceptions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +<<TODO: fill here>> + +EH recovery actions +------------------- + +This section discusses several important recovery actions. + +Clearing error condition +~~~~~~~~~~~~~~~~~~~~~~~~ + +Many controllers require its error registers to be cleared by error +handler. Different controllers may have different requirements. + +For SATA, it's strongly recommended to clear at least SError register +during error handling. + +Reset +~~~~~ + +During EH, resetting is necessary in the following cases. + +- HSM is in unknown or invalid state + +- HBA is in unknown or invalid state + +- EH needs to make HBA/device forget about in-flight commands + +- HBA/device behaves weirdly + +Resetting during EH might be a good idea regardless of error condition +to improve EH robustness. Whether to reset both or either one of HBA and +device depends on situation but the following scheme is recommended. + +- When it's known that HBA is in ready state but ATA/ATAPI device is in + unknown state, reset only device. + +- If HBA is in unknown state, reset both HBA and device. + +HBA resetting is implementation specific. For a controller complying to +taskfile/BMDMA PCI IDE, stopping active DMA transaction may be +sufficient iff BMDMA state is the only HBA context. But even mostly +taskfile/BMDMA PCI IDE complying controllers may have implementation +specific requirements and mechanism to reset themselves. This must be +addressed by specific drivers. + +OTOH, ATA/ATAPI standard describes in detail ways to reset ATA/ATAPI +devices. + +PATA hardware reset + This is hardware initiated device reset signalled with asserted PATA + RESET- signal. There is no standard way to initiate hardware reset + from software although some hardware provides registers that allow + driver to directly tweak the RESET- signal. + +Software reset + This is achieved by turning CONTROL SRST bit on for at least 5us. + Both PATA and SATA support it but, in case of SATA, this may require + controller-specific support as the second Register FIS to clear SRST + should be transmitted while BSY bit is still set. Note that on PATA, + this resets both master and slave devices on a channel. + +EXECUTE DEVICE DIAGNOSTIC command + Although ATA/ATAPI standard doesn't describe exactly, EDD implies + some level of resetting, possibly similar level with software reset. + Host-side EDD protocol can be handled with normal command processing + and most SATA controllers should be able to handle EDD's just like + other commands. As in software reset, EDD affects both devices on a + PATA bus. + + Although EDD does reset devices, this doesn't suit error handling as + EDD cannot be issued while BSY is set and it's unclear how it will + act when device is in unknown/weird state. + +ATAPI DEVICE RESET command + This is very similar to software reset except that reset can be + restricted to the selected device without affecting the other device + sharing the cable. + +SATA phy reset + This is the preferred way of resetting a SATA device. In effect, + it's identical to PATA hardware reset. Note that this can be done + with the standard SCR Control register. As such, it's usually easier + to implement than software reset. + +One more thing to consider when resetting devices is that resetting +clears certain configuration parameters and they need to be set to their +previous or newly adjusted values after reset. + +Parameters affected are. + +- CHS set up with INITIALIZE DEVICE PARAMETERS (seldom used) + +- Parameters set with SET FEATURES including transfer mode setting + +- Block count set with SET MULTIPLE MODE + +- Other parameters (SET MAX, MEDIA LOCK...) + +ATA/ATAPI standard specifies that some parameters must be maintained +across hardware or software reset, but doesn't strictly specify all of +them. Always reconfiguring needed parameters after reset is required for +robustness. Note that this also applies when resuming from deep sleep +(power-off). + +Also, ATA/ATAPI standard requires that IDENTIFY DEVICE / IDENTIFY PACKET +DEVICE is issued after any configuration parameter is updated or a +hardware reset and the result used for further operation. OS driver is +required to implement revalidation mechanism to support this. + +Reconfigure transport +~~~~~~~~~~~~~~~~~~~~~ + +For both PATA and SATA, a lot of corners are cut for cheap connectors, +cables or controllers and it's quite common to see high transmission +error rate. This can be mitigated by lowering transmission speed. + +The following is a possible scheme Jeff Garzik suggested. + + If more than $N (3?) transmission errors happen in 15 minutes, + + - if SATA, decrease SATA PHY speed. if speed cannot be decreased, + + - decrease UDMA xfer speed. if at UDMA0, switch to PIO4, + + - decrease PIO xfer speed. if at PIO3, complain, but continue + +ata_piix Internals +=================== + +.. kernel-doc:: drivers/ata/ata_piix.c + :internal: + +sata_sil Internals +=================== + +.. kernel-doc:: drivers/ata/sata_sil.c + :internal: + +Thanks +====== + +The bulk of the ATA knowledge comes thanks to long conversations with +Andre Hedrick (www.linux-ide.org), and long hours pondering the ATA and +SCSI specifications. + +Thanks to Alan Cox for pointing out similarities between SATA and SCSI, +and in general for motivation to hack on libata. + +libata's device detection method, ata_pio_devchk, and in general all +the early probing was based on extensive study of Hale Landis's +probe/reset code in his ATADRVR driver (www.ata-atapi.com). diff --git a/Documentation/driver-api/mtdnand.rst b/Documentation/driver-api/mtdnand.rst new file mode 100644 index 000000000000..e9afa586d15e --- /dev/null +++ b/Documentation/driver-api/mtdnand.rst @@ -0,0 +1,1007 @@ +===================================== +MTD NAND Driver Programming Interface +===================================== + +:Author: Thomas Gleixner + +Introduction +============ + +The generic NAND driver supports almost all NAND and AG-AND based chips +and connects them to the Memory Technology Devices (MTD) subsystem of +the Linux Kernel. + +This documentation is provided for developers who want to implement +board drivers or filesystem drivers suitable for NAND devices. + +Known Bugs And Assumptions +========================== + +None. + +Documentation hints +=================== + +The function and structure docs are autogenerated. Each function and +struct member has a short description which is marked with an [XXX] +identifier. The following chapters explain the meaning of those +identifiers. + +Function identifiers [XXX] +-------------------------- + +The functions are marked with [XXX] identifiers in the short comment. +The identifiers explain the usage and scope of the functions. Following +identifiers are used: + +- [MTD Interface] + + These functions provide the interface to the MTD kernel API. They are + not replaceable and provide functionality which is complete hardware + independent. + +- [NAND Interface] + + These functions are exported and provide the interface to the NAND + kernel API. + +- [GENERIC] + + Generic functions are not replaceable and provide functionality which + is complete hardware independent. + +- [DEFAULT] + + Default functions provide hardware related functionality which is + suitable for most of the implementations. These functions can be + replaced by the board driver if necessary. Those functions are called + via pointers in the NAND chip description structure. The board driver + can set the functions which should be replaced by board dependent + functions before calling nand_scan(). If the function pointer is + NULL on entry to nand_scan() then the pointer is set to the default + function which is suitable for the detected chip type. + +Struct member identifiers [XXX] +------------------------------- + +The struct members are marked with [XXX] identifiers in the comment. The +identifiers explain the usage and scope of the members. Following +identifiers are used: + +- [INTERN] + + These members are for NAND driver internal use only and must not be + modified. Most of these values are calculated from the chip geometry + information which is evaluated during nand_scan(). + +- [REPLACEABLE] + + Replaceable members hold hardware related functions which can be + provided by the board driver. The board driver can set the functions + which should be replaced by board dependent functions before calling + nand_scan(). If the function pointer is NULL on entry to + nand_scan() then the pointer is set to the default function which is + suitable for the detected chip type. + +- [BOARDSPECIFIC] + + Board specific members hold hardware related information which must + be provided by the board driver. The board driver must set the + function pointers and datafields before calling nand_scan(). + +- [OPTIONAL] + + Optional members can hold information relevant for the board driver. + The generic NAND driver code does not use this information. + +Basic board driver +================== + +For most boards it will be sufficient to provide just the basic +functions and fill out some really board dependent members in the nand +chip description structure. + +Basic defines +------------- + +At least you have to provide a nand_chip structure and a storage for +the ioremap'ed chip address. You can allocate the nand_chip structure +using kmalloc or you can allocate it statically. The NAND chip structure +embeds an mtd structure which will be registered to the MTD subsystem. +You can extract a pointer to the mtd structure from a nand_chip pointer +using the nand_to_mtd() helper. + +Kmalloc based example + +:: + + static struct mtd_info *board_mtd; + static void __iomem *baseaddr; + + +Static example + +:: + + static struct nand_chip board_chip; + static void __iomem *baseaddr; + + +Partition defines +----------------- + +If you want to divide your device into partitions, then define a +partitioning scheme suitable to your board. + +:: + + #define NUM_PARTITIONS 2 + static struct mtd_partition partition_info[] = { + { .name = "Flash partition 1", + .offset = 0, + .size = 8 * 1024 * 1024 }, + { .name = "Flash partition 2", + .offset = MTDPART_OFS_NEXT, + .size = MTDPART_SIZ_FULL }, + }; + + +Hardware control function +------------------------- + +The hardware control function provides access to the control pins of the +NAND chip(s). The access can be done by GPIO pins or by address lines. +If you use address lines, make sure that the timing requirements are +met. + +*GPIO based example* + +:: + + static void board_hwcontrol(struct mtd_info *mtd, int cmd) + { + switch(cmd){ + case NAND_CTL_SETCLE: /* Set CLE pin high */ break; + case NAND_CTL_CLRCLE: /* Set CLE pin low */ break; + case NAND_CTL_SETALE: /* Set ALE pin high */ break; + case NAND_CTL_CLRALE: /* Set ALE pin low */ break; + case NAND_CTL_SETNCE: /* Set nCE pin low */ break; + case NAND_CTL_CLRNCE: /* Set nCE pin high */ break; + } + } + + +*Address lines based example.* It's assumed that the nCE pin is driven +by a chip select decoder. + +:: + + static void board_hwcontrol(struct mtd_info *mtd, int cmd) + { + struct nand_chip *this = mtd_to_nand(mtd); + switch(cmd){ + case NAND_CTL_SETCLE: this->IO_ADDR_W |= CLE_ADRR_BIT; break; + case NAND_CTL_CLRCLE: this->IO_ADDR_W &= ~CLE_ADRR_BIT; break; + case NAND_CTL_SETALE: this->IO_ADDR_W |= ALE_ADRR_BIT; break; + case NAND_CTL_CLRALE: this->IO_ADDR_W &= ~ALE_ADRR_BIT; break; + } + } + + +Device ready function +--------------------- + +If the hardware interface has the ready busy pin of the NAND chip +connected to a GPIO or other accessible I/O pin, this function is used +to read back the state of the pin. The function has no arguments and +should return 0, if the device is busy (R/B pin is low) and 1, if the +device is ready (R/B pin is high). If the hardware interface does not +give access to the ready busy pin, then the function must not be defined +and the function pointer this->dev_ready is set to NULL. + +Init function +------------- + +The init function allocates memory and sets up all the board specific +parameters and function pointers. When everything is set up nand_scan() +is called. This function tries to detect and identify then chip. If a +chip is found all the internal data fields are initialized accordingly. +The structure(s) have to be zeroed out first and then filled with the +necessary information about the device. + +:: + + static int __init board_init (void) + { + struct nand_chip *this; + int err = 0; + + /* Allocate memory for MTD device structure and private data */ + this = kzalloc(sizeof(struct nand_chip), GFP_KERNEL); + if (!this) { + printk ("Unable to allocate NAND MTD device structure.\n"); + err = -ENOMEM; + goto out; + } + + board_mtd = nand_to_mtd(this); + + /* map physical address */ + baseaddr = ioremap(CHIP_PHYSICAL_ADDRESS, 1024); + if (!baseaddr) { + printk("Ioremap to access NAND chip failed\n"); + err = -EIO; + goto out_mtd; + } + + /* Set address of NAND IO lines */ + this->IO_ADDR_R = baseaddr; + this->IO_ADDR_W = baseaddr; + /* Reference hardware control function */ + this->hwcontrol = board_hwcontrol; + /* Set command delay time, see datasheet for correct value */ + this->chip_delay = CHIP_DEPENDEND_COMMAND_DELAY; + /* Assign the device ready function, if available */ + this->dev_ready = board_dev_ready; + this->eccmode = NAND_ECC_SOFT; + + /* Scan to find existence of the device */ + if (nand_scan (board_mtd, 1)) { + err = -ENXIO; + goto out_ior; + } + + add_mtd_partitions(board_mtd, partition_info, NUM_PARTITIONS); + goto out; + + out_ior: + iounmap(baseaddr); + out_mtd: + kfree (this); + out: + return err; + } + module_init(board_init); + + +Exit function +------------- + +The exit function is only necessary if the driver is compiled as a +module. It releases all resources which are held by the chip driver and +unregisters the partitions in the MTD layer. + +:: + + #ifdef MODULE + static void __exit board_cleanup (void) + { + /* Release resources, unregister device */ + nand_release (board_mtd); + + /* unmap physical address */ + iounmap(baseaddr); + + /* Free the MTD device structure */ + kfree (mtd_to_nand(board_mtd)); + } + module_exit(board_cleanup); + #endif + + +Advanced board driver functions +=============================== + +This chapter describes the advanced functionality of the NAND driver. +For a list of functions which can be overridden by the board driver see +the documentation of the nand_chip structure. + +Multiple chip control +--------------------- + +The nand driver can control chip arrays. Therefore the board driver must +provide an own select_chip function. This function must (de)select the +requested chip. The function pointer in the nand_chip structure must be +set before calling nand_scan(). The maxchip parameter of nand_scan() +defines the maximum number of chips to scan for. Make sure that the +select_chip function can handle the requested number of chips. + +The nand driver concatenates the chips to one virtual chip and provides +this virtual chip to the MTD layer. + +*Note: The driver can only handle linear chip arrays of equally sized +chips. There is no support for parallel arrays which extend the +buswidth.* + +*GPIO based example* + +:: + + static void board_select_chip (struct mtd_info *mtd, int chip) + { + /* Deselect all chips, set all nCE pins high */ + GPIO(BOARD_NAND_NCE) |= 0xff; + if (chip >= 0) + GPIO(BOARD_NAND_NCE) &= ~ (1 << chip); + } + + +*Address lines based example.* Its assumed that the nCE pins are +connected to an address decoder. + +:: + + static void board_select_chip (struct mtd_info *mtd, int chip) + { + struct nand_chip *this = mtd_to_nand(mtd); + + /* Deselect all chips */ + this->IO_ADDR_R &= ~BOARD_NAND_ADDR_MASK; + this->IO_ADDR_W &= ~BOARD_NAND_ADDR_MASK; + switch (chip) { + case 0: + this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIP0; + this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIP0; + break; + .... + case n: + this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIPn; + this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIPn; + break; + } + } + + +Hardware ECC support +-------------------- + +Functions and constants +~~~~~~~~~~~~~~~~~~~~~~~ + +The nand driver supports three different types of hardware ECC. + +- NAND_ECC_HW3_256 + + Hardware ECC generator providing 3 bytes ECC per 256 byte. + +- NAND_ECC_HW3_512 + + Hardware ECC generator providing 3 bytes ECC per 512 byte. + +- NAND_ECC_HW6_512 + + Hardware ECC generator providing 6 bytes ECC per 512 byte. + +- NAND_ECC_HW8_512 + + Hardware ECC generator providing 6 bytes ECC per 512 byte. + +If your hardware generator has a different functionality add it at the +appropriate place in nand_base.c + +The board driver must provide following functions: + +- enable_hwecc + + This function is called before reading / writing to the chip. Reset + or initialize the hardware generator in this function. The function + is called with an argument which let you distinguish between read and + write operations. + +- calculate_ecc + + This function is called after read / write from / to the chip. + Transfer the ECC from the hardware to the buffer. If the option + NAND_HWECC_SYNDROME is set then the function is only called on + write. See below. + +- correct_data + + In case of an ECC error this function is called for error detection + and correction. Return 1 respectively 2 in case the error can be + corrected. If the error is not correctable return -1. If your + hardware generator matches the default algorithm of the nand_ecc + software generator then use the correction function provided by + nand_ecc instead of implementing duplicated code. + +Hardware ECC with syndrome calculation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Many hardware ECC implementations provide Reed-Solomon codes and +calculate an error syndrome on read. The syndrome must be converted to a +standard Reed-Solomon syndrome before calling the error correction code +in the generic Reed-Solomon library. + +The ECC bytes must be placed immediately after the data bytes in order +to make the syndrome generator work. This is contrary to the usual +layout used by software ECC. The separation of data and out of band area +is not longer possible. The nand driver code handles this layout and the +remaining free bytes in the oob area are managed by the autoplacement +code. Provide a matching oob-layout in this case. See rts_from4.c and +diskonchip.c for implementation reference. In those cases we must also +use bad block tables on FLASH, because the ECC layout is interfering +with the bad block marker positions. See bad block table support for +details. + +Bad block table support +----------------------- + +Most NAND chips mark the bad blocks at a defined position in the spare +area. Those blocks must not be erased under any circumstances as the bad +block information would be lost. It is possible to check the bad block +mark each time when the blocks are accessed by reading the spare area of +the first page in the block. This is time consuming so a bad block table +is used. + +The nand driver supports various types of bad block tables. + +- Per device + + The bad block table contains all bad block information of the device + which can consist of multiple chips. + +- Per chip + + A bad block table is used per chip and contains the bad block + information for this particular chip. + +- Fixed offset + + The bad block table is located at a fixed offset in the chip + (device). This applies to various DiskOnChip devices. + +- Automatic placed + + The bad block table is automatically placed and detected either at + the end or at the beginning of a chip (device) + +- Mirrored tables + + The bad block table is mirrored on the chip (device) to allow updates + of the bad block table without data loss. + +nand_scan() calls the function nand_default_bbt(). +nand_default_bbt() selects appropriate default bad block table +descriptors depending on the chip information which was retrieved by +nand_scan(). + +The standard policy is scanning the device for bad blocks and build a +ram based bad block table which allows faster access than always +checking the bad block information on the flash chip itself. + +Flash based tables +~~~~~~~~~~~~~~~~~~ + +It may be desired or necessary to keep a bad block table in FLASH. For +AG-AND chips this is mandatory, as they have no factory marked bad +blocks. They have factory marked good blocks. The marker pattern is +erased when the block is erased to be reused. So in case of powerloss +before writing the pattern back to the chip this block would be lost and +added to the bad blocks. Therefore we scan the chip(s) when we detect +them the first time for good blocks and store this information in a bad +block table before erasing any of the blocks. + +The blocks in which the tables are stored are protected against +accidental access by marking them bad in the memory bad block table. The +bad block table management functions are allowed to circumvent this +protection. + +The simplest way to activate the FLASH based bad block table support is +to set the option NAND_BBT_USE_FLASH in the bbt_option field of the +nand chip structure before calling nand_scan(). For AG-AND chips is +this done by default. This activates the default FLASH based bad block +table functionality of the NAND driver. The default bad block table +options are + +- Store bad block table per chip + +- Use 2 bits per block + +- Automatic placement at the end of the chip + +- Use mirrored tables with version numbers + +- Reserve 4 blocks at the end of the chip + +User defined tables +~~~~~~~~~~~~~~~~~~~ + +User defined tables are created by filling out a nand_bbt_descr +structure and storing the pointer in the nand_chip structure member +bbt_td before calling nand_scan(). If a mirror table is necessary a +second structure must be created and a pointer to this structure must be +stored in bbt_md inside the nand_chip structure. If the bbt_md member +is set to NULL then only the main table is used and no scan for the +mirrored table is performed. + +The most important field in the nand_bbt_descr structure is the +options field. The options define most of the table properties. Use the +predefined constants from nand.h to define the options. + +- Number of bits per block + + The supported number of bits is 1, 2, 4, 8. + +- Table per chip + + Setting the constant NAND_BBT_PERCHIP selects that a bad block + table is managed for each chip in a chip array. If this option is not + set then a per device bad block table is used. + +- Table location is absolute + + Use the option constant NAND_BBT_ABSPAGE and define the absolute + page number where the bad block table starts in the field pages. If + you have selected bad block tables per chip and you have a multi chip + array then the start page must be given for each chip in the chip + array. Note: there is no scan for a table ident pattern performed, so + the fields pattern, veroffs, offs, len can be left uninitialized + +- Table location is automatically detected + + The table can either be located in the first or the last good blocks + of the chip (device). Set NAND_BBT_LASTBLOCK to place the bad block + table at the end of the chip (device). The bad block tables are + marked and identified by a pattern which is stored in the spare area + of the first page in the block which holds the bad block table. Store + a pointer to the pattern in the pattern field. Further the length of + the pattern has to be stored in len and the offset in the spare area + must be given in the offs member of the nand_bbt_descr structure. + For mirrored bad block tables different patterns are mandatory. + +- Table creation + + Set the option NAND_BBT_CREATE to enable the table creation if no + table can be found during the scan. Usually this is done only once if + a new chip is found. + +- Table write support + + Set the option NAND_BBT_WRITE to enable the table write support. + This allows the update of the bad block table(s) in case a block has + to be marked bad due to wear. The MTD interface function + block_markbad is calling the update function of the bad block table. + If the write support is enabled then the table is updated on FLASH. + + Note: Write support should only be enabled for mirrored tables with + version control. + +- Table version control + + Set the option NAND_BBT_VERSION to enable the table version + control. It's highly recommended to enable this for mirrored tables + with write support. It makes sure that the risk of losing the bad + block table information is reduced to the loss of the information + about the one worn out block which should be marked bad. The version + is stored in 4 consecutive bytes in the spare area of the device. The + position of the version number is defined by the member veroffs in + the bad block table descriptor. + +- Save block contents on write + + In case that the block which holds the bad block table does contain + other useful information, set the option NAND_BBT_SAVECONTENT. When + the bad block table is written then the whole block is read the bad + block table is updated and the block is erased and everything is + written back. If this option is not set only the bad block table is + written and everything else in the block is ignored and erased. + +- Number of reserved blocks + + For automatic placement some blocks must be reserved for bad block + table storage. The number of reserved blocks is defined in the + maxblocks member of the bad block table description structure. + Reserving 4 blocks for mirrored tables should be a reasonable number. + This also limits the number of blocks which are scanned for the bad + block table ident pattern. + +Spare area (auto)placement +-------------------------- + +The nand driver implements different possibilities for placement of +filesystem data in the spare area, + +- Placement defined by fs driver + +- Automatic placement + +The default placement function is automatic placement. The nand driver +has built in default placement schemes for the various chiptypes. If due +to hardware ECC functionality the default placement does not fit then +the board driver can provide a own placement scheme. + +File system drivers can provide a own placement scheme which is used +instead of the default placement scheme. + +Placement schemes are defined by a nand_oobinfo structure + +:: + + struct nand_oobinfo { + int useecc; + int eccbytes; + int eccpos[24]; + int oobfree[8][2]; + }; + + +- useecc + + The useecc member controls the ecc and placement function. The header + file include/mtd/mtd-abi.h contains constants to select ecc and + placement. MTD_NANDECC_OFF switches off the ecc complete. This is + not recommended and available for testing and diagnosis only. + MTD_NANDECC_PLACE selects caller defined placement, + MTD_NANDECC_AUTOPLACE selects automatic placement. + +- eccbytes + + The eccbytes member defines the number of ecc bytes per page. + +- eccpos + + The eccpos array holds the byte offsets in the spare area where the + ecc codes are placed. + +- oobfree + + The oobfree array defines the areas in the spare area which can be + used for automatic placement. The information is given in the format + {offset, size}. offset defines the start of the usable area, size the + length in bytes. More than one area can be defined. The list is + terminated by an {0, 0} entry. + +Placement defined by fs driver +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The calling function provides a pointer to a nand_oobinfo structure +which defines the ecc placement. For writes the caller must provide a +spare area buffer along with the data buffer. The spare area buffer size +is (number of pages) \* (size of spare area). For reads the buffer size +is (number of pages) \* ((size of spare area) + (number of ecc steps per +page) \* sizeof (int)). The driver stores the result of the ecc check +for each tuple in the spare buffer. The storage sequence is:: + + <spare data page 0><ecc result 0>...<ecc result n> + + ... + + <spare data page n><ecc result 0>...<ecc result n> + +This is a legacy mode used by YAFFS1. + +If the spare area buffer is NULL then only the ECC placement is done +according to the given scheme in the nand_oobinfo structure. + +Automatic placement +~~~~~~~~~~~~~~~~~~~ + +Automatic placement uses the built in defaults to place the ecc bytes in +the spare area. If filesystem data have to be stored / read into the +spare area then the calling function must provide a buffer. The buffer +size per page is determined by the oobfree array in the nand_oobinfo +structure. + +If the spare area buffer is NULL then only the ECC placement is done +according to the default builtin scheme. + +Spare area autoplacement default schemes +---------------------------------------- + +256 byte pagesize +~~~~~~~~~~~~~~~~~ + +======== ================== =================================================== +Offset Content Comment +======== ================== =================================================== +0x00 ECC byte 0 Error correction code byte 0 +0x01 ECC byte 1 Error correction code byte 1 +0x02 ECC byte 2 Error correction code byte 2 +0x03 Autoplace 0 +0x04 Autoplace 1 +0x05 Bad block marker If any bit in this byte is zero, then this + block is bad. This applies only to the first + page in a block. In the remaining pages this + byte is reserved +0x06 Autoplace 2 +0x07 Autoplace 3 +======== ================== =================================================== + +512 byte pagesize +~~~~~~~~~~~~~~~~~ + + +============= ================== ============================================== +Offset Content Comment +============= ================== ============================================== +0x00 ECC byte 0 Error correction code byte 0 of the lower + 256 Byte data in this page +0x01 ECC byte 1 Error correction code byte 1 of the lower + 256 Bytes of data in this page +0x02 ECC byte 2 Error correction code byte 2 of the lower + 256 Bytes of data in this page +0x03 ECC byte 3 Error correction code byte 0 of the upper + 256 Bytes of data in this page +0x04 reserved reserved +0x05 Bad block marker If any bit in this byte is zero, then this + block is bad. This applies only to the first + page in a block. In the remaining pages this + byte is reserved +0x06 ECC byte 4 Error correction code byte 1 of the upper + 256 Bytes of data in this page +0x07 ECC byte 5 Error correction code byte 2 of the upper + 256 Bytes of data in this page +0x08 - 0x0F Autoplace 0 - 7 +============= ================== ============================================== + +2048 byte pagesize +~~~~~~~~~~~~~~~~~~ + +=========== ================== ================================================ +Offset Content Comment +=========== ================== ================================================ +0x00 Bad block marker If any bit in this byte is zero, then this block + is bad. This applies only to the first page in a + block. In the remaining pages this byte is + reserved +0x01 Reserved Reserved +0x02-0x27 Autoplace 0 - 37 +0x28 ECC byte 0 Error correction code byte 0 of the first + 256 Byte data in this page +0x29 ECC byte 1 Error correction code byte 1 of the first + 256 Bytes of data in this page +0x2A ECC byte 2 Error correction code byte 2 of the first + 256 Bytes data in this page +0x2B ECC byte 3 Error correction code byte 0 of the second + 256 Bytes of data in this page +0x2C ECC byte 4 Error correction code byte 1 of the second + 256 Bytes of data in this page +0x2D ECC byte 5 Error correction code byte 2 of the second + 256 Bytes of data in this page +0x2E ECC byte 6 Error correction code byte 0 of the third + 256 Bytes of data in this page +0x2F ECC byte 7 Error correction code byte 1 of the third + 256 Bytes of data in this page +0x30 ECC byte 8 Error correction code byte 2 of the third + 256 Bytes of data in this page +0x31 ECC byte 9 Error correction code byte 0 of the fourth + 256 Bytes of data in this page +0x32 ECC byte 10 Error correction code byte 1 of the fourth + 256 Bytes of data in this page +0x33 ECC byte 11 Error correction code byte 2 of the fourth + 256 Bytes of data in this page +0x34 ECC byte 12 Error correction code byte 0 of the fifth + 256 Bytes of data in this page +0x35 ECC byte 13 Error correction code byte 1 of the fifth + 256 Bytes of data in this page +0x36 ECC byte 14 Error correction code byte 2 of the fifth + 256 Bytes of data in this page +0x37 ECC byte 15 Error correction code byte 0 of the sixth + 256 Bytes of data in this page +0x38 ECC byte 16 Error correction code byte 1 of the sixth + 256 Bytes of data in this page +0x39 ECC byte 17 Error correction code byte 2 of the sixth + 256 Bytes of data in this page +0x3A ECC byte 18 Error correction code byte 0 of the seventh + 256 Bytes of data in this page +0x3B ECC byte 19 Error correction code byte 1 of the seventh + 256 Bytes of data in this page +0x3C ECC byte 20 Error correction code byte 2 of the seventh + 256 Bytes of data in this page +0x3D ECC byte 21 Error correction code byte 0 of the eighth + 256 Bytes of data in this page +0x3E ECC byte 22 Error correction code byte 1 of the eighth + 256 Bytes of data in this page +0x3F ECC byte 23 Error correction code byte 2 of the eighth + 256 Bytes of data in this page +=========== ================== ================================================ + +Filesystem support +================== + +The NAND driver provides all necessary functions for a filesystem via +the MTD interface. + +Filesystems must be aware of the NAND peculiarities and restrictions. +One major restrictions of NAND Flash is, that you cannot write as often +as you want to a page. The consecutive writes to a page, before erasing +it again, are restricted to 1-3 writes, depending on the manufacturers +specifications. This applies similar to the spare area. + +Therefore NAND aware filesystems must either write in page size chunks +or hold a writebuffer to collect smaller writes until they sum up to +pagesize. Available NAND aware filesystems: JFFS2, YAFFS. + +The spare area usage to store filesystem data is controlled by the spare +area placement functionality which is described in one of the earlier +chapters. + +Tools +===== + +The MTD project provides a couple of helpful tools to handle NAND Flash. + +- flasherase, flasheraseall: Erase and format FLASH partitions + +- nandwrite: write filesystem images to NAND FLASH + +- nanddump: dump the contents of a NAND FLASH partitions + +These tools are aware of the NAND restrictions. Please use those tools +instead of complaining about errors which are caused by non NAND aware +access methods. + +Constants +========= + +This chapter describes the constants which might be relevant for a +driver developer. + +Chip option constants +--------------------- + +Constants for chip id table +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +These constants are defined in nand.h. They are OR-ed together to +describe the chip functionality:: + + /* Buswitdh is 16 bit */ + #define NAND_BUSWIDTH_16 0x00000002 + /* Device supports partial programming without padding */ + #define NAND_NO_PADDING 0x00000004 + /* Chip has cache program function */ + #define NAND_CACHEPRG 0x00000008 + /* Chip has copy back function */ + #define NAND_COPYBACK 0x00000010 + /* AND Chip which has 4 banks and a confusing page / block + * assignment. See Renesas datasheet for further information */ + #define NAND_IS_AND 0x00000020 + /* Chip has a array of 4 pages which can be read without + * additional ready /busy waits */ + #define NAND_4PAGE_ARRAY 0x00000040 + + +Constants for runtime options +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +These constants are defined in nand.h. They are OR-ed together to +describe the functionality:: + + /* The hw ecc generator provides a syndrome instead a ecc value on read + * This can only work if we have the ecc bytes directly behind the + * data bytes. Applies for DOC and AG-AND Renesas HW Reed Solomon generators */ + #define NAND_HWECC_SYNDROME 0x00020000 + + +ECC selection constants +----------------------- + +Use these constants to select the ECC algorithm:: + + /* No ECC. Usage is not recommended ! */ + #define NAND_ECC_NONE 0 + /* Software ECC 3 byte ECC per 256 Byte data */ + #define NAND_ECC_SOFT 1 + /* Hardware ECC 3 byte ECC per 256 Byte data */ + #define NAND_ECC_HW3_256 2 + /* Hardware ECC 3 byte ECC per 512 Byte data */ + #define NAND_ECC_HW3_512 3 + /* Hardware ECC 6 byte ECC per 512 Byte data */ + #define NAND_ECC_HW6_512 4 + /* Hardware ECC 6 byte ECC per 512 Byte data */ + #define NAND_ECC_HW8_512 6 + + +Hardware control related constants +---------------------------------- + +These constants describe the requested hardware access function when the +boardspecific hardware control function is called:: + + /* Select the chip by setting nCE to low */ + #define NAND_CTL_SETNCE 1 + /* Deselect the chip by setting nCE to high */ + #define NAND_CTL_CLRNCE 2 + /* Select the command latch by setting CLE to high */ + #define NAND_CTL_SETCLE 3 + /* Deselect the command latch by setting CLE to low */ + #define NAND_CTL_CLRCLE 4 + /* Select the address latch by setting ALE to high */ + #define NAND_CTL_SETALE 5 + /* Deselect the address latch by setting ALE to low */ + #define NAND_CTL_CLRALE 6 + /* Set write protection by setting WP to high. Not used! */ + #define NAND_CTL_SETWP 7 + /* Clear write protection by setting WP to low. Not used! */ + #define NAND_CTL_CLRWP 8 + + +Bad block table related constants +--------------------------------- + +These constants describe the options used for bad block table +descriptors:: + + /* Options for the bad block table descriptors */ + + /* The number of bits used per block in the bbt on the device */ + #define NAND_BBT_NRBITS_MSK 0x0000000F + #define NAND_BBT_1BIT 0x00000001 + #define NAND_BBT_2BIT 0x00000002 + #define NAND_BBT_4BIT 0x00000004 + #define NAND_BBT_8BIT 0x00000008 + /* The bad block table is in the last good block of the device */ + #define NAND_BBT_LASTBLOCK 0x00000010 + /* The bbt is at the given page, else we must scan for the bbt */ + #define NAND_BBT_ABSPAGE 0x00000020 + /* bbt is stored per chip on multichip devices */ + #define NAND_BBT_PERCHIP 0x00000080 + /* bbt has a version counter at offset veroffs */ + #define NAND_BBT_VERSION 0x00000100 + /* Create a bbt if none axists */ + #define NAND_BBT_CREATE 0x00000200 + /* Write bbt if necessary */ + #define NAND_BBT_WRITE 0x00001000 + /* Read and write back block contents when writing bbt */ + #define NAND_BBT_SAVECONTENT 0x00002000 + + +Structures +========== + +This chapter contains the autogenerated documentation of the structures +which are used in the NAND driver and might be relevant for a driver +developer. Each struct member has a short description which is marked +with an [XXX] identifier. See the chapter "Documentation hints" for an +explanation. + +.. kernel-doc:: include/linux/mtd/nand.h + :internal: + +Public Functions Provided +========================= + +This chapter contains the autogenerated documentation of the NAND kernel +API functions which are exported. Each function has a short description +which is marked with an [XXX] identifier. See the chapter "Documentation +hints" for an explanation. + +.. kernel-doc:: drivers/mtd/nand/nand_base.c + :export: + +.. kernel-doc:: drivers/mtd/nand/nand_ecc.c + :export: + +Internal Functions Provided +=========================== + +This chapter contains the autogenerated documentation of the NAND driver +internal functions. Each function has a short description which is +marked with an [XXX] identifier. See the chapter "Documentation hints" +for an explanation. The functions marked with [DEFAULT] might be +relevant for a board driver developer. + +.. kernel-doc:: drivers/mtd/nand/nand_base.c + :internal: + +.. kernel-doc:: drivers/mtd/nand/nand_bbt.c + :internal: + +Credits +======= + +The following people have contributed to the NAND driver: + +1. Steven J. Hill\ sjhill@realitydiluted.com + +2. David Woodhouse\ dwmw2@infradead.org + +3. Thomas Gleixner\ tglx@linutronix.de + +A lot of users have provided bugfixes, improvements and helping hands +for testing. Thanks a lot. + +The following people have contributed to this document: + +1. Thomas Gleixner\ tglx@linutronix.de diff --git a/Documentation/driver-api/rapidio.rst b/Documentation/driver-api/rapidio.rst new file mode 100644 index 000000000000..71ff658ab78e --- /dev/null +++ b/Documentation/driver-api/rapidio.rst @@ -0,0 +1,107 @@ +======================= +RapidIO Subsystem Guide +======================= + +:Author: Matt Porter + +Introduction +============ + +RapidIO is a high speed switched fabric interconnect with features aimed +at the embedded market. RapidIO provides support for memory-mapped I/O +as well as message-based transactions over the switched fabric network. +RapidIO has a standardized discovery mechanism not unlike the PCI bus +standard that allows simple detection of devices in a network. + +This documentation is provided for developers intending to support +RapidIO on new architectures, write new drivers, or to understand the +subsystem internals. + +Known Bugs and Limitations +========================== + +Bugs +---- + +None. ;) + +Limitations +----------- + +1. Access/management of RapidIO memory regions is not supported + +2. Multiple host enumeration is not supported + +RapidIO driver interface +======================== + +Drivers are provided a set of calls in order to interface with the +subsystem to gather info on devices, request/map memory region +resources, and manage mailboxes/doorbells. + +Functions +--------- + +.. kernel-doc:: include/linux/rio_drv.h + :internal: + +.. kernel-doc:: drivers/rapidio/rio-driver.c + :export: + +.. kernel-doc:: drivers/rapidio/rio.c + :export: + +Internals +========= + +This chapter contains the autogenerated documentation of the RapidIO +subsystem. + +Structures +---------- + +.. kernel-doc:: include/linux/rio.h + :internal: + +Enumeration and Discovery +------------------------- + +.. kernel-doc:: drivers/rapidio/rio-scan.c + :internal: + +Driver functionality +-------------------- + +.. kernel-doc:: drivers/rapidio/rio.c + :internal: + +.. kernel-doc:: drivers/rapidio/rio-access.c + :internal: + +Device model support +-------------------- + +.. kernel-doc:: drivers/rapidio/rio-driver.c + :internal: + +PPC32 support +------------- + +.. kernel-doc:: arch/powerpc/sysdev/fsl_rio.c + :internal: + +Credits +======= + +The following people have contributed to the RapidIO subsystem directly +or indirectly: + +1. Matt Porter\ mporter@kernel.crashing.org + +2. Randy Vinson\ rvinson@mvista.com + +3. Dan Malek\ dan@embeddedalley.com + +The following people have contributed to this document: + +1. Matt Porter\ mporter@kernel.crashing.org diff --git a/Documentation/driver-api/s390-drivers.rst b/Documentation/driver-api/s390-drivers.rst new file mode 100644 index 000000000000..7060da136095 --- /dev/null +++ b/Documentation/driver-api/s390-drivers.rst @@ -0,0 +1,111 @@ +=================================== +Writing s390 channel device drivers +=================================== + +:Author: Cornelia Huck + +Introduction +============ + +This document describes the interfaces available for device drivers that +drive s390 based channel attached I/O devices. This includes interfaces +for interaction with the hardware and interfaces for interacting with +the common driver core. Those interfaces are provided by the s390 common +I/O layer. + +The document assumes a familarity with the technical terms associated +with the s390 channel I/O architecture. For a description of this +architecture, please refer to the "z/Architecture: Principles of +Operation", IBM publication no. SA22-7832. + +While most I/O devices on a s390 system are typically driven through the +channel I/O mechanism described here, there are various other methods +(like the diag interface). These are out of the scope of this document. + +Some additional information can also be found in the kernel source under +Documentation/s390/driver-model.txt. + +The ccw bus +=========== + +The ccw bus typically contains the majority of devices available to a +s390 system. Named after the channel command word (ccw), the basic +command structure used to address its devices, the ccw bus contains +so-called channel attached devices. They are addressed via I/O +subchannels, visible on the css bus. A device driver for +channel-attached devices, however, will never interact with the +subchannel directly, but only via the I/O device on the ccw bus, the ccw +device. + +I/O functions for channel-attached devices +------------------------------------------ + +Some hardware structures have been translated into C structures for use +by the common I/O layer and device drivers. For more information on the +hardware structures represented here, please consult the Principles of +Operation. + +.. kernel-doc:: arch/s390/include/asm/cio.h + :internal: + +ccw devices +----------- + +Devices that want to initiate channel I/O need to attach to the ccw bus. +Interaction with the driver core is done via the common I/O layer, which +provides the abstractions of ccw devices and ccw device drivers. + +The functions that initiate or terminate channel I/O all act upon a ccw +device structure. Device drivers must not bypass those functions or +strange side effects may happen. + +.. kernel-doc:: arch/s390/include/asm/ccwdev.h + :internal: + +.. kernel-doc:: drivers/s390/cio/device.c + :export: + +.. kernel-doc:: drivers/s390/cio/device_ops.c + :export: + +The channel-measurement facility +-------------------------------- + +The channel-measurement facility provides a means to collect measurement +data which is made available by the channel subsystem for each channel +attached device. + +.. kernel-doc:: arch/s390/include/asm/cmb.h + :internal: + +.. kernel-doc:: drivers/s390/cio/cmf.c + :export: + +The ccwgroup bus +================ + +The ccwgroup bus only contains artificial devices, created by the user. +Many networking devices (e.g. qeth) are in fact composed of several ccw +devices (like read, write and data channel for qeth). The ccwgroup bus +provides a mechanism to create a meta-device which contains those ccw +devices as slave devices and can be associated with the netdevice. + +ccw group devices +----------------- + +.. kernel-doc:: arch/s390/include/asm/ccwgroup.h + :internal: + +.. kernel-doc:: drivers/s390/cio/ccwgroup.c + :export: + +Generic interfaces +================== + +Some interfaces are available to other drivers that do not necessarily +have anything to do with the busses described above, but still are +indirectly using basic infrastructure in the common I/O layer. One +example is the support for adapter interrupts. + +.. kernel-doc:: drivers/s390/cio/airq.c + :export: diff --git a/Documentation/driver-api/scsi.rst b/Documentation/driver-api/scsi.rst new file mode 100644 index 000000000000..859fb672319f --- /dev/null +++ b/Documentation/driver-api/scsi.rst @@ -0,0 +1,344 @@ +===================== +SCSI Interfaces Guide +===================== + +:Author: James Bottomley +:Author: Rob Landley + +Introduction +============ + +Protocol vs bus +--------------- + +Once upon a time, the Small Computer Systems Interface defined both a +parallel I/O bus and a data protocol to connect a wide variety of +peripherals (disk drives, tape drives, modems, printers, scanners, +optical drives, test equipment, and medical devices) to a host computer. + +Although the old parallel (fast/wide/ultra) SCSI bus has largely fallen +out of use, the SCSI command set is more widely used than ever to +communicate with devices over a number of different busses. + +The `SCSI protocol <http://www.t10.org/scsi-3.htm>`__ is a big-endian +peer-to-peer packet based protocol. SCSI commands are 6, 10, 12, or 16 +bytes long, often followed by an associated data payload. + +SCSI commands can be transported over just about any kind of bus, and +are the default protocol for storage devices attached to USB, SATA, SAS, +Fibre Channel, FireWire, and ATAPI devices. SCSI packets are also +commonly exchanged over Infiniband, +`I20 <http://i2o.shadowconnect.com/faq.php>`__, TCP/IP +(`iSCSI <https://en.wikipedia.org/wiki/ISCSI>`__), even `Parallel +ports <http://cyberelk.net/tim/parport/parscsi.html>`__. + +Design of the Linux SCSI subsystem +---------------------------------- + +The SCSI subsystem uses a three layer design, with upper, mid, and low +layers. Every operation involving the SCSI subsystem (such as reading a +sector from a disk) uses one driver at each of the 3 levels: one upper +layer driver, one lower layer driver, and the SCSI midlayer. + +The SCSI upper layer provides the interface between userspace and the +kernel, in the form of block and char device nodes for I/O and ioctl(). +The SCSI lower layer contains drivers for specific hardware devices. + +In between is the SCSI mid-layer, analogous to a network routing layer +such as the IPv4 stack. The SCSI mid-layer routes a packet based data +protocol between the upper layer's /dev nodes and the corresponding +devices in the lower layer. It manages command queues, provides error +handling and power management functions, and responds to ioctl() +requests. + +SCSI upper layer +================ + +The upper layer supports the user-kernel interface by providing device +nodes. + +sd (SCSI Disk) +-------------- + +sd (sd_mod.o) + +sr (SCSI CD-ROM) +---------------- + +sr (sr_mod.o) + +st (SCSI Tape) +-------------- + +st (st.o) + +sg (SCSI Generic) +----------------- + +sg (sg.o) + +ch (SCSI Media Changer) +----------------------- + +ch (ch.c) + +SCSI mid layer +============== + +SCSI midlayer implementation +---------------------------- + +include/scsi/scsi_device.h +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. kernel-doc:: include/scsi/scsi_device.h + :internal: + +drivers/scsi/scsi.c +~~~~~~~~~~~~~~~~~~~ + +Main file for the SCSI midlayer. + +.. kernel-doc:: drivers/scsi/scsi.c + :export: + +drivers/scsi/scsicam.c +~~~~~~~~~~~~~~~~~~~~~~ + +`SCSI Common Access +Method <http://www.t10.org/ftp/t10/drafts/cam/cam-r12b.pdf>`__ support +functions, for use with HDIO_GETGEO, etc. + +.. kernel-doc:: drivers/scsi/scsicam.c + :export: + +drivers/scsi/scsi_error.c +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Common SCSI error/timeout handling routines. + +.. kernel-doc:: drivers/scsi/scsi_error.c + :export: + +drivers/scsi/scsi_devinfo.c +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Manage scsi_dev_info_list, which tracks blacklisted and whitelisted +devices. + +.. kernel-doc:: drivers/scsi/scsi_devinfo.c + :internal: + +drivers/scsi/scsi_ioctl.c +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Handle ioctl() calls for SCSI devices. + +.. kernel-doc:: drivers/scsi/scsi_ioctl.c + :export: + +drivers/scsi/scsi_lib.c +~~~~~~~~~~~~~~~~~~~~~~~~ + +SCSI queuing library. + +.. kernel-doc:: drivers/scsi/scsi_lib.c + :export: + +drivers/scsi/scsi_lib_dma.c +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +SCSI library functions depending on DMA (map and unmap scatter-gather +lists). + +.. kernel-doc:: drivers/scsi/scsi_lib_dma.c + :export: + +drivers/scsi/scsi_module.c +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The file drivers/scsi/scsi_module.c contains legacy support for +old-style host templates. It should never be used by any new driver. + +drivers/scsi/scsi_proc.c +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The functions in this file provide an interface between the PROC file +system and the SCSI device drivers It is mainly used for debugging, +statistics and to pass information directly to the lowlevel driver. I.E. +plumbing to manage /proc/scsi/\* + +.. kernel-doc:: drivers/scsi/scsi_proc.c + :internal: + +drivers/scsi/scsi_netlink.c +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Infrastructure to provide async events from transports to userspace via +netlink, using a single NETLINK_SCSITRANSPORT protocol for all +transports. See `the original patch +submission <http://marc.info/?l=linux-scsi&m=115507374832500&w=2>`__ for +more details. + +.. kernel-doc:: drivers/scsi/scsi_netlink.c + :internal: + +drivers/scsi/scsi_scan.c +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Scan a host to determine which (if any) devices are attached. The +general scanning/probing algorithm is as follows, exceptions are made to +it depending on device specific flags, compilation options, and global +variable (boot or module load time) settings. A specific LUN is scanned +via an INQUIRY command; if the LUN has a device attached, a scsi_device +is allocated and setup for it. For every id of every channel on the +given host, start by scanning LUN 0. Skip hosts that don't respond at +all to a scan of LUN 0. Otherwise, if LUN 0 has a device attached, +allocate and setup a scsi_device for it. If target is SCSI-3 or up, +issue a REPORT LUN, and scan all of the LUNs returned by the REPORT LUN; +else, sequentially scan LUNs up until some maximum is reached, or a LUN +is seen that cannot have a device attached to it. + +.. kernel-doc:: drivers/scsi/scsi_scan.c + :internal: + +drivers/scsi/scsi_sysctl.c +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Set up the sysctl entry: "/dev/scsi/logging_level" +(DEV_SCSI_LOGGING_LEVEL) which sets/returns scsi_logging_level. + +drivers/scsi/scsi_sysfs.c +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +SCSI sysfs interface routines. + +.. kernel-doc:: drivers/scsi/scsi_sysfs.c + :export: + +drivers/scsi/hosts.c +~~~~~~~~~~~~~~~~~~~~ + +mid to lowlevel SCSI driver interface + +.. kernel-doc:: drivers/scsi/hosts.c + :export: + +drivers/scsi/constants.c +~~~~~~~~~~~~~~~~~~~~~~~~ + +mid to lowlevel SCSI driver interface + +.. kernel-doc:: drivers/scsi/constants.c + :export: + +Transport classes +----------------- + +Transport classes are service libraries for drivers in the SCSI lower +layer, which expose transport attributes in sysfs. + +Fibre Channel transport +~~~~~~~~~~~~~~~~~~~~~~~ + +The file drivers/scsi/scsi_transport_fc.c defines transport attributes +for Fibre Channel. + +.. kernel-doc:: drivers/scsi/scsi_transport_fc.c + :export: + +iSCSI transport class +~~~~~~~~~~~~~~~~~~~~~ + +The file drivers/scsi/scsi_transport_iscsi.c defines transport +attributes for the iSCSI class, which sends SCSI packets over TCP/IP +connections. + +.. kernel-doc:: drivers/scsi/scsi_transport_iscsi.c + :export: + +Serial Attached SCSI (SAS) transport class +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The file drivers/scsi/scsi_transport_sas.c defines transport +attributes for Serial Attached SCSI, a variant of SATA aimed at large +high-end systems. + +The SAS transport class contains common code to deal with SAS HBAs, an +aproximated representation of SAS topologies in the driver model, and +various sysfs attributes to expose these topologies and management +interfaces to userspace. + +In addition to the basic SCSI core objects this transport class +introduces two additional intermediate objects: The SAS PHY as +represented by struct sas_phy defines an "outgoing" PHY on a SAS HBA or +Expander, and the SAS remote PHY represented by struct sas_rphy defines +an "incoming" PHY on a SAS Expander or end device. Note that this is +purely a software concept, the underlying hardware for a PHY and a +remote PHY is the exactly the same. + +There is no concept of a SAS port in this code, users can see what PHYs +form a wide port based on the port_identifier attribute, which is the +same for all PHYs in a port. + +.. kernel-doc:: drivers/scsi/scsi_transport_sas.c + :export: + +SATA transport class +~~~~~~~~~~~~~~~~~~~~ + +The SATA transport is handled by libata, which has its own book of +documentation in this directory. + +Parallel SCSI (SPI) transport class +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The file drivers/scsi/scsi_transport_spi.c defines transport +attributes for traditional (fast/wide/ultra) SCSI busses. + +.. kernel-doc:: drivers/scsi/scsi_transport_spi.c + :export: + +SCSI RDMA (SRP) transport class +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The file drivers/scsi/scsi_transport_srp.c defines transport +attributes for SCSI over Remote Direct Memory Access. + +.. kernel-doc:: drivers/scsi/scsi_transport_srp.c + :export: + +SCSI lower layer +================ + +Host Bus Adapter transport types +-------------------------------- + +Many modern device controllers use the SCSI command set as a protocol to +communicate with their devices through many different types of physical +connections. + +In SCSI language a bus capable of carrying SCSI commands is called a +"transport", and a controller connecting to such a bus is called a "host +bus adapter" (HBA). + +Debug transport +~~~~~~~~~~~~~~~ + +The file drivers/scsi/scsi_debug.c simulates a host adapter with a +variable number of disks (or disk like devices) attached, sharing a +common amount of RAM. Does a lot of checking to make sure that we are +not getting blocks mixed up, and panics the kernel if anything out of +the ordinary is seen. + +To be more realistic, the simulated devices have the transport +attributes of SAS disks. + +For documentation see http://sg.danny.cz/sg/sdebug26.html + +todo +~~~~ + +Parallel (fast/wide/ultra) SCSI, USB, SATA, SAS, Fibre Channel, +FireWire, ATAPI devices, Infiniband, I20, iSCSI, Parallel ports, +netlink... diff --git a/Documentation/driver-api/w1.rst b/Documentation/driver-api/w1.rst new file mode 100644 index 000000000000..c1da8f0cb476 --- /dev/null +++ b/Documentation/driver-api/w1.rst @@ -0,0 +1,70 @@ +====================== +W1: Dallas' 1-wire bus +====================== + +:Author: David Fries + +W1 API internal to the kernel +============================= + +W1 API internal to the kernel +----------------------------- + +drivers/w1/w1.h +~~~~~~~~~~~~~~~ + +W1 core functions. + +.. kernel-doc:: drivers/w1/w1.h + :internal: + +drivers/w1/w1.c +~~~~~~~~~~~~~~~ + +W1 core functions. + +.. kernel-doc:: drivers/w1/w1.c + :internal: + +drivers/w1/w1_family.h +~~~~~~~~~~~~~~~~~~~~~~~ + +Allows registering device family operations. + +.. kernel-doc:: drivers/w1/w1_family.h + :internal: + +drivers/w1/w1_family.c +~~~~~~~~~~~~~~~~~~~~~~~ + +Allows registering device family operations. + +.. kernel-doc:: drivers/w1/w1_family.c + :export: + +drivers/w1/w1_int.c +~~~~~~~~~~~~~~~~~~~~ + +W1 internal initialization for master devices. + +.. kernel-doc:: drivers/w1/w1_int.c + :export: + +drivers/w1/w1_netlink.h +~~~~~~~~~~~~~~~~~~~~~~~~ + +W1 external netlink API structures and commands. + +.. kernel-doc:: drivers/w1/w1_netlink.h + :internal: + +drivers/w1/w1_io.c +~~~~~~~~~~~~~~~~~~~ + +W1 input/output. + +.. kernel-doc:: drivers/w1/w1_io.c + :export: + +.. kernel-doc:: drivers/w1/w1_io.c + :internal: diff --git a/Documentation/fb/api.txt b/Documentation/fb/api.txt index d4ff7de85700..d52cf1e3b975 100644 --- a/Documentation/fb/api.txt +++ b/Documentation/fb/api.txt @@ -289,12 +289,12 @@ the FB_CAP_FOURCC bit in the fb_fix_screeninfo capabilities field. FOURCC definitions are located in the linux/videodev2.h header. However, and despite starting with the V4L2_PIX_FMT_prefix, they are not restricted to V4L2 and don't require usage of the V4L2 subsystem. FOURCC documentation is -available in Documentation/DocBook/v4l/pixfmt.xml. +available in Documentation/media/uapi/v4l/pixfmt.rst. To select a format, applications set the grayscale field to the desired FOURCC. For YUV formats, they should also select the appropriate colorspace by setting the colorspace field to one of the colorspaces listed in linux/videodev2.h and -documented in Documentation/DocBook/v4l/colorspaces.xml. +documented in Documentation/media/uapi/v4l/colorspaces.rst. The red, green, blue and transp fields are not used with the FOURCC-based API. For forward compatibility reasons applications must zero those fields, and diff --git a/Documentation/filesystems/conf.py b/Documentation/filesystems/conf.py new file mode 100644 index 000000000000..ea44172af5c4 --- /dev/null +++ b/Documentation/filesystems/conf.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8; mode: python -*- + +project = "Linux Filesystems API" + +tags.add("subproject") + +latex_documents = [ + ('index', 'filesystems.tex', project, + 'The kernel development community', 'manual'), +] diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst new file mode 100644 index 000000000000..256e10eedba4 --- /dev/null +++ b/Documentation/filesystems/index.rst @@ -0,0 +1,317 @@ +===================== +Linux Filesystems API +===================== + +The Linux VFS +============= + +The Filesystem types +-------------------- + +.. kernel-doc:: include/linux/fs.h + :internal: + +The Directory Cache +------------------- + +.. kernel-doc:: fs/dcache.c + :export: + +.. kernel-doc:: include/linux/dcache.h + :internal: + +Inode Handling +-------------- + +.. kernel-doc:: fs/inode.c + :export: + +.. kernel-doc:: fs/bad_inode.c + :export: + +Registration and Superblocks +---------------------------- + +.. kernel-doc:: fs/super.c + :export: + +File Locks +---------- + +.. kernel-doc:: fs/locks.c + :export: + +.. kernel-doc:: fs/locks.c + :internal: + +Other Functions +--------------- + +.. kernel-doc:: fs/mpage.c + :export: + +.. kernel-doc:: fs/namei.c + :export: + +.. kernel-doc:: fs/buffer.c + :export: + +.. kernel-doc:: block/bio.c + :export: + +.. kernel-doc:: fs/seq_file.c + :export: + +.. kernel-doc:: fs/filesystems.c + :export: + +.. kernel-doc:: fs/fs-writeback.c + :export: + +.. kernel-doc:: fs/block_dev.c + :export: + +The proc filesystem +=================== + +sysctl interface +---------------- + +.. kernel-doc:: kernel/sysctl.c + :export: + +proc filesystem interface +------------------------- + +.. kernel-doc:: fs/proc/base.c + :internal: + +Events based on file descriptors +================================ + +.. kernel-doc:: fs/eventfd.c + :export: + +The Filesystem for Exporting Kernel Objects +=========================================== + +.. kernel-doc:: fs/sysfs/file.c + :export: + +.. kernel-doc:: fs/sysfs/symlink.c + :export: + +The debugfs filesystem +====================== + +debugfs interface +----------------- + +.. kernel-doc:: fs/debugfs/inode.c + :export: + +.. kernel-doc:: fs/debugfs/file.c + :export: + +The Linux Journalling API +========================= + +Overview +-------- + +Details +~~~~~~~ + +The journalling layer is easy to use. You need to first of all create a +journal_t data structure. There are two calls to do this dependent on +how you decide to allocate the physical media on which the journal +resides. The :c:func:`jbd2_journal_init_inode` call is for journals stored in +filesystem inodes, or the :c:func:`jbd2_journal_init_dev` call can be used +for journal stored on a raw device (in a continuous range of blocks). A +journal_t is a typedef for a struct pointer, so when you are finally +finished make sure you call :c:func:`jbd2_journal_destroy` on it to free up +any used kernel memory. + +Once you have got your journal_t object you need to 'mount' or load the +journal file. The journalling layer expects the space for the journal +was already allocated and initialized properly by the userspace tools. +When loading the journal you must call :c:func:`jbd2_journal_load` to process +journal contents. If the client file system detects the journal contents +does not need to be processed (or even need not have valid contents), it +may call :c:func:`jbd2_journal_wipe` to clear the journal contents before +calling :c:func:`jbd2_journal_load`. + +Note that jbd2_journal_wipe(..,0) calls +:c:func:`jbd2_journal_skip_recovery` for you if it detects any outstanding +transactions in the journal and similarly :c:func:`jbd2_journal_load` will +call :c:func:`jbd2_journal_recover` if necessary. I would advise reading +:c:func:`ext4_load_journal` in fs/ext4/super.c for examples on this stage. + +Now you can go ahead and start modifying the underlying filesystem. +Almost. + +You still need to actually journal your filesystem changes, this is done +by wrapping them into transactions. Additionally you also need to wrap +the modification of each of the buffers with calls to the journal layer, +so it knows what the modifications you are actually making are. To do +this use :c:func:`jbd2_journal_start` which returns a transaction handle. + +:c:func:`jbd2_journal_start` and its counterpart :c:func:`jbd2_journal_stop`, +which indicates the end of a transaction are nestable calls, so you can +reenter a transaction if necessary, but remember you must call +:c:func:`jbd2_journal_stop` the same number of times as +:c:func:`jbd2_journal_start` before the transaction is completed (or more +accurately leaves the update phase). Ext4/VFS makes use of this feature to +simplify handling of inode dirtying, quota support, etc. + +Inside each transaction you need to wrap the modifications to the +individual buffers (blocks). Before you start to modify a buffer you +need to call :c:func:`jbd2_journal_get_create_access()` / +:c:func:`jbd2_journal_get_write_access()` / +:c:func:`jbd2_journal_get_undo_access()` as appropriate, this allows the +journalling layer to copy the unmodified +data if it needs to. After all the buffer may be part of a previously +uncommitted transaction. At this point you are at last ready to modify a +buffer, and once you are have done so you need to call +:c:func:`jbd2_journal_dirty_metadata`. Or if you've asked for access to a +buffer you now know is now longer required to be pushed back on the +device you can call :c:func:`jbd2_journal_forget` in much the same way as you +might have used :c:func:`bforget` in the past. + +A :c:func:`jbd2_journal_flush` may be called at any time to commit and +checkpoint all your transactions. + +Then at umount time , in your :c:func:`put_super` you can then call +:c:func:`jbd2_journal_destroy` to clean up your in-core journal object. + +Unfortunately there a couple of ways the journal layer can cause a +deadlock. The first thing to note is that each task can only have a +single outstanding transaction at any one time, remember nothing commits +until the outermost :c:func:`jbd2_journal_stop`. This means you must complete +the transaction at the end of each file/inode/address etc. operation you +perform, so that the journalling system isn't re-entered on another +journal. Since transactions can't be nested/batched across differing +journals, and another filesystem other than yours (say ext4) may be +modified in a later syscall. + +The second case to bear in mind is that :c:func:`jbd2_journal_start` can block +if there isn't enough space in the journal for your transaction (based +on the passed nblocks param) - when it blocks it merely(!) needs to wait +for transactions to complete and be committed from other tasks, so +essentially we are waiting for :c:func:`jbd2_journal_stop`. So to avoid +deadlocks you must treat :c:func:`jbd2_journal_start` / +:c:func:`jbd2_journal_stop` as if they were semaphores and include them in +your semaphore ordering rules to prevent +deadlocks. Note that :c:func:`jbd2_journal_extend` has similar blocking +behaviour to :c:func:`jbd2_journal_start` so you can deadlock here just as +easily as on :c:func:`jbd2_journal_start`. + +Try to reserve the right number of blocks the first time. ;-). This will +be the maximum number of blocks you are going to touch in this +transaction. I advise having a look at at least ext4_jbd.h to see the +basis on which ext4 uses to make these decisions. + +Another wriggle to watch out for is your on-disk block allocation +strategy. Why? Because, if you do a delete, you need to ensure you +haven't reused any of the freed blocks until the transaction freeing +these blocks commits. If you reused these blocks and crash happens, +there is no way to restore the contents of the reallocated blocks at the +end of the last fully committed transaction. One simple way of doing +this is to mark blocks as free in internal in-memory block allocation +structures only after the transaction freeing them commits. Ext4 uses +journal commit callback for this purpose. + +With journal commit callbacks you can ask the journalling layer to call +a callback function when the transaction is finally committed to disk, +so that you can do some of your own management. You ask the journalling +layer for calling the callback by simply setting +``journal->j_commit_callback`` function pointer and that function is +called after each transaction commit. You can also use +``transaction->t_private_list`` for attaching entries to a transaction +that need processing when the transaction commits. + +JBD2 also provides a way to block all transaction updates via +:c:func:`jbd2_journal_lock_updates()` / +:c:func:`jbd2_journal_unlock_updates()`. Ext4 uses this when it wants a +window with a clean and stable fs for a moment. E.g. + +:: + + + jbd2_journal_lock_updates() //stop new stuff happening.. + jbd2_journal_flush() // checkpoint everything. + ..do stuff on stable fs + jbd2_journal_unlock_updates() // carry on with filesystem use. + +The opportunities for abuse and DOS attacks with this should be obvious, +if you allow unprivileged userspace to trigger codepaths containing +these calls. + +Summary +~~~~~~~ + +Using the journal is a matter of wrapping the different context changes, +being each mount, each modification (transaction) and each changed +buffer to tell the journalling layer about them. + +Data Types +---------- + +The journalling layer uses typedefs to 'hide' the concrete definitions +of the structures used. As a client of the JBD2 layer you can just rely +on the using the pointer as a magic cookie of some sort. Obviously the +hiding is not enforced as this is 'C'. + +Structures +~~~~~~~~~~ + +.. kernel-doc:: include/linux/jbd2.h + :internal: + +Functions +--------- + +The functions here are split into two groups those that affect a journal +as a whole, and those which are used to manage transactions + +Journal Level +~~~~~~~~~~~~~ + +.. kernel-doc:: fs/jbd2/journal.c + :export: + +.. kernel-doc:: fs/jbd2/recovery.c + :internal: + +Transasction Level +~~~~~~~~~~~~~~~~~~ + +.. kernel-doc:: fs/jbd2/transaction.c + +See also +-------- + +`Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen +Tweedie <http://kernel.org/pub/linux/kernel/people/sct/ext3/journal-design.ps.gz>`__ + +`Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen +Tweedie <http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html>`__ + +splice API +========== + +splice is a method for moving blocks of data around inside the kernel, +without continually transferring them between the kernel and user space. + +.. kernel-doc:: fs/splice.c + +pipes API +========= + +Pipe interfaces are all for in-kernel (builtin image) use. They are not +exported for use by modules. + +.. kernel-doc:: include/linux/pipe_fs_i.h + :internal: + +.. kernel-doc:: fs/pipe.c diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst index 1bdb7356a310..6162d0e9dc28 100644 --- a/Documentation/gpu/todo.rst +++ b/Documentation/gpu/todo.rst @@ -228,7 +228,7 @@ The DRM reference documentation is still lacking kerneldoc in a few areas. The task would be to clean up interfaces like moving functions around between files to better group them and improving the interfaces like dropping return values for functions that never fail. Then write kerneldoc for all exported -functions and an overview section and integrate it all into the drm DocBook. +functions and an overview section and integrate it all into the drm book. See https://dri.freedesktop.org/docs/drm/ for what's there already. diff --git a/Documentation/index.rst b/Documentation/index.rst index bc67dbf76eb0..e9017bb3a6ce 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -51,6 +51,7 @@ merged much easier. process/index dev-tools/index doc-guide/index + kernel-hacking/index Kernel API documentation ------------------------ @@ -67,11 +68,24 @@ needed). driver-api/index core-api/index media/index + networking/index input/index gpu/index security/index sound/index crypto/index + filesystems/index + +Architecture-specific documentation +----------------------------------- + +These books provide programming details about architecture-specific +implementation. + +.. toctree:: + :maxdepth: 2 + + sh/index Korean translations ------------------- diff --git a/Documentation/kernel-doc-nano-HOWTO.txt b/Documentation/kernel-doc-nano-HOWTO.txt index 104740ea0041..c23e2c5ab80d 100644 --- a/Documentation/kernel-doc-nano-HOWTO.txt +++ b/Documentation/kernel-doc-nano-HOWTO.txt @@ -17,8 +17,8 @@ The format for this documentation is called the kernel-doc format. It is documented in this Documentation/kernel-doc-nano-HOWTO.txt file. This style embeds the documentation within the source files, using -a few simple conventions. The scripts/kernel-doc perl script, some -SGML templates in Documentation/DocBook, and other tools understand +a few simple conventions. The scripts/kernel-doc perl script, the +Documentation/sphinx/kerneldoc.py Sphinx extension and other tools understand these conventions, and are used to extract this embedded documentation into various documents. @@ -122,15 +122,9 @@ are: - scripts/kernel-doc This is a perl script that hunts for the block comments and can mark - them up directly into DocBook, man, text, and HTML. (No, not + them up directly into DocBook, ReST, man, text, and HTML. (No, not texinfo.) -- Documentation/DocBook/*.tmpl - - These are SGML template files, which are normal SGML files with - special place-holders for where the extracted documentation should - go. - - scripts/docproc.c This is a program for converting SGML template files into SGML @@ -145,25 +139,18 @@ are: - Makefile - The targets 'xmldocs', 'psdocs', 'pdfdocs', and 'htmldocs' are used - to build XML DocBook files, PostScript files, PDF files, and html files - in Documentation/DocBook. The older target 'sgmldocs' is equivalent - to 'xmldocs'. - -- Documentation/DocBook/Makefile - - This is where C files are associated with SGML templates. - + The targets 'xmldocs', 'latexdocs', 'pdfdocs', 'epubdocs'and 'htmldocs' + are used to build XML DocBook files, LaTeX files, PDF files, + ePub files and html files in Documentation/. How to extract the documentation -------------------------------- If you just want to read the ready-made books on the various -subsystems (see Documentation/DocBook/*.tmpl), just type 'make -psdocs', or 'make pdfdocs', or 'make htmldocs', depending on your -preference. If you would rather read a different format, you can type -'make xmldocs' and then use DocBook tools to convert -Documentation/DocBook/*.xml to a format of your choice (for example, +subsystems, just type 'make epubdocs', or 'make pdfdocs', or 'make htmldocs', +depending on your preference. If you would rather read a different format, +you can type 'make xmldocs' and then use DocBook tools to convert +Documentation/output/*.xml to a format of your choice (for example, 'db2html ...' if 'make htmldocs' was not defined). If you want to see man pages instead, you can do this: @@ -329,37 +316,7 @@ This is done by using a DOC: section keyword with a section title. E.g.: * hardware, software, or its subject(s). */ -DOC: sections are used in SGML templates files as indicated below. - - -How to make new SGML template files ------------------------------------ - -SGML template files (*.tmpl) are like normal SGML files, except that -they can contain escape sequences where extracted documentation should -be inserted. - -!E<filename> is replaced by the documentation, in <filename>, for -functions that are exported using EXPORT_SYMBOL: the function list is -collected from files listed in Documentation/DocBook/Makefile. - -!I<filename> is replaced by the documentation for functions that are -_not_ exported using EXPORT_SYMBOL. - -!D<filename> is used to name additional files to search for functions -exported using EXPORT_SYMBOL. - -!F<filename> <function [functions...]> is replaced by the -documentation, in <filename>, for the functions listed. - -!P<filename> <section title> is replaced by the contents of the DOC: -section titled <section title> from <filename>. -Spaces are allowed in <section title>; do not quote the <section title>. - -!C<filename> is replaced by nothing, but makes the tools check that -all DOC: sections and documented functions, symbols, etc. are used. -This makes sense to use when you use !F/!P only and want to verify -that all documentation is included. +DOC: sections are used in ReST files. Tim. */ <twaugh@redhat.com> diff --git a/Documentation/kernel-hacking/conf.py b/Documentation/kernel-hacking/conf.py new file mode 100644 index 000000000000..3d8acf0f33ad --- /dev/null +++ b/Documentation/kernel-hacking/conf.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8; mode: python -*- + +project = "Kernel Hacking Guides" + +tags.add("subproject") + +latex_documents = [ + ('index', 'kernel-hacking.tex', project, + 'The kernel development community', 'manual'), +] diff --git a/Documentation/kernel-hacking/hacking.rst b/Documentation/kernel-hacking/hacking.rst new file mode 100644 index 000000000000..1a456b60a7cf --- /dev/null +++ b/Documentation/kernel-hacking/hacking.rst @@ -0,0 +1,811 @@ +============================================ +Unreliable Guide To Hacking The Linux Kernel +============================================ + +:Author: Rusty Russell + +Introduction +============ + +Welcome, gentle reader, to Rusty's Remarkably Unreliable Guide to Linux +Kernel Hacking. This document describes the common routines and general +requirements for kernel code: its goal is to serve as a primer for Linux +kernel development for experienced C programmers. I avoid implementation +details: that's what the code is for, and I ignore whole tracts of +useful routines. + +Before you read this, please understand that I never wanted to write +this document, being grossly under-qualified, but I always wanted to +read it, and this was the only way. I hope it will grow into a +compendium of best practice, common starting points and random +information. + +The Players +=========== + +At any time each of the CPUs in a system can be: + +- not associated with any process, serving a hardware interrupt; + +- not associated with any process, serving a softirq or tasklet; + +- running in kernel space, associated with a process (user context); + +- running a process in user space. + +There is an ordering between these. The bottom two can preempt each +other, but above that is a strict hierarchy: each can only be preempted +by the ones above it. For example, while a softirq is running on a CPU, +no other softirq will preempt it, but a hardware interrupt can. However, +any other CPUs in the system execute independently. + +We'll see a number of ways that the user context can block interrupts, +to become truly non-preemptable. + +User Context +------------ + +User context is when you are coming in from a system call or other trap: +like userspace, you can be preempted by more important tasks and by +interrupts. You can sleep, by calling :c:func:`schedule()`. + +.. note:: + + You are always in user context on module load and unload, and on + operations on the block device layer. + +In user context, the ``current`` pointer (indicating the task we are +currently executing) is valid, and :c:func:`in_interrupt()` +(``include/linux/preempt.h``) is false. + +.. warning:: + + Beware that if you have preemption or softirqs disabled (see below), + :c:func:`in_interrupt()` will return a false positive. + +Hardware Interrupts (Hard IRQs) +------------------------------- + +Timer ticks, network cards and keyboard are examples of real hardware +which produce interrupts at any time. The kernel runs interrupt +handlers, which services the hardware. The kernel guarantees that this +handler is never re-entered: if the same interrupt arrives, it is queued +(or dropped). Because it disables interrupts, this handler has to be +fast: frequently it simply acknowledges the interrupt, marks a 'software +interrupt' for execution and exits. + +You can tell you are in a hardware interrupt, because +:c:func:`in_irq()` returns true. + +.. warning:: + + Beware that this will return a false positive if interrupts are + disabled (see below). + +Software Interrupt Context: Softirqs and Tasklets +------------------------------------------------- + +Whenever a system call is about to return to userspace, or a hardware +interrupt handler exits, any 'software interrupts' which are marked +pending (usually by hardware interrupts) are run (``kernel/softirq.c``). + +Much of the real interrupt handling work is done here. Early in the +transition to SMP, there were only 'bottom halves' (BHs), which didn't +take advantage of multiple CPUs. Shortly after we switched from wind-up +computers made of match-sticks and snot, we abandoned this limitation +and switched to 'softirqs'. + +``include/linux/interrupt.h`` lists the different softirqs. A very +important softirq is the timer softirq (``include/linux/timer.h``): you +can register to have it call functions for you in a given length of +time. + +Softirqs are often a pain to deal with, since the same softirq will run +simultaneously on more than one CPU. For this reason, tasklets +(``include/linux/interrupt.h``) are more often used: they are +dynamically-registrable (meaning you can have as many as you want), and +they also guarantee that any tasklet will only run on one CPU at any +time, although different tasklets can run simultaneously. + +.. warning:: + + The name 'tasklet' is misleading: they have nothing to do with + 'tasks', and probably more to do with some bad vodka Alexey + Kuznetsov had at the time. + +You can tell you are in a softirq (or tasklet) using the +:c:func:`in_softirq()` macro (``include/linux/preempt.h``). + +.. warning:: + + Beware that this will return a false positive if a + :ref:`botton half lock <local_bh_disable>` is held. + +Some Basic Rules +================ + +No memory protection + If you corrupt memory, whether in user context or interrupt context, + the whole machine will crash. Are you sure you can't do what you + want in userspace? + +No floating point or MMX + The FPU context is not saved; even in user context the FPU state + probably won't correspond with the current process: you would mess + with some user process' FPU state. If you really want to do this, + you would have to explicitly save/restore the full FPU state (and + avoid context switches). It is generally a bad idea; use fixed point + arithmetic first. + +A rigid stack limit + Depending on configuration options the kernel stack is about 3K to + 6K for most 32-bit architectures: it's about 14K on most 64-bit + archs, and often shared with interrupts so you can't use it all. + Avoid deep recursion and huge local arrays on the stack (allocate + them dynamically instead). + +The Linux kernel is portable + Let's keep it that way. Your code should be 64-bit clean, and + endian-independent. You should also minimize CPU specific stuff, + e.g. inline assembly should be cleanly encapsulated and minimized to + ease porting. Generally it should be restricted to the + architecture-dependent part of the kernel tree. + +ioctls: Not writing a new system call +===================================== + +A system call generally looks like this:: + + asmlinkage long sys_mycall(int arg) + { + return 0; + } + + +First, in most cases you don't want to create a new system call. You +create a character device and implement an appropriate ioctl for it. +This is much more flexible than system calls, doesn't have to be entered +in every architecture's ``include/asm/unistd.h`` and +``arch/kernel/entry.S`` file, and is much more likely to be accepted by +Linus. + +If all your routine does is read or write some parameter, consider +implementing a :c:func:`sysfs()` interface instead. + +Inside the ioctl you're in user context to a process. When a error +occurs you return a negated errno (see +``include/uapi/asm-generic/errno-base.h``, +``include/uapi/asm-generic/errno.h`` and ``include/linux/errno.h``), +otherwise you return 0. + +After you slept you should check if a signal occurred: the Unix/Linux +way of handling signals is to temporarily exit the system call with the +``-ERESTARTSYS`` error. The system call entry code will switch back to +user context, process the signal handler and then your system call will +be restarted (unless the user disabled that). So you should be prepared +to process the restart, e.g. if you're in the middle of manipulating +some data structure. + +:: + + if (signal_pending(current)) + return -ERESTARTSYS; + + +If you're doing longer computations: first think userspace. If you +**really** want to do it in kernel you should regularly check if you need +to give up the CPU (remember there is cooperative multitasking per CPU). +Idiom:: + + cond_resched(); /* Will sleep */ + + +A short note on interface design: the UNIX system call motto is "Provide +mechanism not policy". + +Recipes for Deadlock +==================== + +You cannot call any routines which may sleep, unless: + +- You are in user context. + +- You do not own any spinlocks. + +- You have interrupts enabled (actually, Andi Kleen says that the + scheduling code will enable them for you, but that's probably not + what you wanted). + +Note that some functions may sleep implicitly: common ones are the user +space access functions (\*_user) and memory allocation functions +without ``GFP_ATOMIC``. + +You should always compile your kernel ``CONFIG_DEBUG_ATOMIC_SLEEP`` on, +and it will warn you if you break these rules. If you **do** break the +rules, you will eventually lock up your box. + +Really. + +Common Routines +=============== + +:c:func:`printk()` +------------------ + +Defined in ``include/linux/printk.h`` + +:c:func:`printk()` feeds kernel messages to the console, dmesg, and +the syslog daemon. It is useful for debugging and reporting errors, and +can be used inside interrupt context, but use with caution: a machine +which has its console flooded with printk messages is unusable. It uses +a format string mostly compatible with ANSI C printf, and C string +concatenation to give it a first "priority" argument:: + + printk(KERN_INFO "i = %u\n", i); + + +See ``include/linux/kern_levels.h``; for other ``KERN_`` values; these are +interpreted by syslog as the level. Special case: for printing an IP +address use:: + + __be32 ipaddress; + printk(KERN_INFO "my ip: %pI4\n", &ipaddress); + + +:c:func:`printk()` internally uses a 1K buffer and does not catch +overruns. Make sure that will be enough. + +.. note:: + + You will know when you are a real kernel hacker when you start + typoing printf as printk in your user programs :) + +.. note:: + + Another sidenote: the original Unix Version 6 sources had a comment + on top of its printf function: "Printf should not be used for + chit-chat". You should follow that advice. + +:c:func:`copy_to_user()` / :c:func:`copy_from_user()` / :c:func:`get_user()` / :c:func:`put_user()` +--------------------------------------------------------------------------------------------------- + +Defined in ``include/linux/uaccess.h`` / ``asm/uaccess.h`` + +**[SLEEPS]** + +:c:func:`put_user()` and :c:func:`get_user()` are used to get +and put single values (such as an int, char, or long) from and to +userspace. A pointer into userspace should never be simply dereferenced: +data should be copied using these routines. Both return ``-EFAULT`` or +0. + +:c:func:`copy_to_user()` and :c:func:`copy_from_user()` are +more general: they copy an arbitrary amount of data to and from +userspace. + +.. warning:: + + Unlike :c:func:`put_user()` and :c:func:`get_user()`, they + return the amount of uncopied data (ie. 0 still means success). + +[Yes, this moronic interface makes me cringe. The flamewar comes up +every year or so. --RR.] + +The functions may sleep implicitly. This should never be called outside +user context (it makes no sense), with interrupts disabled, or a +spinlock held. + +:c:func:`kmalloc()`/:c:func:`kfree()` +------------------------------------- + +Defined in ``include/linux/slab.h`` + +**[MAY SLEEP: SEE BELOW]** + +These routines are used to dynamically request pointer-aligned chunks of +memory, like malloc and free do in userspace, but +:c:func:`kmalloc()` takes an extra flag word. Important values: + +``GFP_KERNEL`` + May sleep and swap to free memory. Only allowed in user context, but + is the most reliable way to allocate memory. + +``GFP_ATOMIC`` + Don't sleep. Less reliable than ``GFP_KERNEL``, but may be called + from interrupt context. You should **really** have a good + out-of-memory error-handling strategy. + +``GFP_DMA`` + Allocate ISA DMA lower than 16MB. If you don't know what that is you + don't need it. Very unreliable. + +If you see a sleeping function called from invalid context warning +message, then maybe you called a sleeping allocation function from +interrupt context without ``GFP_ATOMIC``. You should really fix that. +Run, don't walk. + +If you are allocating at least ``PAGE_SIZE`` (``asm/page.h`` or +``asm/page_types.h``) bytes, consider using :c:func:`__get_free_pages()` +(``include/linux/gfp.h``). It takes an order argument (0 for page sized, +1 for double page, 2 for four pages etc.) and the same memory priority +flag word as above. + +If you are allocating more than a page worth of bytes you can use +:c:func:`vmalloc()`. It'll allocate virtual memory in the kernel +map. This block is not contiguous in physical memory, but the MMU makes +it look like it is for you (so it'll only look contiguous to the CPUs, +not to external device drivers). If you really need large physically +contiguous memory for some weird device, you have a problem: it is +poorly supported in Linux because after some time memory fragmentation +in a running kernel makes it hard. The best way is to allocate the block +early in the boot process via the :c:func:`alloc_bootmem()` +routine. + +Before inventing your own cache of often-used objects consider using a +slab cache in ``include/linux/slab.h`` + +:c:func:`current()` +------------------- + +Defined in ``include/asm/current.h`` + +This global variable (really a macro) contains a pointer to the current +task structure, so is only valid in user context. For example, when a +process makes a system call, this will point to the task structure of +the calling process. It is **not NULL** in interrupt context. + +:c:func:`mdelay()`/:c:func:`udelay()` +------------------------------------- + +Defined in ``include/asm/delay.h`` / ``include/linux/delay.h`` + +The :c:func:`udelay()` and :c:func:`ndelay()` functions can be +used for small pauses. Do not use large values with them as you risk +overflow - the helper function :c:func:`mdelay()` is useful here, or +consider :c:func:`msleep()`. + +:c:func:`cpu_to_be32()`/:c:func:`be32_to_cpu()`/:c:func:`cpu_to_le32()`/:c:func:`le32_to_cpu()` +----------------------------------------------------------------------------------------------- + +Defined in ``include/asm/byteorder.h`` + +The :c:func:`cpu_to_be32()` family (where the "32" can be replaced +by 64 or 16, and the "be" can be replaced by "le") are the general way +to do endian conversions in the kernel: they return the converted value. +All variations supply the reverse as well: +:c:func:`be32_to_cpu()`, etc. + +There are two major variations of these functions: the pointer +variation, such as :c:func:`cpu_to_be32p()`, which take a pointer +to the given type, and return the converted value. The other variation +is the "in-situ" family, such as :c:func:`cpu_to_be32s()`, which +convert value referred to by the pointer, and return void. + +:c:func:`local_irq_save()`/:c:func:`local_irq_restore()` +-------------------------------------------------------- + +Defined in ``include/linux/irqflags.h`` + +These routines disable hard interrupts on the local CPU, and restore +them. They are reentrant; saving the previous state in their one +``unsigned long flags`` argument. If you know that interrupts are +enabled, you can simply use :c:func:`local_irq_disable()` and +:c:func:`local_irq_enable()`. + +.. _local_bh_disable: + +:c:func:`local_bh_disable()`/:c:func:`local_bh_enable()` +-------------------------------------------------------- + +Defined in ``include/linux/bottom_half.h`` + + +These routines disable soft interrupts on the local CPU, and restore +them. They are reentrant; if soft interrupts were disabled before, they +will still be disabled after this pair of functions has been called. +They prevent softirqs and tasklets from running on the current CPU. + +:c:func:`smp_processor_id()` +---------------------------- + +Defined in ``include/linux/smp.h`` + +:c:func:`get_cpu()` disables preemption (so you won't suddenly get +moved to another CPU) and returns the current processor number, between +0 and ``NR_CPUS``. Note that the CPU numbers are not necessarily +continuous. You return it again with :c:func:`put_cpu()` when you +are done. + +If you know you cannot be preempted by another task (ie. you are in +interrupt context, or have preemption disabled) you can use +smp_processor_id(). + +``__init``/``__exit``/``__initdata`` +------------------------------------ + +Defined in ``include/linux/init.h`` + +After boot, the kernel frees up a special section; functions marked with +``__init`` and data structures marked with ``__initdata`` are dropped +after boot is complete: similarly modules discard this memory after +initialization. ``__exit`` is used to declare a function which is only +required on exit: the function will be dropped if this file is not +compiled as a module. See the header file for use. Note that it makes no +sense for a function marked with ``__init`` to be exported to modules +with :c:func:`EXPORT_SYMBOL()` or :c:func:`EXPORT_SYMBOL_GPL()`- this +will break. + +:c:func:`__initcall()`/:c:func:`module_init()` +---------------------------------------------- + +Defined in ``include/linux/init.h`` / ``include/linux/module.h`` + +Many parts of the kernel are well served as a module +(dynamically-loadable parts of the kernel). Using the +:c:func:`module_init()` and :c:func:`module_exit()` macros it +is easy to write code without #ifdefs which can operate both as a module +or built into the kernel. + +The :c:func:`module_init()` macro defines which function is to be +called at module insertion time (if the file is compiled as a module), +or at boot time: if the file is not compiled as a module the +:c:func:`module_init()` macro becomes equivalent to +:c:func:`__initcall()`, which through linker magic ensures that +the function is called on boot. + +The function can return a negative error number to cause module loading +to fail (unfortunately, this has no effect if the module is compiled +into the kernel). This function is called in user context with +interrupts enabled, so it can sleep. + +:c:func:`module_exit()` +----------------------- + + +Defined in ``include/linux/module.h`` + +This macro defines the function to be called at module removal time (or +never, in the case of the file compiled into the kernel). It will only +be called if the module usage count has reached zero. This function can +also sleep, but cannot fail: everything must be cleaned up by the time +it returns. + +Note that this macro is optional: if it is not present, your module will +not be removable (except for 'rmmod -f'). + +:c:func:`try_module_get()`/:c:func:`module_put()` +------------------------------------------------- + +Defined in ``include/linux/module.h`` + +These manipulate the module usage count, to protect against removal (a +module also can't be removed if another module uses one of its exported +symbols: see below). Before calling into module code, you should call +:c:func:`try_module_get()` on that module: if it fails, then the +module is being removed and you should act as if it wasn't there. +Otherwise, you can safely enter the module, and call +:c:func:`module_put()` when you're finished. + +Most registerable structures have an owner field, such as in the +:c:type:`struct file_operations <file_operations>` structure. +Set this field to the macro ``THIS_MODULE``. + +Wait Queues ``include/linux/wait.h`` +==================================== + +**[SLEEPS]** + +A wait queue is used to wait for someone to wake you up when a certain +condition is true. They must be used carefully to ensure there is no +race condition. You declare a :c:type:`wait_queue_head_t`, and then processes +which want to wait for that condition declare a :c:type:`wait_queue_t` +referring to themselves, and place that in the queue. + +Declaring +--------- + +You declare a ``wait_queue_head_t`` using the +:c:func:`DECLARE_WAIT_QUEUE_HEAD()` macro, or using the +:c:func:`init_waitqueue_head()` routine in your initialization +code. + +Queuing +------- + +Placing yourself in the waitqueue is fairly complex, because you must +put yourself in the queue before checking the condition. There is a +macro to do this: :c:func:`wait_event_interruptible()` +(``include/linux/wait.h``) The first argument is the wait queue head, and +the second is an expression which is evaluated; the macro returns 0 when +this expression is true, or ``-ERESTARTSYS`` if a signal is received. The +:c:func:`wait_event()` version ignores signals. + +Waking Up Queued Tasks +---------------------- + +Call :c:func:`wake_up()` (``include/linux/wait.h``);, which will wake +up every process in the queue. The exception is if one has +``TASK_EXCLUSIVE`` set, in which case the remainder of the queue will +not be woken. There are other variants of this basic function available +in the same header. + +Atomic Operations +================= + +Certain operations are guaranteed atomic on all platforms. The first +class of operations work on :c:type:`atomic_t` (``include/asm/atomic.h``); +this contains a signed integer (at least 32 bits long), and you must use +these functions to manipulate or read :c:type:`atomic_t` variables. +:c:func:`atomic_read()` and :c:func:`atomic_set()` get and set +the counter, :c:func:`atomic_add()`, :c:func:`atomic_sub()`, +:c:func:`atomic_inc()`, :c:func:`atomic_dec()`, and +:c:func:`atomic_dec_and_test()` (returns true if it was +decremented to zero). + +Yes. It returns true (i.e. != 0) if the atomic variable is zero. + +Note that these functions are slower than normal arithmetic, and so +should not be used unnecessarily. + +The second class of atomic operations is atomic bit operations on an +``unsigned long``, defined in ``include/linux/bitops.h``. These +operations generally take a pointer to the bit pattern, and a bit +number: 0 is the least significant bit. :c:func:`set_bit()`, +:c:func:`clear_bit()` and :c:func:`change_bit()` set, clear, +and flip the given bit. :c:func:`test_and_set_bit()`, +:c:func:`test_and_clear_bit()` and +:c:func:`test_and_change_bit()` do the same thing, except return +true if the bit was previously set; these are particularly useful for +atomically setting flags. + +It is possible to call these operations with bit indices greater than +``BITS_PER_LONG``. The resulting behavior is strange on big-endian +platforms though so it is a good idea not to do this. + +Symbols +======= + +Within the kernel proper, the normal linking rules apply (ie. unless a +symbol is declared to be file scope with the ``static`` keyword, it can +be used anywhere in the kernel). However, for modules, a special +exported symbol table is kept which limits the entry points to the +kernel proper. Modules can also export symbols. + +:c:func:`EXPORT_SYMBOL()` +------------------------- + +Defined in ``include/linux/export.h`` + +This is the classic method of exporting a symbol: dynamically loaded +modules will be able to use the symbol as normal. + +:c:func:`EXPORT_SYMBOL_GPL()` +----------------------------- + +Defined in ``include/linux/export.h`` + +Similar to :c:func:`EXPORT_SYMBOL()` except that the symbols +exported by :c:func:`EXPORT_SYMBOL_GPL()` can only be seen by +modules with a :c:func:`MODULE_LICENSE()` that specifies a GPL +compatible license. It implies that the function is considered an +internal implementation issue, and not really an interface. Some +maintainers and developers may however require EXPORT_SYMBOL_GPL() +when adding any new APIs or functionality. + +Routines and Conventions +======================== + +Double-linked lists ``include/linux/list.h`` +-------------------------------------------- + +There used to be three sets of linked-list routines in the kernel +headers, but this one is the winner. If you don't have some particular +pressing need for a single list, it's a good choice. + +In particular, :c:func:`list_for_each_entry()` is useful. + +Return Conventions +------------------ + +For code called in user context, it's very common to defy C convention, +and return 0 for success, and a negative error number (eg. ``-EFAULT``) for +failure. This can be unintuitive at first, but it's fairly widespread in +the kernel. + +Using :c:func:`ERR_PTR()` (``include/linux/err.h``) to encode a +negative error number into a pointer, and :c:func:`IS_ERR()` and +:c:func:`PTR_ERR()` to get it back out again: avoids a separate +pointer parameter for the error number. Icky, but in a good way. + +Breaking Compilation +-------------------- + +Linus and the other developers sometimes change function or structure +names in development kernels; this is not done just to keep everyone on +their toes: it reflects a fundamental change (eg. can no longer be +called with interrupts on, or does extra checks, or doesn't do checks +which were caught before). Usually this is accompanied by a fairly +complete note to the linux-kernel mailing list; search the archive. +Simply doing a global replace on the file usually makes things **worse**. + +Initializing structure members +------------------------------ + +The preferred method of initializing structures is to use designated +initialisers, as defined by ISO C99, eg:: + + static struct block_device_operations opt_fops = { + .open = opt_open, + .release = opt_release, + .ioctl = opt_ioctl, + .check_media_change = opt_media_change, + }; + + +This makes it easy to grep for, and makes it clear which structure +fields are set. You should do this because it looks cool. + +GNU Extensions +-------------- + +GNU Extensions are explicitly allowed in the Linux kernel. Note that +some of the more complex ones are not very well supported, due to lack +of general use, but the following are considered standard (see the GCC +info page section "C Extensions" for more details - Yes, really the info +page, the man page is only a short summary of the stuff in info). + +- Inline functions + +- Statement expressions (ie. the ({ and }) constructs). + +- Declaring attributes of a function / variable / type + (__attribute__) + +- typeof + +- Zero length arrays + +- Macro varargs + +- Arithmetic on void pointers + +- Non-Constant initializers + +- Assembler Instructions (not outside arch/ and include/asm/) + +- Function names as strings (__func__). + +- __builtin_constant_p() + +Be wary when using long long in the kernel, the code gcc generates for +it is horrible and worse: division and multiplication does not work on +i386 because the GCC runtime functions for it are missing from the +kernel environment. + +C++ +--- + +Using C++ in the kernel is usually a bad idea, because the kernel does +not provide the necessary runtime environment and the include files are +not tested for it. It is still possible, but not recommended. If you +really want to do this, forget about exceptions at least. + +NUMif +----- + +It is generally considered cleaner to use macros in header files (or at +the top of .c files) to abstract away functions rather than using \`#if' +pre-processor statements throughout the source code. + +Putting Your Stuff in the Kernel +================================ + +In order to get your stuff into shape for official inclusion, or even to +make a neat patch, there's administrative work to be done: + +- Figure out whose pond you've been pissing in. Look at the top of the + source files, inside the ``MAINTAINERS`` file, and last of all in the + ``CREDITS`` file. You should coordinate with this person to make sure + you're not duplicating effort, or trying something that's already + been rejected. + + Make sure you put your name and EMail address at the top of any files + you create or mangle significantly. This is the first place people + will look when they find a bug, or when **they** want to make a change. + +- Usually you want a configuration option for your kernel hack. Edit + ``Kconfig`` in the appropriate directory. The Config language is + simple to use by cut and paste, and there's complete documentation in + ``Documentation/kbuild/kconfig-language.txt``. + + In your description of the option, make sure you address both the + expert user and the user who knows nothing about your feature. + Mention incompatibilities and issues here. **Definitely** end your + description with “if in doubt, say N” (or, occasionally, \`Y'); this + is for people who have no idea what you are talking about. + +- Edit the ``Makefile``: the CONFIG variables are exported here so you + can usually just add a "obj-$(CONFIG_xxx) += xxx.o" line. The syntax + is documented in ``Documentation/kbuild/makefiles.txt``. + +- Put yourself in ``CREDITS`` if you've done something noteworthy, + usually beyond a single file (your name should be at the top of the + source files anyway). ``MAINTAINERS`` means you want to be consulted + when changes are made to a subsystem, and hear about bugs; it implies + a more-than-passing commitment to some part of the code. + +- Finally, don't forget to read + ``Documentation/process/submitting-patches.rst`` and possibly + ``Documentation/process/submitting-drivers.rst``. + +Kernel Cantrips +=============== + +Some favorites from browsing the source. Feel free to add to this list. + +``arch/x86/include/asm/delay.h``:: + + #define ndelay(n) (__builtin_constant_p(n) ? \ + ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \ + __ndelay(n)) + + +``include/linux/fs.h``:: + + /* + * Kernel pointers have redundant information, so we can use a + * scheme where we can return either an error code or a dentry + * pointer with the same return value. + * + * This should be a per-architecture thing, to allow different + * error and pointer decisions. + */ + #define ERR_PTR(err) ((void *)((long)(err))) + #define PTR_ERR(ptr) ((long)(ptr)) + #define IS_ERR(ptr) ((unsigned long)(ptr) > (unsigned long)(-1000)) + +``arch/x86/include/asm/uaccess_32.h:``:: + + #define copy_to_user(to,from,n) \ + (__builtin_constant_p(n) ? \ + __constant_copy_to_user((to),(from),(n)) : \ + __generic_copy_to_user((to),(from),(n))) + + +``arch/sparc/kernel/head.S:``:: + + /* + * Sun people can't spell worth damn. "compatability" indeed. + * At least we *know* we can't spell, and use a spell-checker. + */ + + /* Uh, actually Linus it is I who cannot spell. Too much murky + * Sparc assembly will do this to ya. + */ + C_LABEL(cputypvar): + .asciz "compatibility" + + /* Tested on SS-5, SS-10. Probably someone at Sun applied a spell-checker. */ + .align 4 + C_LABEL(cputypvar_sun4m): + .asciz "compatible" + + +``arch/sparc/lib/checksum.S:``:: + + /* Sun, you just can't beat me, you just can't. Stop trying, + * give up. I'm serious, I am going to kick the living shit + * out of you, game over, lights out. + */ + + +Thanks +====== + +Thanks to Andi Kleen for the idea, answering my questions, fixing my +mistakes, filling content, etc. Philipp Rumpf for more spelling and +clarity fixes, and some excellent non-obvious points. Werner Almesberger +for giving me a great summary of :c:func:`disable_irq()`, and Jes +Sorensen and Andrea Arcangeli added caveats. Michael Elizabeth Chastain +for checking and adding to the Configure section. Telsa Gwynne for +teaching me DocBook. diff --git a/Documentation/kernel-hacking/index.rst b/Documentation/kernel-hacking/index.rst new file mode 100644 index 000000000000..fcb0eda3cca3 --- /dev/null +++ b/Documentation/kernel-hacking/index.rst @@ -0,0 +1,9 @@ +===================== +Kernel Hacking Guides +===================== + +.. toctree:: + :maxdepth: 2 + + hacking + locking diff --git a/Documentation/kernel-hacking/locking.rst b/Documentation/kernel-hacking/locking.rst new file mode 100644 index 000000000000..f937c0fd11aa --- /dev/null +++ b/Documentation/kernel-hacking/locking.rst @@ -0,0 +1,1446 @@ +=========================== +Unreliable Guide To Locking +=========================== + +:Author: Rusty Russell + +Introduction +============ + +Welcome, to Rusty's Remarkably Unreliable Guide to Kernel Locking +issues. This document describes the locking systems in the Linux Kernel +in 2.6. + +With the wide availability of HyperThreading, and preemption in the +Linux Kernel, everyone hacking on the kernel needs to know the +fundamentals of concurrency and locking for SMP. + +The Problem With Concurrency +============================ + +(Skip this if you know what a Race Condition is). + +In a normal program, you can increment a counter like so: + +:: + + very_important_count++; + + +This is what they would expect to happen: + + +.. table:: Expected Results + + +------------------------------------+------------------------------------+ + | Instance 1 | Instance 2 | + +====================================+====================================+ + | read very_important_count (5) | | + +------------------------------------+------------------------------------+ + | add 1 (6) | | + +------------------------------------+------------------------------------+ + | write very_important_count (6) | | + +------------------------------------+------------------------------------+ + | | read very_important_count (6) | + +------------------------------------+------------------------------------+ + | | add 1 (7) | + +------------------------------------+------------------------------------+ + | | write very_important_count (7) | + +------------------------------------+------------------------------------+ + +This is what might happen: + +.. table:: Possible Results + + +------------------------------------+------------------------------------+ + | Instance 1 | Instance 2 | + +====================================+====================================+ + | read very_important_count (5) | | + +------------------------------------+------------------------------------+ + | | read very_important_count (5) | + +------------------------------------+------------------------------------+ + | add 1 (6) | | + +------------------------------------+------------------------------------+ + | | add 1 (6) | + +------------------------------------+------------------------------------+ + | write very_important_count (6) | | + +------------------------------------+------------------------------------+ + | | write very_important_count (6) | + +------------------------------------+------------------------------------+ + + +Race Conditions and Critical Regions +------------------------------------ + +This overlap, where the result depends on the relative timing of +multiple tasks, is called a race condition. The piece of code containing +the concurrency issue is called a critical region. And especially since +Linux starting running on SMP machines, they became one of the major +issues in kernel design and implementation. + +Preemption can have the same effect, even if there is only one CPU: by +preempting one task during the critical region, we have exactly the same +race condition. In this case the thread which preempts might run the +critical region itself. + +The solution is to recognize when these simultaneous accesses occur, and +use locks to make sure that only one instance can enter the critical +region at any time. There are many friendly primitives in the Linux +kernel to help you do this. And then there are the unfriendly +primitives, but I'll pretend they don't exist. + +Locking in the Linux Kernel +=========================== + +If I could give you one piece of advice: never sleep with anyone crazier +than yourself. But if I had to give you advice on locking: **keep it +simple**. + +Be reluctant to introduce new locks. + +Strangely enough, this last one is the exact reverse of my advice when +you **have** slept with someone crazier than yourself. And you should +think about getting a big dog. + +Two Main Types of Kernel Locks: Spinlocks and Mutexes +----------------------------------------------------- + +There are two main types of kernel locks. The fundamental type is the +spinlock (``include/asm/spinlock.h``), which is a very simple +single-holder lock: if you can't get the spinlock, you keep trying +(spinning) until you can. Spinlocks are very small and fast, and can be +used anywhere. + +The second type is a mutex (``include/linux/mutex.h``): it is like a +spinlock, but you may block holding a mutex. If you can't lock a mutex, +your task will suspend itself, and be woken up when the mutex is +released. This means the CPU can do something else while you are +waiting. There are many cases when you simply can't sleep (see +`What Functions Are Safe To Call From Interrupts? <#sleeping-things>`__), +and so have to use a spinlock instead. + +Neither type of lock is recursive: see +`Deadlock: Simple and Advanced <#deadlock>`__. + +Locks and Uniprocessor Kernels +------------------------------ + +For kernels compiled without ``CONFIG_SMP``, and without +``CONFIG_PREEMPT`` spinlocks do not exist at all. This is an excellent +design decision: when no-one else can run at the same time, there is no +reason to have a lock. + +If the kernel is compiled without ``CONFIG_SMP``, but ``CONFIG_PREEMPT`` +is set, then spinlocks simply disable preemption, which is sufficient to +prevent any races. For most purposes, we can think of preemption as +equivalent to SMP, and not worry about it separately. + +You should always test your locking code with ``CONFIG_SMP`` and +``CONFIG_PREEMPT`` enabled, even if you don't have an SMP test box, +because it will still catch some kinds of locking bugs. + +Mutexes still exist, because they are required for synchronization +between user contexts, as we will see below. + +Locking Only In User Context +---------------------------- + +If you have a data structure which is only ever accessed from user +context, then you can use a simple mutex (``include/linux/mutex.h``) to +protect it. This is the most trivial case: you initialize the mutex. +Then you can call :c:func:`mutex_lock_interruptible()` to grab the +mutex, and :c:func:`mutex_unlock()` to release it. There is also a +:c:func:`mutex_lock()`, which should be avoided, because it will +not return if a signal is received. + +Example: ``net/netfilter/nf_sockopt.c`` allows registration of new +:c:func:`setsockopt()` and :c:func:`getsockopt()` calls, with +:c:func:`nf_register_sockopt()`. Registration and de-registration +are only done on module load and unload (and boot time, where there is +no concurrency), and the list of registrations is only consulted for an +unknown :c:func:`setsockopt()` or :c:func:`getsockopt()` system +call. The ``nf_sockopt_mutex`` is perfect to protect this, especially +since the setsockopt and getsockopt calls may well sleep. + +Locking Between User Context and Softirqs +----------------------------------------- + +If a softirq shares data with user context, you have two problems. +Firstly, the current user context can be interrupted by a softirq, and +secondly, the critical region could be entered from another CPU. This is +where :c:func:`spin_lock_bh()` (``include/linux/spinlock.h``) is +used. It disables softirqs on that CPU, then grabs the lock. +:c:func:`spin_unlock_bh()` does the reverse. (The '_bh' suffix is +a historical reference to "Bottom Halves", the old name for software +interrupts. It should really be called spin_lock_softirq()' in a +perfect world). + +Note that you can also use :c:func:`spin_lock_irq()` or +:c:func:`spin_lock_irqsave()` here, which stop hardware interrupts +as well: see `Hard IRQ Context <#hardirq-context>`__. + +This works perfectly for UP as well: the spin lock vanishes, and this +macro simply becomes :c:func:`local_bh_disable()` +(``include/linux/interrupt.h``), which protects you from the softirq +being run. + +Locking Between User Context and Tasklets +----------------------------------------- + +This is exactly the same as above, because tasklets are actually run +from a softirq. + +Locking Between User Context and Timers +--------------------------------------- + +This, too, is exactly the same as above, because timers are actually run +from a softirq. From a locking point of view, tasklets and timers are +identical. + +Locking Between Tasklets/Timers +------------------------------- + +Sometimes a tasklet or timer might want to share data with another +tasklet or timer. + +The Same Tasklet/Timer +~~~~~~~~~~~~~~~~~~~~~~ + +Since a tasklet is never run on two CPUs at once, you don't need to +worry about your tasklet being reentrant (running twice at once), even +on SMP. + +Different Tasklets/Timers +~~~~~~~~~~~~~~~~~~~~~~~~~ + +If another tasklet/timer wants to share data with your tasklet or timer +, you will both need to use :c:func:`spin_lock()` and +:c:func:`spin_unlock()` calls. :c:func:`spin_lock_bh()` is +unnecessary here, as you are already in a tasklet, and none will be run +on the same CPU. + +Locking Between Softirqs +------------------------ + +Often a softirq might want to share data with itself or a tasklet/timer. + +The Same Softirq +~~~~~~~~~~~~~~~~ + +The same softirq can run on the other CPUs: you can use a per-CPU array +(see `Per-CPU Data <#per-cpu>`__) for better performance. If you're +going so far as to use a softirq, you probably care about scalable +performance enough to justify the extra complexity. + +You'll need to use :c:func:`spin_lock()` and +:c:func:`spin_unlock()` for shared data. + +Different Softirqs +~~~~~~~~~~~~~~~~~~ + +You'll need to use :c:func:`spin_lock()` and +:c:func:`spin_unlock()` for shared data, whether it be a timer, +tasklet, different softirq or the same or another softirq: any of them +could be running on a different CPU. + +Hard IRQ Context +================ + +Hardware interrupts usually communicate with a tasklet or softirq. +Frequently this involves putting work in a queue, which the softirq will +take out. + +Locking Between Hard IRQ and Softirqs/Tasklets +---------------------------------------------- + +If a hardware irq handler shares data with a softirq, you have two +concerns. Firstly, the softirq processing can be interrupted by a +hardware interrupt, and secondly, the critical region could be entered +by a hardware interrupt on another CPU. This is where +:c:func:`spin_lock_irq()` is used. It is defined to disable +interrupts on that cpu, then grab the lock. +:c:func:`spin_unlock_irq()` does the reverse. + +The irq handler does not to use :c:func:`spin_lock_irq()`, because +the softirq cannot run while the irq handler is running: it can use +:c:func:`spin_lock()`, which is slightly faster. The only exception +would be if a different hardware irq handler uses the same lock: +:c:func:`spin_lock_irq()` will stop that from interrupting us. + +This works perfectly for UP as well: the spin lock vanishes, and this +macro simply becomes :c:func:`local_irq_disable()` +(``include/asm/smp.h``), which protects you from the softirq/tasklet/BH +being run. + +:c:func:`spin_lock_irqsave()` (``include/linux/spinlock.h``) is a +variant which saves whether interrupts were on or off in a flags word, +which is passed to :c:func:`spin_unlock_irqrestore()`. This means +that the same code can be used inside an hard irq handler (where +interrupts are already off) and in softirqs (where the irq disabling is +required). + +Note that softirqs (and hence tasklets and timers) are run on return +from hardware interrupts, so :c:func:`spin_lock_irq()` also stops +these. In that sense, :c:func:`spin_lock_irqsave()` is the most +general and powerful locking function. + +Locking Between Two Hard IRQ Handlers +------------------------------------- + +It is rare to have to share data between two IRQ handlers, but if you +do, :c:func:`spin_lock_irqsave()` should be used: it is +architecture-specific whether all interrupts are disabled inside irq +handlers themselves. + +Cheat Sheet For Locking +======================= + +Pete Zaitcev gives the following summary: + +- If you are in a process context (any syscall) and want to lock other + process out, use a mutex. You can take a mutex and sleep + (``copy_from_user*(`` or ``kmalloc(x,GFP_KERNEL)``). + +- Otherwise (== data can be touched in an interrupt), use + :c:func:`spin_lock_irqsave()` and + :c:func:`spin_unlock_irqrestore()`. + +- Avoid holding spinlock for more than 5 lines of code and across any + function call (except accessors like :c:func:`readb()`). + +Table of Minimum Requirements +----------------------------- + +The following table lists the **minimum** locking requirements between +various contexts. In some cases, the same context can only be running on +one CPU at a time, so no locking is required for that context (eg. a +particular thread can only run on one CPU at a time, but if it needs +shares data with another thread, locking is required). + +Remember the advice above: you can always use +:c:func:`spin_lock_irqsave()`, which is a superset of all other +spinlock primitives. + +============== ============= ============= ========= ========= ========= ========= ======= ======= ============== ============== +. IRQ Handler A IRQ Handler B Softirq A Softirq B Tasklet A Tasklet B Timer A Timer B User Context A User Context B +============== ============= ============= ========= ========= ========= ========= ======= ======= ============== ============== +IRQ Handler A None +IRQ Handler B SLIS None +Softirq A SLI SLI SL +Softirq B SLI SLI SL SL +Tasklet A SLI SLI SL SL None +Tasklet B SLI SLI SL SL SL None +Timer A SLI SLI SL SL SL SL None +Timer B SLI SLI SL SL SL SL SL None +User Context A SLI SLI SLBH SLBH SLBH SLBH SLBH SLBH None +User Context B SLI SLI SLBH SLBH SLBH SLBH SLBH SLBH MLI None +============== ============= ============= ========= ========= ========= ========= ======= ======= ============== ============== + +Table: Table of Locking Requirements + ++--------+----------------------------+ +| SLIS | spin_lock_irqsave | ++--------+----------------------------+ +| SLI | spin_lock_irq | ++--------+----------------------------+ +| SL | spin_lock | ++--------+----------------------------+ +| SLBH | spin_lock_bh | ++--------+----------------------------+ +| MLI | mutex_lock_interruptible | ++--------+----------------------------+ + +Table: Legend for Locking Requirements Table + +The trylock Functions +===================== + +There are functions that try to acquire a lock only once and immediately +return a value telling about success or failure to acquire the lock. +They can be used if you need no access to the data protected with the +lock when some other thread is holding the lock. You should acquire the +lock later if you then need access to the data protected with the lock. + +:c:func:`spin_trylock()` does not spin but returns non-zero if it +acquires the spinlock on the first try or 0 if not. This function can be +used in all contexts like :c:func:`spin_lock()`: you must have +disabled the contexts that might interrupt you and acquire the spin +lock. + +:c:func:`mutex_trylock()` does not suspend your task but returns +non-zero if it could lock the mutex on the first try or 0 if not. This +function cannot be safely used in hardware or software interrupt +contexts despite not sleeping. + +Common Examples +=============== + +Let's step through a simple example: a cache of number to name mappings. +The cache keeps a count of how often each of the objects is used, and +when it gets full, throws out the least used one. + +All In User Context +------------------- + +For our first example, we assume that all operations are in user context +(ie. from system calls), so we can sleep. This means we can use a mutex +to protect the cache and all the objects within it. Here's the code:: + + #include <linux/list.h> + #include <linux/slab.h> + #include <linux/string.h> + #include <linux/mutex.h> + #include <asm/errno.h> + + struct object + { + struct list_head list; + int id; + char name[32]; + int popularity; + }; + + /* Protects the cache, cache_num, and the objects within it */ + static DEFINE_MUTEX(cache_lock); + static LIST_HEAD(cache); + static unsigned int cache_num = 0; + #define MAX_CACHE_SIZE 10 + + /* Must be holding cache_lock */ + static struct object *__cache_find(int id) + { + struct object *i; + + list_for_each_entry(i, &cache, list) + if (i->id == id) { + i->popularity++; + return i; + } + return NULL; + } + + /* Must be holding cache_lock */ + static void __cache_delete(struct object *obj) + { + BUG_ON(!obj); + list_del(&obj->list); + kfree(obj); + cache_num--; + } + + /* Must be holding cache_lock */ + static void __cache_add(struct object *obj) + { + list_add(&obj->list, &cache); + if (++cache_num > MAX_CACHE_SIZE) { + struct object *i, *outcast = NULL; + list_for_each_entry(i, &cache, list) { + if (!outcast || i->popularity < outcast->popularity) + outcast = i; + } + __cache_delete(outcast); + } + } + + int cache_add(int id, const char *name) + { + struct object *obj; + + if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL) + return -ENOMEM; + + strlcpy(obj->name, name, sizeof(obj->name)); + obj->id = id; + obj->popularity = 0; + + mutex_lock(&cache_lock); + __cache_add(obj); + mutex_unlock(&cache_lock); + return 0; + } + + void cache_delete(int id) + { + mutex_lock(&cache_lock); + __cache_delete(__cache_find(id)); + mutex_unlock(&cache_lock); + } + + int cache_find(int id, char *name) + { + struct object *obj; + int ret = -ENOENT; + + mutex_lock(&cache_lock); + obj = __cache_find(id); + if (obj) { + ret = 0; + strcpy(name, obj->name); + } + mutex_unlock(&cache_lock); + return ret; + } + +Note that we always make sure we have the cache_lock when we add, +delete, or look up the cache: both the cache infrastructure itself and +the contents of the objects are protected by the lock. In this case it's +easy, since we copy the data for the user, and never let them access the +objects directly. + +There is a slight (and common) optimization here: in +:c:func:`cache_add()` we set up the fields of the object before +grabbing the lock. This is safe, as no-one else can access it until we +put it in cache. + +Accessing From Interrupt Context +-------------------------------- + +Now consider the case where :c:func:`cache_find()` can be called +from interrupt context: either a hardware interrupt or a softirq. An +example would be a timer which deletes object from the cache. + +The change is shown below, in standard patch format: the ``-`` are lines +which are taken away, and the ``+`` are lines which are added. + +:: + + --- cache.c.usercontext 2003-12-09 13:58:54.000000000 +1100 + +++ cache.c.interrupt 2003-12-09 14:07:49.000000000 +1100 + @@ -12,7 +12,7 @@ + int popularity; + }; + + -static DEFINE_MUTEX(cache_lock); + +static DEFINE_SPINLOCK(cache_lock); + static LIST_HEAD(cache); + static unsigned int cache_num = 0; + #define MAX_CACHE_SIZE 10 + @@ -55,6 +55,7 @@ + int cache_add(int id, const char *name) + { + struct object *obj; + + unsigned long flags; + + if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL) + return -ENOMEM; + @@ -63,30 +64,33 @@ + obj->id = id; + obj->popularity = 0; + + - mutex_lock(&cache_lock); + + spin_lock_irqsave(&cache_lock, flags); + __cache_add(obj); + - mutex_unlock(&cache_lock); + + spin_unlock_irqrestore(&cache_lock, flags); + return 0; + } + + void cache_delete(int id) + { + - mutex_lock(&cache_lock); + + unsigned long flags; + + + + spin_lock_irqsave(&cache_lock, flags); + __cache_delete(__cache_find(id)); + - mutex_unlock(&cache_lock); + + spin_unlock_irqrestore(&cache_lock, flags); + } + + int cache_find(int id, char *name) + { + struct object *obj; + int ret = -ENOENT; + + unsigned long flags; + + - mutex_lock(&cache_lock); + + spin_lock_irqsave(&cache_lock, flags); + obj = __cache_find(id); + if (obj) { + ret = 0; + strcpy(name, obj->name); + } + - mutex_unlock(&cache_lock); + + spin_unlock_irqrestore(&cache_lock, flags); + return ret; + } + +Note that the :c:func:`spin_lock_irqsave()` will turn off +interrupts if they are on, otherwise does nothing (if we are already in +an interrupt handler), hence these functions are safe to call from any +context. + +Unfortunately, :c:func:`cache_add()` calls :c:func:`kmalloc()` +with the ``GFP_KERNEL`` flag, which is only legal in user context. I +have assumed that :c:func:`cache_add()` is still only called in +user context, otherwise this should become a parameter to +:c:func:`cache_add()`. + +Exposing Objects Outside This File +---------------------------------- + +If our objects contained more information, it might not be sufficient to +copy the information in and out: other parts of the code might want to +keep pointers to these objects, for example, rather than looking up the +id every time. This produces two problems. + +The first problem is that we use the ``cache_lock`` to protect objects: +we'd need to make this non-static so the rest of the code can use it. +This makes locking trickier, as it is no longer all in one place. + +The second problem is the lifetime problem: if another structure keeps a +pointer to an object, it presumably expects that pointer to remain +valid. Unfortunately, this is only guaranteed while you hold the lock, +otherwise someone might call :c:func:`cache_delete()` and even +worse, add another object, re-using the same address. + +As there is only one lock, you can't hold it forever: no-one else would +get any work done. + +The solution to this problem is to use a reference count: everyone who +has a pointer to the object increases it when they first get the object, +and drops the reference count when they're finished with it. Whoever +drops it to zero knows it is unused, and can actually delete it. + +Here is the code:: + + --- cache.c.interrupt 2003-12-09 14:25:43.000000000 +1100 + +++ cache.c.refcnt 2003-12-09 14:33:05.000000000 +1100 + @@ -7,6 +7,7 @@ + struct object + { + struct list_head list; + + unsigned int refcnt; + int id; + char name[32]; + int popularity; + @@ -17,6 +18,35 @@ + static unsigned int cache_num = 0; + #define MAX_CACHE_SIZE 10 + + +static void __object_put(struct object *obj) + +{ + + if (--obj->refcnt == 0) + + kfree(obj); + +} + + + +static void __object_get(struct object *obj) + +{ + + obj->refcnt++; + +} + + + +void object_put(struct object *obj) + +{ + + unsigned long flags; + + + + spin_lock_irqsave(&cache_lock, flags); + + __object_put(obj); + + spin_unlock_irqrestore(&cache_lock, flags); + +} + + + +void object_get(struct object *obj) + +{ + + unsigned long flags; + + + + spin_lock_irqsave(&cache_lock, flags); + + __object_get(obj); + + spin_unlock_irqrestore(&cache_lock, flags); + +} + + + /* Must be holding cache_lock */ + static struct object *__cache_find(int id) + { + @@ -35,6 +65,7 @@ + { + BUG_ON(!obj); + list_del(&obj->list); + + __object_put(obj); + cache_num--; + } + + @@ -63,6 +94,7 @@ + strlcpy(obj->name, name, sizeof(obj->name)); + obj->id = id; + obj->popularity = 0; + + obj->refcnt = 1; /* The cache holds a reference */ + + spin_lock_irqsave(&cache_lock, flags); + __cache_add(obj); + @@ -79,18 +111,15 @@ + spin_unlock_irqrestore(&cache_lock, flags); + } + + -int cache_find(int id, char *name) + +struct object *cache_find(int id) + { + struct object *obj; + - int ret = -ENOENT; + unsigned long flags; + + spin_lock_irqsave(&cache_lock, flags); + obj = __cache_find(id); + - if (obj) { + - ret = 0; + - strcpy(name, obj->name); + - } + + if (obj) + + __object_get(obj); + spin_unlock_irqrestore(&cache_lock, flags); + - return ret; + + return obj; + } + +We encapsulate the reference counting in the standard 'get' and 'put' +functions. Now we can return the object itself from +:c:func:`cache_find()` which has the advantage that the user can +now sleep holding the object (eg. to :c:func:`copy_to_user()` to +name to userspace). + +The other point to note is that I said a reference should be held for +every pointer to the object: thus the reference count is 1 when first +inserted into the cache. In some versions the framework does not hold a +reference count, but they are more complicated. + +Using Atomic Operations For The Reference Count +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In practice, :c:type:`atomic_t` would usually be used for refcnt. There are a +number of atomic operations defined in ``include/asm/atomic.h``: these +are guaranteed to be seen atomically from all CPUs in the system, so no +lock is required. In this case, it is simpler than using spinlocks, +although for anything non-trivial using spinlocks is clearer. The +:c:func:`atomic_inc()` and :c:func:`atomic_dec_and_test()` +are used instead of the standard increment and decrement operators, and +the lock is no longer used to protect the reference count itself. + +:: + + --- cache.c.refcnt 2003-12-09 15:00:35.000000000 +1100 + +++ cache.c.refcnt-atomic 2003-12-11 15:49:42.000000000 +1100 + @@ -7,7 +7,7 @@ + struct object + { + struct list_head list; + - unsigned int refcnt; + + atomic_t refcnt; + int id; + char name[32]; + int popularity; + @@ -18,33 +18,15 @@ + static unsigned int cache_num = 0; + #define MAX_CACHE_SIZE 10 + + -static void __object_put(struct object *obj) + -{ + - if (--obj->refcnt == 0) + - kfree(obj); + -} + - + -static void __object_get(struct object *obj) + -{ + - obj->refcnt++; + -} + - + void object_put(struct object *obj) + { + - unsigned long flags; + - + - spin_lock_irqsave(&cache_lock, flags); + - __object_put(obj); + - spin_unlock_irqrestore(&cache_lock, flags); + + if (atomic_dec_and_test(&obj->refcnt)) + + kfree(obj); + } + + void object_get(struct object *obj) + { + - unsigned long flags; + - + - spin_lock_irqsave(&cache_lock, flags); + - __object_get(obj); + - spin_unlock_irqrestore(&cache_lock, flags); + + atomic_inc(&obj->refcnt); + } + + /* Must be holding cache_lock */ + @@ -65,7 +47,7 @@ + { + BUG_ON(!obj); + list_del(&obj->list); + - __object_put(obj); + + object_put(obj); + cache_num--; + } + + @@ -94,7 +76,7 @@ + strlcpy(obj->name, name, sizeof(obj->name)); + obj->id = id; + obj->popularity = 0; + - obj->refcnt = 1; /* The cache holds a reference */ + + atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ + + spin_lock_irqsave(&cache_lock, flags); + __cache_add(obj); + @@ -119,7 +101,7 @@ + spin_lock_irqsave(&cache_lock, flags); + obj = __cache_find(id); + if (obj) + - __object_get(obj); + + object_get(obj); + spin_unlock_irqrestore(&cache_lock, flags); + return obj; + } + +Protecting The Objects Themselves +--------------------------------- + +In these examples, we assumed that the objects (except the reference +counts) never changed once they are created. If we wanted to allow the +name to change, there are three possibilities: + +- You can make ``cache_lock`` non-static, and tell people to grab that + lock before changing the name in any object. + +- You can provide a :c:func:`cache_obj_rename()` which grabs this + lock and changes the name for the caller, and tell everyone to use + that function. + +- You can make the ``cache_lock`` protect only the cache itself, and + use another lock to protect the name. + +Theoretically, you can make the locks as fine-grained as one lock for +every field, for every object. In practice, the most common variants +are: + +- One lock which protects the infrastructure (the ``cache`` list in + this example) and all the objects. This is what we have done so far. + +- One lock which protects the infrastructure (including the list + pointers inside the objects), and one lock inside the object which + protects the rest of that object. + +- Multiple locks to protect the infrastructure (eg. one lock per hash + chain), possibly with a separate per-object lock. + +Here is the "lock-per-object" implementation: + +:: + + --- cache.c.refcnt-atomic 2003-12-11 15:50:54.000000000 +1100 + +++ cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100 + @@ -6,11 +6,17 @@ + + struct object + { + + /* These two protected by cache_lock. */ + struct list_head list; + + int popularity; + + + atomic_t refcnt; + + + + /* Doesn't change once created. */ + int id; + + + + spinlock_t lock; /* Protects the name */ + char name[32]; + - int popularity; + }; + + static DEFINE_SPINLOCK(cache_lock); + @@ -77,6 +84,7 @@ + obj->id = id; + obj->popularity = 0; + atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ + + spin_lock_init(&obj->lock); + + spin_lock_irqsave(&cache_lock, flags); + __cache_add(obj); + +Note that I decide that the popularity count should be protected by the +``cache_lock`` rather than the per-object lock: this is because it (like +the :c:type:`struct list_head <list_head>` inside the object) +is logically part of the infrastructure. This way, I don't need to grab +the lock of every object in :c:func:`__cache_add()` when seeking +the least popular. + +I also decided that the id member is unchangeable, so I don't need to +grab each object lock in :c:func:`__cache_find()` to examine the +id: the object lock is only used by a caller who wants to read or write +the name field. + +Note also that I added a comment describing what data was protected by +which locks. This is extremely important, as it describes the runtime +behavior of the code, and can be hard to gain from just reading. And as +Alan Cox says, “Lock data, not code”. + +Common Problems +=============== + +Deadlock: Simple and Advanced +----------------------------- + +There is a coding bug where a piece of code tries to grab a spinlock +twice: it will spin forever, waiting for the lock to be released +(spinlocks, rwlocks and mutexes are not recursive in Linux). This is +trivial to diagnose: not a +stay-up-five-nights-talk-to-fluffy-code-bunnies kind of problem. + +For a slightly more complex case, imagine you have a region shared by a +softirq and user context. If you use a :c:func:`spin_lock()` call +to protect it, it is possible that the user context will be interrupted +by the softirq while it holds the lock, and the softirq will then spin +forever trying to get the same lock. + +Both of these are called deadlock, and as shown above, it can occur even +with a single CPU (although not on UP compiles, since spinlocks vanish +on kernel compiles with ``CONFIG_SMP``\ =n. You'll still get data +corruption in the second example). + +This complete lockup is easy to diagnose: on SMP boxes the watchdog +timer or compiling with ``DEBUG_SPINLOCK`` set +(``include/linux/spinlock.h``) will show this up immediately when it +happens. + +A more complex problem is the so-called 'deadly embrace', involving two +or more locks. Say you have a hash table: each entry in the table is a +spinlock, and a chain of hashed objects. Inside a softirq handler, you +sometimes want to alter an object from one place in the hash to another: +you grab the spinlock of the old hash chain and the spinlock of the new +hash chain, and delete the object from the old one, and insert it in the +new one. + +There are two problems here. First, if your code ever tries to move the +object to the same chain, it will deadlock with itself as it tries to +lock it twice. Secondly, if the same softirq on another CPU is trying to +move another object in the reverse direction, the following could +happen: + ++-----------------------+-----------------------+ +| CPU 1 | CPU 2 | ++=======================+=======================+ +| Grab lock A -> OK | Grab lock B -> OK | ++-----------------------+-----------------------+ +| Grab lock B -> spin | Grab lock A -> spin | ++-----------------------+-----------------------+ + +Table: Consequences + +The two CPUs will spin forever, waiting for the other to give up their +lock. It will look, smell, and feel like a crash. + +Preventing Deadlock +------------------- + +Textbooks will tell you that if you always lock in the same order, you +will never get this kind of deadlock. Practice will tell you that this +approach doesn't scale: when I create a new lock, I don't understand +enough of the kernel to figure out where in the 5000 lock hierarchy it +will fit. + +The best locks are encapsulated: they never get exposed in headers, and +are never held around calls to non-trivial functions outside the same +file. You can read through this code and see that it will never +deadlock, because it never tries to grab another lock while it has that +one. People using your code don't even need to know you are using a +lock. + +A classic problem here is when you provide callbacks or hooks: if you +call these with the lock held, you risk simple deadlock, or a deadly +embrace (who knows what the callback will do?). Remember, the other +programmers are out to get you, so don't do this. + +Overzealous Prevention Of Deadlocks +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Deadlocks are problematic, but not as bad as data corruption. Code which +grabs a read lock, searches a list, fails to find what it wants, drops +the read lock, grabs a write lock and inserts the object has a race +condition. + +If you don't see why, please stay the fuck away from my code. + +Racing Timers: A Kernel Pastime +------------------------------- + +Timers can produce their own special problems with races. Consider a +collection of objects (list, hash, etc) where each object has a timer +which is due to destroy it. + +If you want to destroy the entire collection (say on module removal), +you might do the following:: + + /* THIS CODE BAD BAD BAD BAD: IF IT WAS ANY WORSE IT WOULD USE + HUNGARIAN NOTATION */ + spin_lock_bh(&list_lock); + + while (list) { + struct foo *next = list->next; + del_timer(&list->timer); + kfree(list); + list = next; + } + + spin_unlock_bh(&list_lock); + + +Sooner or later, this will crash on SMP, because a timer can have just +gone off before the :c:func:`spin_lock_bh()`, and it will only get +the lock after we :c:func:`spin_unlock_bh()`, and then try to free +the element (which has already been freed!). + +This can be avoided by checking the result of +:c:func:`del_timer()`: if it returns 1, the timer has been deleted. +If 0, it means (in this case) that it is currently running, so we can +do:: + + retry: + spin_lock_bh(&list_lock); + + while (list) { + struct foo *next = list->next; + if (!del_timer(&list->timer)) { + /* Give timer a chance to delete this */ + spin_unlock_bh(&list_lock); + goto retry; + } + kfree(list); + list = next; + } + + spin_unlock_bh(&list_lock); + + +Another common problem is deleting timers which restart themselves (by +calling :c:func:`add_timer()` at the end of their timer function). +Because this is a fairly common case which is prone to races, you should +use :c:func:`del_timer_sync()` (``include/linux/timer.h``) to +handle this case. It returns the number of times the timer had to be +deleted before we finally stopped it from adding itself back in. + +Locking Speed +============= + +There are three main things to worry about when considering speed of +some code which does locking. First is concurrency: how many things are +going to be waiting while someone else is holding a lock. Second is the +time taken to actually acquire and release an uncontended lock. Third is +using fewer, or smarter locks. I'm assuming that the lock is used fairly +often: otherwise, you wouldn't be concerned about efficiency. + +Concurrency depends on how long the lock is usually held: you should +hold the lock for as long as needed, but no longer. In the cache +example, we always create the object without the lock held, and then +grab the lock only when we are ready to insert it in the list. + +Acquisition times depend on how much damage the lock operations do to +the pipeline (pipeline stalls) and how likely it is that this CPU was +the last one to grab the lock (ie. is the lock cache-hot for this CPU): +on a machine with more CPUs, this likelihood drops fast. Consider a +700MHz Intel Pentium III: an instruction takes about 0.7ns, an atomic +increment takes about 58ns, a lock which is cache-hot on this CPU takes +160ns, and a cacheline transfer from another CPU takes an additional 170 +to 360ns. (These figures from Paul McKenney's `Linux Journal RCU +article <http://www.linuxjournal.com/article.php?sid=6993>`__). + +These two aims conflict: holding a lock for a short time might be done +by splitting locks into parts (such as in our final per-object-lock +example), but this increases the number of lock acquisitions, and the +results are often slower than having a single lock. This is another +reason to advocate locking simplicity. + +The third concern is addressed below: there are some methods to reduce +the amount of locking which needs to be done. + +Read/Write Lock Variants +------------------------ + +Both spinlocks and mutexes have read/write variants: ``rwlock_t`` and +:c:type:`struct rw_semaphore <rw_semaphore>`. These divide +users into two classes: the readers and the writers. If you are only +reading the data, you can get a read lock, but to write to the data you +need the write lock. Many people can hold a read lock, but a writer must +be sole holder. + +If your code divides neatly along reader/writer lines (as our cache code +does), and the lock is held by readers for significant lengths of time, +using these locks can help. They are slightly slower than the normal +locks though, so in practice ``rwlock_t`` is not usually worthwhile. + +Avoiding Locks: Read Copy Update +-------------------------------- + +There is a special method of read/write locking called Read Copy Update. +Using RCU, the readers can avoid taking a lock altogether: as we expect +our cache to be read more often than updated (otherwise the cache is a +waste of time), it is a candidate for this optimization. + +How do we get rid of read locks? Getting rid of read locks means that +writers may be changing the list underneath the readers. That is +actually quite simple: we can read a linked list while an element is +being added if the writer adds the element very carefully. For example, +adding ``new`` to a single linked list called ``list``:: + + new->next = list->next; + wmb(); + list->next = new; + + +The :c:func:`wmb()` is a write memory barrier. It ensures that the +first operation (setting the new element's ``next`` pointer) is complete +and will be seen by all CPUs, before the second operation is (putting +the new element into the list). This is important, since modern +compilers and modern CPUs can both reorder instructions unless told +otherwise: we want a reader to either not see the new element at all, or +see the new element with the ``next`` pointer correctly pointing at the +rest of the list. + +Fortunately, there is a function to do this for standard +:c:type:`struct list_head <list_head>` lists: +:c:func:`list_add_rcu()` (``include/linux/list.h``). + +Removing an element from the list is even simpler: we replace the +pointer to the old element with a pointer to its successor, and readers +will either see it, or skip over it. + +:: + + list->next = old->next; + + +There is :c:func:`list_del_rcu()` (``include/linux/list.h``) which +does this (the normal version poisons the old object, which we don't +want). + +The reader must also be careful: some CPUs can look through the ``next`` +pointer to start reading the contents of the next element early, but +don't realize that the pre-fetched contents is wrong when the ``next`` +pointer changes underneath them. Once again, there is a +:c:func:`list_for_each_entry_rcu()` (``include/linux/list.h``) +to help you. Of course, writers can just use +:c:func:`list_for_each_entry()`, since there cannot be two +simultaneous writers. + +Our final dilemma is this: when can we actually destroy the removed +element? Remember, a reader might be stepping through this element in +the list right now: if we free this element and the ``next`` pointer +changes, the reader will jump off into garbage and crash. We need to +wait until we know that all the readers who were traversing the list +when we deleted the element are finished. We use +:c:func:`call_rcu()` to register a callback which will actually +destroy the object once all pre-existing readers are finished. +Alternatively, :c:func:`synchronize_rcu()` may be used to block +until all pre-existing are finished. + +But how does Read Copy Update know when the readers are finished? The +method is this: firstly, the readers always traverse the list inside +:c:func:`rcu_read_lock()`/:c:func:`rcu_read_unlock()` pairs: +these simply disable preemption so the reader won't go to sleep while +reading the list. + +RCU then waits until every other CPU has slept at least once: since +readers cannot sleep, we know that any readers which were traversing the +list during the deletion are finished, and the callback is triggered. +The real Read Copy Update code is a little more optimized than this, but +this is the fundamental idea. + +:: + + --- cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100 + +++ cache.c.rcupdate 2003-12-11 17:55:14.000000000 +1100 + @@ -1,15 +1,18 @@ + #include <linux/list.h> + #include <linux/slab.h> + #include <linux/string.h> + +#include <linux/rcupdate.h> + #include <linux/mutex.h> + #include <asm/errno.h> + + struct object + { + - /* These two protected by cache_lock. */ + + /* This is protected by RCU */ + struct list_head list; + int popularity; + + + struct rcu_head rcu; + + + atomic_t refcnt; + + /* Doesn't change once created. */ + @@ -40,7 +43,7 @@ + { + struct object *i; + + - list_for_each_entry(i, &cache, list) { + + list_for_each_entry_rcu(i, &cache, list) { + if (i->id == id) { + i->popularity++; + return i; + @@ -49,19 +52,25 @@ + return NULL; + } + + +/* Final discard done once we know no readers are looking. */ + +static void cache_delete_rcu(void *arg) + +{ + + object_put(arg); + +} + + + /* Must be holding cache_lock */ + static void __cache_delete(struct object *obj) + { + BUG_ON(!obj); + - list_del(&obj->list); + - object_put(obj); + + list_del_rcu(&obj->list); + cache_num--; + + call_rcu(&obj->rcu, cache_delete_rcu); + } + + /* Must be holding cache_lock */ + static void __cache_add(struct object *obj) + { + - list_add(&obj->list, &cache); + + list_add_rcu(&obj->list, &cache); + if (++cache_num > MAX_CACHE_SIZE) { + struct object *i, *outcast = NULL; + list_for_each_entry(i, &cache, list) { + @@ -104,12 +114,11 @@ + struct object *cache_find(int id) + { + struct object *obj; + - unsigned long flags; + + - spin_lock_irqsave(&cache_lock, flags); + + rcu_read_lock(); + obj = __cache_find(id); + if (obj) + object_get(obj); + - spin_unlock_irqrestore(&cache_lock, flags); + + rcu_read_unlock(); + return obj; + } + +Note that the reader will alter the popularity member in +:c:func:`__cache_find()`, and now it doesn't hold a lock. One +solution would be to make it an ``atomic_t``, but for this usage, we +don't really care about races: an approximate result is good enough, so +I didn't change it. + +The result is that :c:func:`cache_find()` requires no +synchronization with any other functions, so is almost as fast on SMP as +it would be on UP. + +There is a further optimization possible here: remember our original +cache code, where there were no reference counts and the caller simply +held the lock whenever using the object? This is still possible: if you +hold the lock, no one can delete the object, so you don't need to get +and put the reference count. + +Now, because the 'read lock' in RCU is simply disabling preemption, a +caller which always has preemption disabled between calling +:c:func:`cache_find()` and :c:func:`object_put()` does not +need to actually get and put the reference count: we could expose +:c:func:`__cache_find()` by making it non-static, and such +callers could simply call that. + +The benefit here is that the reference count is not written to: the +object is not altered in any way, which is much faster on SMP machines +due to caching. + +Per-CPU Data +------------ + +Another technique for avoiding locking which is used fairly widely is to +duplicate information for each CPU. For example, if you wanted to keep a +count of a common condition, you could use a spin lock and a single +counter. Nice and simple. + +If that was too slow (it's usually not, but if you've got a really big +machine to test on and can show that it is), you could instead use a +counter for each CPU, then none of them need an exclusive lock. See +:c:func:`DEFINE_PER_CPU()`, :c:func:`get_cpu_var()` and +:c:func:`put_cpu_var()` (``include/linux/percpu.h``). + +Of particular use for simple per-cpu counters is the ``local_t`` type, +and the :c:func:`cpu_local_inc()` and related functions, which are +more efficient than simple code on some architectures +(``include/asm/local.h``). + +Note that there is no simple, reliable way of getting an exact value of +such a counter, without introducing more locks. This is not a problem +for some uses. + +Data Which Mostly Used By An IRQ Handler +---------------------------------------- + +If data is always accessed from within the same IRQ handler, you don't +need a lock at all: the kernel already guarantees that the irq handler +will not run simultaneously on multiple CPUs. + +Manfred Spraul points out that you can still do this, even if the data +is very occasionally accessed in user context or softirqs/tasklets. The +irq handler doesn't use a lock, and all other accesses are done as so:: + + spin_lock(&lock); + disable_irq(irq); + ... + enable_irq(irq); + spin_unlock(&lock); + +The :c:func:`disable_irq()` prevents the irq handler from running +(and waits for it to finish if it's currently running on other CPUs). +The spinlock prevents any other accesses happening at the same time. +Naturally, this is slower than just a :c:func:`spin_lock_irq()` +call, so it only makes sense if this type of access happens extremely +rarely. + +What Functions Are Safe To Call From Interrupts? +================================================ + +Many functions in the kernel sleep (ie. call schedule()) directly or +indirectly: you can never call them while holding a spinlock, or with +preemption disabled. This also means you need to be in user context: +calling them from an interrupt is illegal. + +Some Functions Which Sleep +-------------------------- + +The most common ones are listed below, but you usually have to read the +code to find out if other calls are safe. If everyone else who calls it +can sleep, you probably need to be able to sleep, too. In particular, +registration and deregistration functions usually expect to be called +from user context, and can sleep. + +- Accesses to userspace: + + - :c:func:`copy_from_user()` + + - :c:func:`copy_to_user()` + + - :c:func:`get_user()` + + - :c:func:`put_user()` + +- :c:func:`kmalloc(GFP_KERNEL) <kmalloc>` + +- :c:func:`mutex_lock_interruptible()` and + :c:func:`mutex_lock()` + + There is a :c:func:`mutex_trylock()` which does not sleep. + Still, it must not be used inside interrupt context since its + implementation is not safe for that. :c:func:`mutex_unlock()` + will also never sleep. It cannot be used in interrupt context either + since a mutex must be released by the same task that acquired it. + +Some Functions Which Don't Sleep +-------------------------------- + +Some functions are safe to call from any context, or holding almost any +lock. + +- :c:func:`printk()` + +- :c:func:`kfree()` + +- :c:func:`add_timer()` and :c:func:`del_timer()` + +Mutex API reference +=================== + +.. kernel-doc:: include/linux/mutex.h + :internal: + +.. kernel-doc:: kernel/locking/mutex.c + :export: + +Futex API reference +=================== + +.. kernel-doc:: kernel/futex.c + :internal: + +Further reading +=============== + +- ``Documentation/locking/spinlocks.txt``: Linus Torvalds' spinlocking + tutorial in the kernel sources. + +- Unix Systems for Modern Architectures: Symmetric Multiprocessing and + Caching for Kernel Programmers: + + Curt Schimmel's very good introduction to kernel level locking (not + written for Linux, but nearly everything applies). The book is + expensive, but really worth every penny to understand SMP locking. + [ISBN: 0201633388] + +Thanks +====== + +Thanks to Telsa Gwynne for DocBooking, neatening and adding style. + +Thanks to Martin Pool, Philipp Rumpf, Stephen Rothwell, Paul Mackerras, +Ruedi Aschwanden, Alan Cox, Manfred Spraul, Tim Waugh, Pete Zaitcev, +James Morris, Robert Love, Paul McKenney, John Ashby for proofreading, +correcting, flaming, commenting. + +Thanks to the cabal for having no influence on this document. + +Glossary +======== + +preemption + Prior to 2.5, or when ``CONFIG_PREEMPT`` is unset, processes in user + context inside the kernel would not preempt each other (ie. you had that + CPU until you gave it up, except for interrupts). With the addition of + ``CONFIG_PREEMPT`` in 2.5.4, this changed: when in user context, higher + priority tasks can "cut in": spinlocks were changed to disable + preemption, even on UP. + +bh + Bottom Half: for historical reasons, functions with '_bh' in them often + now refer to any software interrupt, e.g. :c:func:`spin_lock_bh()` + blocks any software interrupt on the current CPU. Bottom halves are + deprecated, and will eventually be replaced by tasklets. Only one bottom + half will be running at any time. + +Hardware Interrupt / Hardware IRQ + Hardware interrupt request. :c:func:`in_irq()` returns true in a + hardware interrupt handler. + +Interrupt Context + Not user context: processing a hardware irq or software irq. Indicated + by the :c:func:`in_interrupt()` macro returning true. + +SMP + Symmetric Multi-Processor: kernels compiled for multiple-CPU machines. + (``CONFIG_SMP=y``). + +Software Interrupt / softirq + Software interrupt handler. :c:func:`in_irq()` returns false; + :c:func:`in_softirq()` returns true. Tasklets and softirqs both + fall into the category of 'software interrupts'. + + Strictly speaking a softirq is one of up to 32 enumerated software + interrupts which can run on multiple CPUs at once. Sometimes used to + refer to tasklets as well (ie. all software interrupts). + +tasklet + A dynamically-registrable software interrupt, which is guaranteed to + only run on one CPU at a time. + +timer + A dynamically-registrable software interrupt, which is run at (or close + to) a given time. When running, it is just like a tasklet (in fact, they + are called from the ``TIMER_SOFTIRQ``). + +UP + Uni-Processor: Non-SMP. (``CONFIG_SMP=n``). + +User Context + The kernel executing on behalf of a particular process (ie. a system + call or trap) or kernel thread. You can tell which process with the + ``current`` macro.) Not to be confused with userspace. Can be + interrupted by software or hardware interrupts. + +Userspace + A process executing its own code outside the kernel. diff --git a/Documentation/lsm.txt b/Documentation/lsm.txt new file mode 100644 index 000000000000..ad4dfd020e0d --- /dev/null +++ b/Documentation/lsm.txt @@ -0,0 +1,201 @@ +======================================================== +Linux Security Modules: General Security Hooks for Linux +======================================================== + +:Author: Stephen Smalley +:Author: Timothy Fraser +:Author: Chris Vance + +.. note:: + + The APIs described in this book are outdated. + +Introduction +============ + +In March 2001, the National Security Agency (NSA) gave a presentation +about Security-Enhanced Linux (SELinux) at the 2.5 Linux Kernel Summit. +SELinux is an implementation of flexible and fine-grained +nondiscretionary access controls in the Linux kernel, originally +implemented as its own particular kernel patch. Several other security +projects (e.g. RSBAC, Medusa) have also developed flexible access +control architectures for the Linux kernel, and various projects have +developed particular access control models for Linux (e.g. LIDS, DTE, +SubDomain). Each project has developed and maintained its own kernel +patch to support its security needs. + +In response to the NSA presentation, Linus Torvalds made a set of +remarks that described a security framework he would be willing to +consider for inclusion in the mainstream Linux kernel. He described a +general framework that would provide a set of security hooks to control +operations on kernel objects and a set of opaque security fields in +kernel data structures for maintaining security attributes. This +framework could then be used by loadable kernel modules to implement any +desired model of security. Linus also suggested the possibility of +migrating the Linux capabilities code into such a module. + +The Linux Security Modules (LSM) project was started by WireX to develop +such a framework. LSM is a joint development effort by several security +projects, including Immunix, SELinux, SGI and Janus, and several +individuals, including Greg Kroah-Hartman and James Morris, to develop a +Linux kernel patch that implements this framework. The patch is +currently tracking the 2.4 series and is targeted for integration into +the 2.5 development series. This technical report provides an overview +of the framework and the example capabilities security module provided +by the LSM kernel patch. + +LSM Framework +============= + +The LSM kernel patch provides a general kernel framework to support +security modules. In particular, the LSM framework is primarily focused +on supporting access control modules, although future development is +likely to address other security needs such as auditing. By itself, the +framework does not provide any additional security; it merely provides +the infrastructure to support security modules. The LSM kernel patch +also moves most of the capabilities logic into an optional security +module, with the system defaulting to the traditional superuser logic. +This capabilities module is discussed further in +`LSM Capabilities Module <#cap>`__. + +The LSM kernel patch adds security fields to kernel data structures and +inserts calls to hook functions at critical points in the kernel code to +manage the security fields and to perform access control. It also adds +functions for registering and unregistering security modules, and adds a +general :c:func:`security()` system call to support new system calls +for security-aware applications. + +The LSM security fields are simply ``void*`` pointers. For process and +program execution security information, security fields were added to +:c:type:`struct task_struct <task_struct>` and +:c:type:`struct linux_binprm <linux_binprm>`. For filesystem +security information, a security field was added to :c:type:`struct +super_block <super_block>`. For pipe, file, and socket security +information, security fields were added to :c:type:`struct inode +<inode>` and :c:type:`struct file <file>`. For packet and +network device security information, security fields were added to +:c:type:`struct sk_buff <sk_buff>` and :c:type:`struct +net_device <net_device>`. For System V IPC security information, +security fields were added to :c:type:`struct kern_ipc_perm +<kern_ipc_perm>` and :c:type:`struct msg_msg +<msg_msg>`; additionally, the definitions for :c:type:`struct +msg_msg <msg_msg>`, struct msg_queue, and struct shmid_kernel +were moved to header files (``include/linux/msg.h`` and +``include/linux/shm.h`` as appropriate) to allow the security modules to +use these definitions. + +Each LSM hook is a function pointer in a global table, security_ops. +This table is a :c:type:`struct security_operations +<security_operations>` structure as defined by +``include/linux/security.h``. Detailed documentation for each hook is +included in this header file. At present, this structure consists of a +collection of substructures that group related hooks based on the kernel +object (e.g. task, inode, file, sk_buff, etc) as well as some top-level +hook function pointers for system operations. This structure is likely +to be flattened in the future for performance. The placement of the hook +calls in the kernel code is described by the "called:" lines in the +per-hook documentation in the header file. The hook calls can also be +easily found in the kernel code by looking for the string +"security_ops->". + +Linus mentioned per-process security hooks in his original remarks as a +possible alternative to global security hooks. However, if LSM were to +start from the perspective of per-process hooks, then the base framework +would have to deal with how to handle operations that involve multiple +processes (e.g. kill), since each process might have its own hook for +controlling the operation. This would require a general mechanism for +composing hooks in the base framework. Additionally, LSM would still +need global hooks for operations that have no process context (e.g. +network input operations). Consequently, LSM provides global security +hooks, but a security module is free to implement per-process hooks +(where that makes sense) by storing a security_ops table in each +process' security field and then invoking these per-process hooks from +the global hooks. The problem of composition is thus deferred to the +module. + +The global security_ops table is initialized to a set of hook functions +provided by a dummy security module that provides traditional superuser +logic. A :c:func:`register_security()` function (in +``security/security.c``) is provided to allow a security module to set +security_ops to refer to its own hook functions, and an +:c:func:`unregister_security()` function is provided to revert +security_ops to the dummy module hooks. This mechanism is used to set +the primary security module, which is responsible for making the final +decision for each hook. + +LSM also provides a simple mechanism for stacking additional security +modules with the primary security module. It defines +:c:func:`register_security()` and +:c:func:`unregister_security()` hooks in the :c:type:`struct +security_operations <security_operations>` structure and +provides :c:func:`mod_reg_security()` and +:c:func:`mod_unreg_security()` functions that invoke these hooks +after performing some sanity checking. A security module can call these +functions in order to stack with other modules. However, the actual +details of how this stacking is handled are deferred to the module, +which can implement these hooks in any way it wishes (including always +returning an error if it does not wish to support stacking). In this +manner, LSM again defers the problem of composition to the module. + +Although the LSM hooks are organized into substructures based on kernel +object, all of the hooks can be viewed as falling into two major +categories: hooks that are used to manage the security fields and hooks +that are used to perform access control. Examples of the first category +of hooks include the :c:func:`alloc_security()` and +:c:func:`free_security()` hooks defined for each kernel data +structure that has a security field. These hooks are used to allocate +and free security structures for kernel objects. The first category of +hooks also includes hooks that set information in the security field +after allocation, such as the :c:func:`post_lookup()` hook in +:c:type:`struct inode_security_ops <inode_security_ops>`. +This hook is used to set security information for inodes after +successful lookup operations. An example of the second category of hooks +is the :c:func:`permission()` hook in :c:type:`struct +inode_security_ops <inode_security_ops>`. This hook checks +permission when accessing an inode. + +LSM Capabilities Module +======================= + +The LSM kernel patch moves most of the existing POSIX.1e capabilities +logic into an optional security module stored in the file +``security/capability.c``. This change allows users who do not want to +use capabilities to omit this code entirely from their kernel, instead +using the dummy module for traditional superuser logic or any other +module that they desire. This change also allows the developers of the +capabilities logic to maintain and enhance their code more freely, +without needing to integrate patches back into the base kernel. + +In addition to moving the capabilities logic, the LSM kernel patch could +move the capability-related fields from the kernel data structures into +the new security fields managed by the security modules. However, at +present, the LSM kernel patch leaves the capability fields in the kernel +data structures. In his original remarks, Linus suggested that this +might be preferable so that other security modules can be easily stacked +with the capabilities module without needing to chain multiple security +structures on the security field. It also avoids imposing extra overhead +on the capabilities module to manage the security fields. However, the +LSM framework could certainly support such a move if it is determined to +be desirable, with only a few additional changes described below. + +At present, the capabilities logic for computing process capabilities on +:c:func:`execve()` and :c:func:`set\*uid()`, checking +capabilities for a particular process, saving and checking capabilities +for netlink messages, and handling the :c:func:`capget()` and +:c:func:`capset()` system calls have been moved into the +capabilities module. There are still a few locations in the base kernel +where capability-related fields are directly examined or modified, but +the current version of the LSM patch does allow a security module to +completely replace the assignment and testing of capabilities. These few +locations would need to be changed if the capability-related fields were +moved into the security field. The following is a list of known +locations that still perform such direct examination or modification of +capability-related fields: + +- ``fs/open.c``::c:func:`sys_access()` + +- ``fs/lockd/host.c``::c:func:`nlm_bind_host()` + +- ``fs/nfsd/auth.c``::c:func:`nfsd_setuser()` + +- ``fs/proc/array.c``::c:func:`task_cap()` diff --git a/Documentation/networking/conf.py b/Documentation/networking/conf.py new file mode 100644 index 000000000000..40f69e67a883 --- /dev/null +++ b/Documentation/networking/conf.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8; mode: python -*- + +project = "Linux Networking Documentation" + +tags.add("subproject") + +latex_documents = [ + ('index', 'networking.tex', project, + 'The kernel development community', 'manual'), +] diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst new file mode 100644 index 000000000000..b5bd87e01f52 --- /dev/null +++ b/Documentation/networking/index.rst @@ -0,0 +1,18 @@ +Linux Networking Documentation +============================== + +Contents: + +.. toctree:: + :maxdepth: 2 + + kapi + z8530book + +.. only:: subproject + + Indices + ======= + + * :ref:`genindex` + diff --git a/Documentation/networking/kapi.rst b/Documentation/networking/kapi.rst new file mode 100644 index 000000000000..580289f345da --- /dev/null +++ b/Documentation/networking/kapi.rst @@ -0,0 +1,147 @@ +========================================= +Linux Networking and Network Devices APIs +========================================= + +Linux Networking +================ + +Networking Base Types +--------------------- + +.. kernel-doc:: include/linux/net.h + :internal: + +Socket Buffer Functions +----------------------- + +.. kernel-doc:: include/linux/skbuff.h + :internal: + +.. kernel-doc:: include/net/sock.h + :internal: + +.. kernel-doc:: net/socket.c + :export: + +.. kernel-doc:: net/core/skbuff.c + :export: + +.. kernel-doc:: net/core/sock.c + :export: + +.. kernel-doc:: net/core/datagram.c + :export: + +.. kernel-doc:: net/core/stream.c + :export: + +Socket Filter +------------- + +.. kernel-doc:: net/core/filter.c + :export: + +Generic Network Statistics +-------------------------- + +.. kernel-doc:: include/uapi/linux/gen_stats.h + :internal: + +.. kernel-doc:: net/core/gen_stats.c + :export: + +.. kernel-doc:: net/core/gen_estimator.c + :export: + +SUN RPC subsystem +----------------- + +.. kernel-doc:: net/sunrpc/xdr.c + :export: + +.. kernel-doc:: net/sunrpc/svc_xprt.c + :export: + +.. kernel-doc:: net/sunrpc/xprt.c + :export: + +.. kernel-doc:: net/sunrpc/sched.c + :export: + +.. kernel-doc:: net/sunrpc/socklib.c + :export: + +.. kernel-doc:: net/sunrpc/stats.c + :export: + +.. kernel-doc:: net/sunrpc/rpc_pipe.c + :export: + +.. kernel-doc:: net/sunrpc/rpcb_clnt.c + :export: + +.. kernel-doc:: net/sunrpc/clnt.c + :export: + +WiMAX +----- + +.. kernel-doc:: net/wimax/op-msg.c + :export: + +.. kernel-doc:: net/wimax/op-reset.c + :export: + +.. kernel-doc:: net/wimax/op-rfkill.c + :export: + +.. kernel-doc:: net/wimax/stack.c + :export: + +.. kernel-doc:: include/net/wimax.h + :internal: + +.. kernel-doc:: include/uapi/linux/wimax.h + :internal: + +Network device support +====================== + +Driver Support +-------------- + +.. kernel-doc:: net/core/dev.c + :export: + +.. kernel-doc:: net/ethernet/eth.c + :export: + +.. kernel-doc:: net/sched/sch_generic.c + :export: + +.. kernel-doc:: include/linux/etherdevice.h + :internal: + +.. kernel-doc:: include/linux/netdevice.h + :internal: + +PHY Support +----------- + +.. kernel-doc:: drivers/net/phy/phy.c + :export: + +.. kernel-doc:: drivers/net/phy/phy.c + :internal: + +.. kernel-doc:: drivers/net/phy/phy_device.c + :export: + +.. kernel-doc:: drivers/net/phy/phy_device.c + :internal: + +.. kernel-doc:: drivers/net/phy/mdio_bus.c + :export: + +.. kernel-doc:: drivers/net/phy/mdio_bus.c + :internal: diff --git a/Documentation/networking/z8530book.rst b/Documentation/networking/z8530book.rst new file mode 100644 index 000000000000..fea2c40e7973 --- /dev/null +++ b/Documentation/networking/z8530book.rst @@ -0,0 +1,256 @@ +======================= +Z8530 Programming Guide +======================= + +:Author: Alan Cox + +Introduction +============ + +The Z85x30 family synchronous/asynchronous controller chips are used on +a large number of cheap network interface cards. The kernel provides a +core interface layer that is designed to make it easy to provide WAN +services using this chip. + +The current driver only support synchronous operation. Merging the +asynchronous driver support into this code to allow any Z85x30 device to +be used as both a tty interface and as a synchronous controller is a +project for Linux post the 2.4 release + +Driver Modes +============ + +The Z85230 driver layer can drive Z8530, Z85C30 and Z85230 devices in +three different modes. Each mode can be applied to an individual channel +on the chip (each chip has two channels). + +The PIO synchronous mode supports the most common Z8530 wiring. Here the +chip is interface to the I/O and interrupt facilities of the host +machine but not to the DMA subsystem. When running PIO the Z8530 has +extremely tight timing requirements. Doing high speeds, even with a +Z85230 will be tricky. Typically you should expect to achieve at best +9600 baud with a Z8C530 and 64Kbits with a Z85230. + +The DMA mode supports the chip when it is configured to use dual DMA +channels on an ISA bus. The better cards tend to support this mode of +operation for a single channel. With DMA running the Z85230 tops out +when it starts to hit ISA DMA constraints at about 512Kbits. It is worth +noting here that many PC machines hang or crash when the chip is driven +fast enough to hold the ISA bus solid. + +Transmit DMA mode uses a single DMA channel. The DMA channel is used for +transmission as the transmit FIFO is smaller than the receive FIFO. it +gives better performance than pure PIO mode but is nowhere near as ideal +as pure DMA mode. + +Using the Z85230 driver +======================= + +The Z85230 driver provides the back end interface to your board. To +configure a Z8530 interface you need to detect the board and to identify +its ports and interrupt resources. It is also your problem to verify the +resources are available. + +Having identified the chip you need to fill in a struct z8530_dev, +which describes each chip. This object must exist until you finally +shutdown the board. Firstly zero the active field. This ensures nothing +goes off without you intending it. The irq field should be set to the +interrupt number of the chip. (Each chip has a single interrupt source +rather than each channel). You are responsible for allocating the +interrupt line. The interrupt handler should be set to +:c:func:`z8530_interrupt()`. The device id should be set to the +z8530_dev structure pointer. Whether the interrupt can be shared or not +is board dependent, and up to you to initialise. + +The structure holds two channel structures. Initialise chanA.ctrlio and +chanA.dataio with the address of the control and data ports. You can or +this with Z8530_PORT_SLEEP to indicate your interface needs the 5uS +delay for chip settling done in software. The PORT_SLEEP option is +architecture specific. Other flags may become available on future +platforms, eg for MMIO. Initialise the chanA.irqs to &z8530_nop to +start the chip up as disabled and discarding interrupt events. This +ensures that stray interrupts will be mopped up and not hang the bus. +Set chanA.dev to point to the device structure itself. The private and +name field you may use as you wish. The private field is unused by the +Z85230 layer. The name is used for error reporting and it may thus make +sense to make it match the network name. + +Repeat the same operation with the B channel if your chip has both +channels wired to something useful. This isn't always the case. If it is +not wired then the I/O values do not matter, but you must initialise +chanB.dev. + +If your board has DMA facilities then initialise the txdma and rxdma +fields for the relevant channels. You must also allocate the ISA DMA +channels and do any necessary board level initialisation to configure +them. The low level driver will do the Z8530 and DMA controller +programming but not board specific magic. + +Having initialised the device you can then call +:c:func:`z8530_init()`. This will probe the chip and reset it into +a known state. An identification sequence is then run to identify the +chip type. If the checks fail to pass the function returns a non zero +error code. Typically this indicates that the port given is not valid. +After this call the type field of the z8530_dev structure is +initialised to either Z8530, Z85C30 or Z85230 according to the chip +found. + +Once you have called z8530_init you can also make use of the utility +function :c:func:`z8530_describe()`. This provides a consistent +reporting format for the Z8530 devices, and allows all the drivers to +provide consistent reporting. + +Attaching Network Interfaces +============================ + +If you wish to use the network interface facilities of the driver, then +you need to attach a network device to each channel that is present and +in use. In addition to use the generic HDLC you need to follow some +additional plumbing rules. They may seem complex but a look at the +example hostess_sv11 driver should reassure you. + +The network device used for each channel should be pointed to by the +netdevice field of each channel. The hdlc-> priv field of the network +device points to your private data - you will need to be able to find +your private data from this. + +The way most drivers approach this particular problem is to create a +structure holding the Z8530 device definition and put that into the +private field of the network device. The network device fields of the +channels then point back to the network devices. + +If you wish to use the generic HDLC then you need to register the HDLC +device. + +Before you register your network device you will also need to provide +suitable handlers for most of the network device callbacks. See the +network device documentation for more details on this. + +Configuring And Activating The Port +=================================== + +The Z85230 driver provides helper functions and tables to load the port +registers on the Z8530 chips. When programming the register settings for +a channel be aware that the documentation recommends initialisation +orders. Strange things happen when these are not followed. + +:c:func:`z8530_channel_load()` takes an array of pairs of +initialisation values in an array of u8 type. The first value is the +Z8530 register number. Add 16 to indicate the alternate register bank on +the later chips. The array is terminated by a 255. + +The driver provides a pair of public tables. The z8530_hdlc_kilostream +table is for the UK 'Kilostream' service and also happens to cover most +other end host configurations. The z8530_hdlc_kilostream_85230 table +is the same configuration using the enhancements of the 85230 chip. The +configuration loaded is standard NRZ encoded synchronous data with HDLC +bitstuffing. All of the timing is taken from the other end of the link. + +When writing your own tables be aware that the driver internally tracks +register values. It may need to reload values. You should therefore be +sure to set registers 1-7, 9-11, 14 and 15 in all configurations. Where +the register settings depend on DMA selection the driver will update the +bits itself when you open or close. Loading a new table with the +interface open is not recommended. + +There are three standard configurations supported by the core code. In +PIO mode the interface is programmed up to use interrupt driven PIO. +This places high demands on the host processor to avoid latency. The +driver is written to take account of latency issues but it cannot avoid +latencies caused by other drivers, notably IDE in PIO mode. Because the +drivers allocate buffers you must also prevent MTU changes while the +port is open. + +Once the port is open it will call the rx_function of each channel +whenever a completed packet arrived. This is invoked from interrupt +context and passes you the channel and a network buffer (struct +sk_buff) holding the data. The data includes the CRC bytes so most +users will want to trim the last two bytes before processing the data. +This function is very timing critical. When you wish to simply discard +data the support code provides the function +:c:func:`z8530_null_rx()` to discard the data. + +To active PIO mode sending and receiving the ``z8530_sync_open`` is called. +This expects to be passed the network device and the channel. Typically +this is called from your network device open callback. On a failure a +non zero error status is returned. +The :c:func:`z8530_sync_close()` function shuts down a PIO +channel. This must be done before the channel is opened again and before +the driver shuts down and unloads. + +The ideal mode of operation is dual channel DMA mode. Here the kernel +driver will configure the board for DMA in both directions. The driver +also handles ISA DMA issues such as controller programming and the +memory range limit for you. This mode is activated by calling the +:c:func:`z8530_sync_dma_open()` function. On failure a non zero +error value is returned. Once this mode is activated it can be shut down +by calling the :c:func:`z8530_sync_dma_close()`. You must call +the close function matching the open mode you used. + +The final supported mode uses a single DMA channel to drive the transmit +side. As the Z85C30 has a larger FIFO on the receive channel this tends +to increase the maximum speed a little. This is activated by calling the +``z8530_sync_txdma_open``. This returns a non zero error code on failure. The +:c:func:`z8530_sync_txdma_close()` function closes down the Z8530 +interface from this mode. + +Network Layer Functions +======================= + +The Z8530 layer provides functions to queue packets for transmission. +The driver internally buffers the frame currently being transmitted and +one further frame (in order to keep back to back transmission running). +Any further buffering is up to the caller. + +The function :c:func:`z8530_queue_xmit()` takes a network buffer +in sk_buff format and queues it for transmission. The caller must +provide the entire packet with the exception of the bitstuffing and CRC. +This is normally done by the caller via the generic HDLC interface +layer. It returns 0 if the buffer has been queued and non zero values +for queue full. If the function accepts the buffer it becomes property +of the Z8530 layer and the caller should not free it. + +The function :c:func:`z8530_get_stats()` returns a pointer to an +internally maintained per interface statistics block. This provides most +of the interface code needed to implement the network layer get_stats +callback. + +Porting The Z8530 Driver +======================== + +The Z8530 driver is written to be portable. In DMA mode it makes +assumptions about the use of ISA DMA. These are probably warranted in +most cases as the Z85230 in particular was designed to glue to PC type +machines. The PIO mode makes no real assumptions. + +Should you need to retarget the Z8530 driver to another architecture the +only code that should need changing are the port I/O functions. At the +moment these assume PC I/O port accesses. This may not be appropriate +for all platforms. Replacing :c:func:`z8530_read_port()` and +``z8530_write_port`` is intended to be all that is required to port +this driver layer. + +Known Bugs And Assumptions +========================== + +Interrupt Locking + The locking in the driver is done via the global cli/sti lock. This + makes for relatively poor SMP performance. Switching this to use a + per device spin lock would probably materially improve performance. + +Occasional Failures + We have reports of occasional failures when run for very long + periods of time and the driver starts to receive junk frames. At the + moment the cause of this is not clear. + +Public Functions Provided +========================= + +.. kernel-doc:: drivers/net/wan/z85230.c + :export: + +Internal Functions +================== + +.. kernel-doc:: drivers/net/wan/z85230.c + :internal: diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index e25d63f8c0da..3aed751e0cb5 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -116,12 +116,11 @@ DevFS has been obsoleted in favour of udev Linux documentation for functions is transitioning to inline documentation via specially-formatted comments near their -definitions in the source. These comments can be combined with the -SGML templates in the Documentation/DocBook directory to make DocBook -files, which can then be converted by DocBook stylesheets to PostScript, -HTML, PDF files, and several other formats. In order to convert from -DocBook format to a format of your choice, you'll need to install Jade as -well as the desired DocBook stylesheets. +definitions in the source. These comments can be combined with ReST +files the Documentation/ directory to make enriched documentation, which can +then be converted to PostScript, HTML, LaTex, ePUB and PDF files. +In order to convert from ReST format to a format of your choice, you'll need +Sphinx. Util-linux ---------- @@ -323,12 +322,6 @@ PDF outputs, it is recommended to use version 1.4.6. functionalities required for ``XeLaTex`` to work. For PDF output you'll also need ``convert(1)`` from ImageMagick (https://www.imagemagick.org). -Other tools ------------ - -In order to produce documentation from DocBook, you'll also need ``xmlto``. -Please notice, however, that we're currently migrating all documents to use -``Sphinx``. Getting updated software ======================== @@ -409,15 +402,6 @@ Quota-tools - <http://sourceforge.net/projects/linuxquota/> -DocBook Stylesheets -------------------- - -- <http://sourceforge.net/projects/docbook/files/docbook-dsssl/> - -XMLTO XSLT Frontend -------------------- - -- <http://cyberelk.net/tim/xmlto/> Intel P6 microcode ------------------ diff --git a/Documentation/process/howto.rst b/Documentation/process/howto.rst index 1260f60d4cb9..c6875b1db56f 100644 --- a/Documentation/process/howto.rst +++ b/Documentation/process/howto.rst @@ -180,14 +180,6 @@ They can also be generated on LaTeX and ePub formats with:: make latexdocs make epubdocs -Currently, there are some documents written on DocBook that are in -the process of conversion to ReST. Such documents will be created in the -Documentation/DocBook/ directory and can be generated also as -Postscript or man pages by running:: - - make psdocs - make mandocs - Becoming A Kernel Developer --------------------------- diff --git a/Documentation/process/kernel-docs.rst b/Documentation/process/kernel-docs.rst index 05a7857a4a83..b8cac85a4001 100644 --- a/Documentation/process/kernel-docs.rst +++ b/Documentation/process/kernel-docs.rst @@ -40,50 +40,18 @@ Enjoy! Docs at the Linux Kernel tree ----------------------------- -The DocBook books should be built with ``make {htmldocs | psdocs | pdfdocs}``. The Sphinx books should be built with ``make {htmldocs | pdfdocs | epubdocs}``. * Name: **linux/Documentation** :Author: Many. :Location: Documentation/ - :Keywords: text files, Sphinx, DocBook. + :Keywords: text files, Sphinx. :Description: Documentation that comes with the kernel sources, inside the Documentation directory. Some pages from this document (including this document itself) have been moved there, and might be more up to date than the web version. - * Title: **The Kernel Hacking HOWTO** - - :Author: Various Talented People, and Rusty. - :Location: Documentation/DocBook/kernel-hacking.tmpl - :Keywords: HOWTO, kernel contexts, deadlock, locking, modules, - symbols, return conventions. - :Description: From the Introduction: "Please understand that I - never wanted to write this document, being grossly underqualified, - but I always wanted to read it, and this was the only way. I - simply explain some best practices, and give reading entry-points - into the kernel sources. I avoid implementation details: that's - what the code is for, and I ignore whole tracts of useful - routines. This document assumes familiarity with C, and an - understanding of what the kernel is, and how it is used. It was - originally written for the 2.3 kernels, but nearly all of it - applies to 2.2 too; 2.0 is slightly different". - - * Title: **Linux Kernel Locking HOWTO** - - :Author: Various Talented People, and Rusty. - :Location: Documentation/DocBook/kernel-locking.tmpl - :Keywords: locks, locking, spinlock, semaphore, atomic, race - condition, bottom halves, tasklets, softirqs. - :Description: The title says it all: document describing the - locking system in the Linux Kernel either in uniprocessor or SMP - systems. - :Notes: "It was originally written for the later (>2.3.47) 2.3 - kernels, but most of it applies to 2.2 too; 2.0 is slightly - different". Freely redistributable under the conditions of the GNU - General Public License. - On-line docs ------------ diff --git a/Documentation/sh/conf.py b/Documentation/sh/conf.py new file mode 100644 index 000000000000..1eb684a13ac8 --- /dev/null +++ b/Documentation/sh/conf.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8; mode: python -*- + +project = "SuperH architecture implementation manual" + +tags.add("subproject") + +latex_documents = [ + ('index', 'sh.tex', project, + 'The kernel development community', 'manual'), +] diff --git a/Documentation/sh/index.rst b/Documentation/sh/index.rst new file mode 100644 index 000000000000..bc8db7ba894a --- /dev/null +++ b/Documentation/sh/index.rst @@ -0,0 +1,59 @@ +======================= +SuperH Interfaces Guide +======================= + +:Author: Paul Mundt + +Memory Management +================= + +SH-4 +---- + +Store Queue API +~~~~~~~~~~~~~~~ + +.. kernel-doc:: arch/sh/kernel/cpu/sh4/sq.c + :export: + +SH-5 +---- + +TLB Interfaces +~~~~~~~~~~~~~~ + +.. kernel-doc:: arch/sh/mm/tlb-sh5.c + :internal: + +.. kernel-doc:: arch/sh/include/asm/tlb_64.h + :internal: + +Machine Specific Interfaces +=========================== + +mach-dreamcast +-------------- + +.. kernel-doc:: arch/sh/boards/mach-dreamcast/rtc.c + :internal: + +mach-x3proto +------------ + +.. kernel-doc:: arch/sh/boards/mach-x3proto/ilsel.c + :export: + +Busses +====== + +SuperHyway +---------- + +.. kernel-doc:: drivers/sh/superhyway/superhyway.c + :export: + +Maple +----- + +.. kernel-doc:: drivers/sh/maple/maple.c + :export: diff --git a/Documentation/sound/conf.py b/Documentation/sound/conf.py new file mode 100644 index 000000000000..3f1fc5e74e7b --- /dev/null +++ b/Documentation/sound/conf.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8; mode: python -*- + +project = "Linux Sound Subsystem Documentation" + +tags.add("subproject") + +latex_documents = [ + ('index', 'sound.tex', project, + 'The kernel development community', 'manual'), +] diff --git a/Documentation/sphinx/tmplcvt b/Documentation/sphinx/tmplcvt deleted file mode 100755 index 6848f0a26fa5..000000000000 --- a/Documentation/sphinx/tmplcvt +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# -# Convert a template file into something like RST -# -# fix <function> -# feed to pandoc -# fix \_ -# title line? -# -set -eu - -if [ "$#" != "2" ]; then - echo "$0 <docbook file> <rst file>" - exit -fi - -DIR=$(dirname $0) - -in=$1 -rst=$2 -tmp=$rst.tmp - -cp $in $tmp -sed --in-place -f $DIR/convert_template.sed $tmp -pandoc -s -S -f docbook -t rst -o $rst $tmp -sed --in-place -f $DIR/post_convert.sed $rst -rm $tmp -echo "book writen to $rst" diff --git a/Documentation/translations/ja_JP/howto.rst b/Documentation/translations/ja_JP/howto.rst index 4511eed0fabb..8d7ed0cbbf5f 100644 --- a/Documentation/translations/ja_JP/howto.rst +++ b/Documentation/translations/ja_JP/howto.rst @@ -197,13 +197,6 @@ ReSTマークアップを使ったドキュメントは Documentation/outputに make latexdocs make epubdocs -現在、幾つかの DocBook形式で書かれたドキュメントは ReST形式に転換中で -す。それらのドキュメントはDocumentation/DocBook ディレクトリに生成され、 -Postscript または man ページの形式を生成するには以下のようにします - :: - - make psdocs - make mandocs - カーネル開発者になるには ------------------------ diff --git a/Documentation/translations/ko_KR/howto.rst b/Documentation/translations/ko_KR/howto.rst index 2333697251dd..f06de9ca41a4 100644 --- a/Documentation/translations/ko_KR/howto.rst +++ b/Documentation/translations/ko_KR/howto.rst @@ -191,13 +191,6 @@ ReST 마크업을 사용하는 문서들은 Documentation/output 에 생성된 make latexdocs make epubdocs -현재, ReST 로의 변환이 진행중인, DocBook 으로 쓰인 문서들이 존재한다. 그런 -문서들은 Documentation/DocBook/ 디렉토리 안에 생성될 것이고 다음 커맨드를 통해 -Postscript 나 man page 로도 만들어질 수 있다:: - - make psdocs - make mandocs - 커널 개발자가 되는 것 --------------------- |